Update Kokkos library to v2.04.00

Stan Moore
2017-08-22 13:42:02 -06:00
parent b11fe2eddb
commit 090c792d90
112 changed files with 10885 additions and 154 deletions

View File

@@ -9,30 +9,6 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
ASSERT_DEFINED(${PROJECT_NAME}_ENABLE_CXX11)
ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUDA)
# Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA governs whether Kokkos allows
# use of lambdas at the outer level of parallel dispatch (that is, as
# the argument to an outer parallel_for, parallel_reduce, or
# parallel_scan). This works with non-CUDA execution spaces if C++11
# is enabled. It does not currently work with public releases of
# CUDA. If that changes, please change the default here to ON if CUDA
# and C++11 are ON.
IF (${PROJECT_NAME}_ENABLE_CXX11)
IF (${PACKAGE_NAME}_ENABLE_CUDA)
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT OFF)
ELSE ()
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT ON)
ENDIF ()
ELSE ()
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT OFF)
ENDIF ()
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA
KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
"Whether Kokkos allows use of lambdas at the outer level of parallel dispatch (that is, as the argument to an outer parallel_for, parallel_reduce, or parallel_scan). This requires C++11. It also does not currently work with public releases of CUDA. As a result, even if C++11 is enabled, this will be OFF by default if CUDA is enabled. If this option is ON, the macro KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA will be defined. For compatibility with Kokkos' Makefile build system, it is also possible to define that macro on the command line."
${Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT}
)
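For orientation, a minimal sketch of what this option gated, assuming a view-based saxpy body whose names (x, y, a, n) are illustrative only:
#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
// A lambda at the outer level of parallel dispatch, legal only when the macro is defined.
Kokkos::parallel_for( n , KOKKOS_LAMBDA ( const int i ) { y(i) += a * x(i); } );
#endif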
TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})

View File

@@ -152,10 +152,10 @@ public:
KOKKOS_INLINE_FUNCTION pointer data() { return pointer(0) ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return const_pointer(0); }
~Array() = default ;
Array() = default ;
Array( const Array & ) = default ;
Array & operator = ( const Array & ) = default ;
KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
KOKKOS_FUNCTION_DEFAULTED Array() = default ;
KOKKOS_FUNCTION_DEFAULTED Array( const Array & ) = default ;
KOKKOS_FUNCTION_DEFAULTED Array & operator = ( const Array & ) = default ;
// Some supported compilers are not sufficiently C++11 compliant
// for default move constructor and move assignment operator.
@@ -209,7 +209,7 @@ public:
KOKKOS_INLINE_FUNCTION pointer data() { return m_elem ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem ; }
~Array() = default ;
KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
Array() = delete ;
Array( const Array & rhs ) = delete ;
@@ -278,7 +278,7 @@ public:
KOKKOS_INLINE_FUNCTION pointer data() { return m_elem ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem ; }
~Array() = default ;
KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
Array() = delete ;
Array( const Array & ) = delete ;

View File

@@ -80,6 +80,11 @@
// Compiling NVIDIA device code, must use Cuda atomics:
#define KOKKOS_ENABLE_CUDA_ATOMICS
#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU)
#define KOKKOS_ENABLE_ROCM_ATOMICS
#endif
#if ! defined( KOKKOS_ENABLE_GNU_ATOMICS ) && \
@@ -154,6 +159,19 @@ const char * atomic_query_version()
} // namespace Kokkos
#if defined( KOKKOS_ENABLE_ROCM )
#include <ROCm/Kokkos_ROCm_Atomic.hpp>
namespace Kokkos {
namespace Impl {
extern KOKKOS_INLINE_FUNCTION
bool lock_address_rocm_space(void* ptr);
extern KOKKOS_INLINE_FUNCTION
void unlock_address_rocm_space(void* ptr);
}
}
#endif
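A hedged sketch of how these lock hooks are typically consumed by arbitrary-size atomics; the pointer 'ptr', type T, and update 'op' are placeholders, not declarations from this header:
// Lock-based fallback: spin until the address lock is held, update, release.
while ( ! Kokkos::Impl::lock_address_rocm_space( (void*) ptr ) ) ; // spin
*ptr = op( *ptr ); // protected read-modify-write of a T with no native atomic
Kokkos::Impl::unlock_address_rocm_space( (void*) ptr );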
#ifdef _WIN32
#include "impl/Kokkos_Atomic_Windows.hpp"
#else

View File

@@ -107,6 +107,11 @@ public:
re_ (val), im_ (0.0)
{}
// BUG HCC WORKAROUND
KOKKOS_INLINE_FUNCTION complex( const RealType& re, const RealType& im):
re_ (re), im_ (im)
{}
//! Constructor that takes the real and imaginary parts.
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION complex (const RealType1& re, const RealType2& im) :
@@ -227,6 +232,16 @@ public:
return re_;
}
//! Set the imaginary part of this complex number.
KOKKOS_INLINE_FUNCTION void imag (RealType v) {
im_ = v;
}
//! Set the real part of this complex number.
KOKKOS_INLINE_FUNCTION void real (RealType v) {
re_ = v;
}
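Usage sketch for the new setters; the variable z and its values are illustrative:
Kokkos::complex<double> z;
z.real( 3.0 );  // set the real part
z.imag( -4.0 ); // set the imaginary part
// The existing zero-argument overloads remain the getters: z.real() == 3.0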
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator += (const complex<RealType>& src) {
re_ += src.re_;
@@ -299,7 +314,7 @@ public:
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
const RealType s = ::fabs (y.real ()) + ::fabs (y.imag ());
const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
@@ -537,7 +552,7 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
const RealType s = ::fabs (real (y)) + ::fabs (imag (y));
const RealType s = std::fabs (real (y)) + std::fabs (imag (y));
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
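A one-step illustration of the scaling trick, a sketch consistent with the comments above rather than the full division routine: with s = |re(y)| + |im(y)|, y/s has 1-norm one, so evaluating (x/s) / (y/s) avoids the intermediate overflow that forming |y|^2 directly could produce.
const RealType s = std::fabs( real(y) ) + std::fabs( imag(y) );
const complex<RealType> x_s( real(x) / s , imag(x) / s ); // x/s
const complex<RealType> y_s( real(y) / s , imag(y) / s ); // y/s, 1-norm == 1
// x / y == x_s / y_s, now safe to evaluate with the usual conjugate formula.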

View File

@@ -74,6 +74,10 @@
#include <Kokkos_Cuda.hpp>
#endif
#if defined( KOKKOS_ENABLE_ROCM )
#include <Kokkos_ROCm.hpp>
#endif
#include <Kokkos_Pair.hpp>
#include <Kokkos_MemoryPool.hpp>
#include <Kokkos_Array.hpp>

View File

@@ -122,6 +122,13 @@ class CudaHostPinnedSpace; ///< Memory space on Host accessible to Cuda GPU
class Cuda; ///< Execution space for Cuda GPU
#endif
#if defined( KOKKOS_ENABLE_ROCM )
namespace Experimental {
class ROCmSpace ; ///< Memory space on ROCm GPU
class ROCm ; ///< Execution space for ROCm GPU
}
#endif
template<class ExecutionSpace, class MemorySpace>
struct Device;
@@ -140,6 +147,8 @@ namespace Kokkos {
typedef Cuda DefaultExecutionSpace;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
typedef Experimental::OpenMPTarget DefaultExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM )
typedef Experimental::ROCm DefaultExecutionSpace ;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
@@ -185,6 +194,8 @@ namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined( KOKKOS_ENABLE_CUDA )
typedef Kokkos::CudaSpace ActiveExecutionMemorySpace;
#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU )
typedef Kokkos::HostSpace ActiveExecutionMemorySpace ;
#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
typedef Kokkos::HostSpace ActiveExecutionMemorySpace;
#else

View File

@@ -98,18 +98,18 @@ public:
typedef View<size_type* , array_layout, device_type> row_map_type;
typedef View<DataType* , array_layout, device_type> entries_type;
entries_type entries;
row_map_type row_map;
entries_type entries;
//! Construct an empty view.
Crs () : entries(), row_map() {}
Crs() : row_map(), entries() {}
//! Copy constructor (shallow copy).
Crs (const Crs& rhs) : entries (rhs.entries), row_map (rhs.row_map)
Crs(const Crs& rhs) : row_map(rhs.row_map), entries(rhs.entries)
{}
template<class EntriesType, class RowMapType>
Crs (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_)
Crs(const RowMapType& row_map_, const EntriesType& entries_) : row_map(row_map_), entries(entries_)
{}
/** \brief Assign to a view of the rhs array.
@@ -117,8 +117,8 @@ public:
* then allocated memory is deallocated.
*/
Crs& operator= (const Crs& rhs) {
entries = rhs.entries;
row_map = rhs.row_map;
entries = rhs.entries;
return *this;
}
@@ -151,7 +151,7 @@ void get_crs_transpose_counts(
template< class OutCounts,
class InCrs>
void get_crs_row_map_from_counts(
typename OutCounts::value_type get_crs_row_map_from_counts(
OutCounts& out,
InCrs const& in,
std::string const& name = "row_map");
@@ -204,18 +204,20 @@ class CrsRowMapFromCounts {
using execution_space = typename InCounts::execution_space;
using value_type = typename OutRowMap::value_type;
using index_type = typename InCounts::size_type;
using last_value_type = Kokkos::View<value_type, execution_space>;
private:
InCounts in;
OutRowMap out;
InCounts m_in;
OutRowMap m_out;
last_value_type m_last_value;
public:
KOKKOS_INLINE_FUNCTION
void operator()(index_type i, value_type& update, bool final_pass) const {
update += in(i);
if (final_pass) {
out(i + 1) = update;
if (i == 0) {
out(0) = 0;
}
if (i < m_in.size()) {
update += m_in(i);
if (final_pass) m_out(i + 1) = update;
} else if (final_pass) {
m_out(0) = 0;
m_last_value() = update;
}
}
KOKKOS_INLINE_FUNCTION
@@ -226,12 +228,16 @@ class CrsRowMapFromCounts {
}
using self_type = CrsRowMapFromCounts<InCounts, OutRowMap>;
CrsRowMapFromCounts(InCounts const& arg_in, OutRowMap const& arg_out):
in(arg_in),out(arg_out) {
m_in(arg_in), m_out(arg_out), m_last_value("last_value") {
}
value_type execute() {
using policy_type = RangePolicy<index_type, execution_space>;
using closure_type = Kokkos::Impl::ParallelScan<self_type, policy_type>;
closure_type closure(*this, policy_type(0, in.size()));
closure_type closure(*this, policy_type(0, m_in.size() + 1));
closure.execute();
execution_space::fence();
auto last_value = Kokkos::create_mirror_view(m_last_value);
Kokkos::deep_copy(last_value, m_last_value);
return last_value();
}
};
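The intended result of this scan can be stated with a small serial sketch (an assumed stand-alone helper using std::vector, not the parallel Kokkos path): the row map is the exclusive prefix sum of the counts, and the captured last value is the total entry count, which execute() now returns.
// counts {2, 1, 3} -> row_map {0, 2, 3, 6}, return value 6
std::size_t row_map_from_counts( std::vector<int> const& counts ,
                                 std::vector<int>& row_map ) {
  row_map.assign( counts.size() + 1 , 0 );
  for ( std::size_t i = 0 ; i < counts.size() ; ++i )
    row_map[i + 1] = row_map[i] + counts[i];
  return row_map.back();
}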
@@ -297,13 +303,14 @@ void get_crs_transpose_counts(
template< class OutRowMap,
class InCounts>
void get_crs_row_map_from_counts(
typename OutRowMap::value_type get_crs_row_map_from_counts(
OutRowMap& out,
InCounts const& in,
std::string const& name) {
out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
Kokkos::Impl::Experimental::
CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
return functor.execute();
}
template< class DataType,
@@ -328,6 +335,65 @@ void transpose_crs(
FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
}
template< class CrsType,
class Functor>
struct CountAndFill {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using entries_type = typename CrsType::entries_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
KOKKOS_INLINE_FUNCTION void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
data_type* fill = &(m_crs.entries(j));
m_functor(i, fill);
}
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
m_crs(crs),
m_functor(f)
{
using execution_space = typename CrsType::execution_space;
m_counts = counts_type("counts", nrows);
{
using count_policy_type = RangePolicy<size_type, execution_space, Count>;
using count_closure_type =
Kokkos::Impl::ParallelFor<self_type, count_policy_type>;
const count_closure_type closure(*this, count_policy_type(0, nrows));
closure.execute();
}
auto nentries = Kokkos::Experimental::
get_crs_row_map_from_counts(m_crs.row_map, m_counts);
m_counts = counts_type();
m_crs.entries = entries_type("entries", nentries);
{
using fill_policy_type = RangePolicy<size_type, execution_space, Fill>;
using fill_closure_type =
Kokkos::Impl::ParallelFor<self_type, fill_policy_type>;
const fill_closure_type closure(*this, fill_policy_type(0, nrows));
closure.execute();
}
crs = m_crs;
}
};
template< class CrsType,
class Functor>
void count_and_fill_crs(
CrsType& crs,
typename CrsType::size_type nrows,
Functor const& f) {
Kokkos::Experimental::CountAndFill<CrsType, Functor>(crs, nrows, f);
}
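A hedged usage sketch of count_and_fill_crs: the functor is invoked once per row with fill == nullptr and must return that row's entry count, then once more with a pointer into the allocated entries. The 'graph' object and the lambda body are assumptions for illustration.
// Row i stores the two entries {i, i+1}.
Kokkos::Experimental::count_and_fill_crs( graph , nrows ,
  KOKKOS_LAMBDA ( int i , int* fill ) -> int {
    if ( fill ) { fill[0] = i ; fill[1] = i + 1 ; }
    return 2 ;
  } );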
}} // namespace Kokkos::Experimental
#endif /* #define KOKKOS_CRS_HPP */

View File

@@ -96,6 +96,14 @@
//----------------------------------------------------------------------------
#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_THREADS) || \
defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_QTHREADS) || \
defined(KOKKOS_ENABLE_ROCM) || defined(KOKKOS_ENABLE_OPENMPTARGET)
#define KOKKOS_INTERNAL_ENABLE_NON_CUDA_BACKEND
#endif
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// Compiling with a CUDA compiler.
//
@@ -133,6 +141,9 @@
#if ( CUDA_VERSION < 8000 ) && defined( __NVCC__ )
#define KOKKOS_LAMBDA [=]__device__
#if defined( KOKKOS_INTERNAL_ENABLE_NON_CUDA_BACKEND )
#undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
#endif
#else
#define KOKKOS_LAMBDA [=]__host__ __device__
@@ -141,16 +152,13 @@
#endif
#endif
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
// Cuda version 8.0 still needs the functor wrapper
#if /* ( CUDA_VERSION < 8000 ) && */ defined( __NVCC__ )
#if defined( __NVCC__ )
#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
#endif
#endif
#endif
#else // !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
#undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
#endif // !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
//----------------------------------------------------------------------------
// Language info: C++, CUDA, OPENMP
@@ -161,8 +169,20 @@
#define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__
#define KOKKOS_INLINE_FUNCTION __device__ __host__ inline
#define KOKKOS_FUNCTION __device__ __host__
#ifdef KOKKOS_COMPILER_CLANG
#define KOKKOS_FUNCTION_DEFAULTED KOKKOS_FUNCTION
#endif
#endif // #if defined( __CUDA_ARCH__ )
#if defined( KOKKOS_ENABLE_ROCM ) && defined( __HCC__ )
#define KOKKOS_FORCEINLINE_FUNCTION __attribute__((amp,cpu)) inline
#define KOKKOS_INLINE_FUNCTION __attribute__((amp,cpu)) inline
#define KOKKOS_FUNCTION __attribute__((amp,cpu))
#define KOKKOS_LAMBDA [=] __attribute__((amp,cpu))
#define KOKKOS_FUNCTION_DEFAULTED KOKKOS_FUNCTION
#endif
#if defined( _OPENMP )
// Compiling with OpenMP.
// The value of _OPENMP is an integer value YYYYMM
@@ -179,15 +199,6 @@
// Host code is compiled again with another compiler.
// Device code is compile to 'ptx'.
#define KOKKOS_COMPILER_NVCC __NVCC__
#else
#if !defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
#if !defined( KOKKOS_ENABLE_CUDA ) // Compiling with clang for Cuda does not work with LAMBDAs either
// CUDA (including version 6.5) does not support giving lambdas as
// arguments to global functions. Thus it's not currently possible
// to dispatch lambdas from the host.
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif
#endif // #if defined( __NVCC__ )
#if !defined( KOKKOS_LAMBDA )
@@ -321,6 +332,10 @@
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#if ! defined( KOKKOS_ENABLE_ASM )
#define KOKKOS_ENABLE_ASM 1
#endif
#endif
//----------------------------------------------------------------------------
@@ -397,6 +412,10 @@
#define KOKKOS_FUNCTION /**/
#endif
#if !defined( KOKKOS_FUNCTION_DEFAULTED )
#define KOKKOS_FUNCTION_DEFAULTED /**/
#endif
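A sketch of how the new KOKKOS_FUNCTION_DEFAULTED macro is applied; the struct is illustrative. It marks defaulted special members, expanding to the __host__ __device__ markup where the active compiler requires it (e.g. clang as the CUDA compiler, or HCC for ROCm) and to nothing otherwise:
struct MyFunctor {
  KOKKOS_FUNCTION_DEFAULTED MyFunctor() = default ;
  KOKKOS_FUNCTION_DEFAULTED ~MyFunctor() = default ;
};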
//----------------------------------------------------------------------------
// Define empty macro for restrict if necessary:
@@ -424,6 +443,7 @@
// There is zero or one default execution space specified.
#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
@@ -435,6 +455,7 @@
// If default is not specified then choose from enabled execution spaces.
// Priority: CUDA, ROCM, OPENMPTARGET, OPENMP, THREADS, QTHREADS, SERIAL
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
@@ -442,6 +463,8 @@
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
#elif defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
#elif defined( KOKKOS_ENABLE_ROCM )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM
#elif defined( KOKKOS_ENABLE_OPENMPTARGET )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET
#elif defined( KOKKOS_ENABLE_OPENMP )
@@ -459,6 +482,8 @@
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
#elif defined( __HCC__ ) && defined( __HCC_ACCELERATOR__ ) && defined( KOKKOS_ENABLE_ROCM )
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU
#else
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
#endif

View File

@@ -233,12 +233,24 @@ public:
//--------------------------------------------------------------------------
MemoryPool() = default ;
MemoryPool( MemoryPool && ) = default ;
MemoryPool( const MemoryPool & ) = default ;
MemoryPool & operator = ( MemoryPool && ) = default ;
MemoryPool & operator = ( const MemoryPool & ) = default ;
MemoryPool()
: m_tracker()
, m_sb_state_array(0)
, m_sb_state_size(0)
, m_sb_size_lg2(0)
, m_max_block_size_lg2(0)
, m_min_block_size_lg2(0)
, m_sb_count(0)
, m_hint_offset(0)
, m_data_offset(0)
, m_unused_padding(0)
{}
/**\brief Allocate a memory pool from 'memspace'.
*
* The memory pool will have at least 'min_total_alloc_size' bytes
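A hedged usage sketch of the pool interface; the space object, sizes, and the 256-byte request are illustrative values, not defaults from this header:
Kokkos::MemoryPool< Kokkos::DefaultExecutionSpace > pool( memspace , min_total_alloc_size );
void * p = pool.allocate( 256 ); // may return nullptr when the pool is exhausted
if ( p ) pool.deallocate( p , 256 );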

View File

@@ -1016,7 +1016,7 @@ parallel_reduce( std::string const & arg_label
//------------------------------
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelReduce(arg_label, 0, &kpID);
@@ -1042,7 +1042,7 @@ parallel_reduce( std::string const & arg_label
//------------------------------
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelReduce(kpID);
}

View File

@@ -0,0 +1,220 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCM_HPP
#define KOKKOS_ROCM_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ENABLE_ROCM )
#include <ROCm/hc_math_std.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#include <cstddef>
#include <iosfwd>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ROCmSpace.hpp>
#include <ROCm/Kokkos_ROCm_Exec.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
/*--------------------------------------------------------------------------*/
#include <hc.hpp>
#include <hc_am.hpp>
#include <amp_math.h>
#if defined( __HCC_ACCELERATOR__ )
using namespace ::Concurrency::precise_math ;
#endif
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class ROCmExec ;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
/// \class ROCm
/// \brief Execution space for ROCm GPU.
class ROCm {
public:
//------------------------------------
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as a kokkos execution space
typedef ROCm execution_space ;
typedef ROCmSpace memory_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef LayoutLeft array_layout ;
typedef HostSpace::size_type size_type ;
typedef ScratchMemorySpace< ROCm > scratch_memory_space ;
~ROCm() {}
ROCm();
// explicit ROCm( const int instance_id );
ROCm( ROCm && ) = default ;
ROCm( const ROCm & ) = default ;
ROCm & operator = ( ROCm && ) = default ;
ROCm & operator = ( const ROCm & ) = default ;
//@}
//------------------------------------
//! \name Functions that all Kokkos devices must implement.
//@{
KOKKOS_INLINE_FUNCTION static int in_parallel() {
#if defined( __HCC_ACCELERATOR__ )
return true;
#else
return false;
#endif
}
/** \brief Set the device in a "sleep" state. */
static bool sleep() ;
/** \brief Wake the device from the 'sleep' state. */
static bool wake() ;
/** \brief Wait until all dispatched functors complete. */
static void fence() ;
/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
/// \brief Free any resources being consumed by the device.
static void finalize() ;
/** \brief Initialize the device.
*
*/
struct SelectDevice {
int rocm_device_id ;
SelectDevice() : rocm_device_id(1) {}
explicit SelectDevice( int id ) : rocm_device_id( id+1 ) {}
};
int rocm_device() const { return m_device ; }
bool isAPU();
bool isAPU(int device);
static void initialize( const SelectDevice = SelectDevice());
static int is_initialized();
// static size_type device_arch();
// static size_type detect_device_count();
static int concurrency() ;
static const char* name();
private:
int m_device ;
};
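A hedged sketch of driving this execution space directly; the device id 0 is illustrative, and most applications initialize through Kokkos::initialize rather than calling these statics themselves:
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
// ... dispatch parallel work on the ROCm space ...
Kokkos::Experimental::ROCm::fence(); // wait for dispatched functors
Kokkos::Experimental::ROCm::finalize();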
} // namespace Experimental
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
template<>
struct MemorySpaceAccess
< Kokkos::Experimental::ROCmSpace
, Kokkos::Experimental::ROCm::scratch_memory_space
>
{
enum { assignable = false };
enum { accessible = true };
enum { deepcopy = false };
};
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::Experimental::ROCm::memory_space
, Kokkos::Experimental::ROCm::scratch_memory_space
>
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) { }
KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
};
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::HostSpace
, Kokkos::Experimental::ROCm::scratch_memory_space
>
{
enum { value = false };
inline static void verify( void ) { Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Experimental::ROCmSpace::access_error(p); }
};
} // namespace Impl
} // namespace Kokkos
#include <ROCm/Kokkos_ROCm_Parallel.hpp>
#include <ROCm/Kokkos_ROCm_Task.hpp>
#endif
#endif

View File

@@ -0,0 +1,622 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCMSPACE_HPP
#define KOKKOS_ROCMSPACE_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ENABLE_ROCM )
#include <iosfwd>
#include <typeinfo>
#include <string>
#include <Kokkos_HostSpace.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
/** \brief ROCm on-device memory management */
class ROCmSpace {
public:
//! Tag this class as a kokkos memory space
typedef ROCmSpace memory_space ;
typedef Kokkos::Experimental::ROCm execution_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef unsigned int size_type ;
/*--------------------------------*/
ROCmSpace();
ROCmSpace( ROCmSpace && rhs ) = default ;
ROCmSpace( const ROCmSpace & rhs ) = default ;
ROCmSpace & operator = ( ROCmSpace && rhs ) = default ;
ROCmSpace & operator = ( const ROCmSpace & rhs ) = default ;
~ROCmSpace() = default ;
/**\brief Allocate untracked memory in the rocm space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate untracked memory in the rocm space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name() { return m_name; };
/*--------------------------------*/
/** \brief Error reporting for HostSpace attempt to access ROCmSpace */
static void access_error();
static void access_error( const void * const );
private:
int m_device ; ///< Which ROCm device
static constexpr const char* m_name = "ROCm";
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > ;
};
} // namespace Experimental
namespace Impl {
void * rocm_device_allocate(int);
void * rocm_hostpinned_allocate(int);
void rocm_device_free(void * );
/// \brief Initialize lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function initializes the locks to zero (unset).
void init_lock_arrays_rocm_space();
/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function retrieves the lock array pointer.
/// If the array is not yet allocated it will do so.
int* atomic_lock_array_rocm_space_ptr(bool deallocate = false);
/// \brief Retrieve the pointer to the scratch array for team and thread private global memory.
///
/// Team and Thread private scratch allocations in
/// global memory are acquired via locks.
/// This function retrieves the lock array pointer.
/// If the array is not yet allocated it will do so.
int* scratch_lock_array_rocm_space_ptr(bool deallocate = false);
/// \brief Retrieve the pointer to the scratch array for unique identifiers.
///
/// Unique identifiers in the range 0-ROCm::concurrency
/// are provided via locks.
/// This function retrieves the lock array pointer.
/// If the array is not yet allocated it will do so.
int* threadid_lock_array_rocm_space_ptr(bool deallocate = false);
}
} // namespace Kokkos
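The lock arrays documented above implement a hash-of-address scheme; a tiny sketch of the indexing idea, with the table size and shift chosen here purely for illustration (requires <cstdint>):
// Atomics on the same address always hash to the same lock slot.
constexpr std::uintptr_t LOCK_TABLE_MASK = 0x1FFFF; // assumed table size - 1
inline int lock_index( void const* ptr ) {
  return static_cast<int>( ( reinterpret_cast<std::uintptr_t>(ptr) >> 2 ) & LOCK_TABLE_MASK );
}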
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
/** \brief Host memory that is accessible to ROCm execution space
* through ROCm's host-pinned memory allocation.
*/
class ROCmHostPinnedSpace {
public:
//! Tag this class as a kokkos memory space
/** \brief Memory is in HostSpace so use the HostSpace::execution_space */
typedef HostSpace::execution_space execution_space ;
typedef ROCmHostPinnedSpace memory_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef unsigned int size_type ;
/*--------------------------------*/
ROCmHostPinnedSpace();
ROCmHostPinnedSpace( ROCmHostPinnedSpace && rhs ) = default ;
ROCmHostPinnedSpace( const ROCmHostPinnedSpace & rhs ) = default ;
ROCmHostPinnedSpace & operator = ( ROCmHostPinnedSpace && rhs ) = default ;
ROCmHostPinnedSpace & operator = ( const ROCmHostPinnedSpace & rhs ) = default ;
~ROCmHostPinnedSpace() = default ;
/**\brief Allocate untracked memory in the space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate untracked memory in the space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name() { return m_name; };
private:
static constexpr const char* m_name = "ROCmHostPinned";
/*--------------------------------*/
};
} // namespace Experimental
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace >::assignable , "" );
//----------------------------------------
template<>
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace > {
enum { assignable = false };
enum { accessible = false };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace > {
// HostSpace::execution_space == ROCmHostPinnedSpace::execution_space
enum { assignable = true };
enum { accessible = true };
enum { deepcopy = true };
};
//----------------------------------------
template<>
struct MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::HostSpace > {
enum { assignable = false };
enum { accessible = false };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace > {
// ROCmSpace::execution_space != ROCmHostPinnedSpace::execution_space
enum { assignable = false };
enum { accessible = true }; // ROCmSpace::execution_space
enum { deepcopy = true };
};
//----------------------------------------
// ROCmHostPinnedSpace::execution_space == HostSpace::execution_space
// ROCmHostPinnedSpace accessible to both ROCm and Host
template<>
struct MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::HostSpace > {
enum { assignable = false }; // Cannot access from ROCm
enum { accessible = true }; // ROCmHostPinnedSpace::execution_space
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmSpace > {
enum { assignable = false }; // Cannot access from Host
enum { accessible = false };
enum { deepcopy = true };
};
} // namespace Impl
//----------------------------------------
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
hc::completion_future DeepCopyAsyncROCm( void * dst , const void * src , size_t n);
template<> struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm>
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};
template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
fut.wait();
// DeepCopy (dst,src,n);
}
};
template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm>( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
DeepCopy (dst,src,n);
}
};
template<class ExecutionSpace>
struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
DeepCopy (dst,src,n);
}
};
template<> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm>
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , Kokkos::Experimental::ROCm >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};
template<class ExecutionSpace>
struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace>
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
fut.wait();
// DeepCopyROCm (dst,src,n);
}
};
template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
fut.wait();
// DeepCopyROCm (dst,src,n);
}
};
template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
// hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
// fut.wait();
// DeepCopyAsyncROCm (dst,src,n);
DeepCopy (dst,src,n);
}
};
template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , Kokkos::Experimental::ROCm>( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
DeepCopy (dst,src,n);
}
};
template<class ExecutionSpace>
struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
DeepCopy (dst,src,n);
}
};
} // namespace Impl
} // namespace Kokkos
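A hedged usage sketch of what these specializations serve; the view names and extent n are illustrative:
Kokkos::View< double* , Kokkos::Experimental::ROCmSpace > d( "d" , n );
auto h = Kokkos::create_mirror_view( d ); // HostSpace mirror of the device view
Kokkos::deep_copy( d , h ); // HostSpace -> ROCmSpace
Kokkos::deep_copy( h , d ); // ROCmSpace -> HostSpace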
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** Running in ROCmSpace attempting to access HostSpace: error */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::ROCmSpace , Kokkos::HostSpace >
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static void verify( void )
{ Kokkos::abort("ROCm code attempted to access HostSpace memory"); }
KOKKOS_INLINE_FUNCTION static void verify( const void * )
{ Kokkos::abort("ROCm code attempted to access HostSpace memory"); }
};
/** Running in ROCmSpace accessing ROCmHostPinnedSpace: ok */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace >
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) { }
KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
};
/** Running in ROCmSpace attempting to access an unknown space: error */
template< class OtherSpace >
struct VerifyExecutionCanAccessMemorySpace<
typename enable_if< ! is_same<Kokkos::Experimental::ROCmSpace,OtherSpace>::value , Kokkos::Experimental::ROCmSpace >::type ,
OtherSpace >
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static void verify( void )
{ Kokkos::abort("ROCm code attempted to access unknown Space memory"); }
KOKKOS_INLINE_FUNCTION static void verify( const void * )
{ Kokkos::abort("ROCm code attempted to access unknown Space memory"); }
};
//----------------------------------------------------------------------------
/** Running in HostSpace attempting to access ROCmSpace */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace >
{
enum { value = false };
inline static void verify( void ) { Kokkos::Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Kokkos::Experimental::ROCmSpace::access_error(p); }
};
/** Running in HostSpace accessing ROCmHostPinnedSpace is OK */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace >
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) {}
KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
static RecordBase s_root_record ;
const Kokkos::Experimental::ROCmSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
std::string get_label() const ;
static SharedAllocationRecord * allocate( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );
/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );
/**\brief Reallocate tracked memory in the space */
static
void * reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size );
/**\brief Deallocate tracked memory in the space */
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::Experimental::ROCmSpace & , bool detail = false );
};
template<>
class SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
static RecordBase s_root_record ;
const Kokkos::Experimental::ROCmHostPinnedSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() : RecordBase(), m_space() {}
SharedAllocationRecord( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
std::string get_label() const ;
static SharedAllocationRecord * allocate( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
);
/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );
/**\brief Reallocate tracked memory in the space */
static
void * reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size );
/**\brief Deallocate tracked memory in the space */
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::Experimental::ROCmHostPinnedSpace & , bool detail = false );
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_ROCM ) */
#endif /* #define KOKKOS_ROCMSPACE_HPP */

View File

@@ -681,6 +681,67 @@ public:
return f ;
}
template < class F >
KOKKOS_FUNCTION
Future< execution_space >
when_all( int narg , F const func )
{
using input_type = decltype( func(0) );
using future_type = Future< execution_space > ;
using task_base = Kokkos::Impl::TaskBase< void , void , void > ;
static_assert( is_future< input_type >::value
, "Functor must return a Kokkos::Future" );
future_type f ;
if ( 0 == narg ) return f ;
size_t const alloc_size = m_queue->when_all_allocation_size( narg );
f.m_task =
reinterpret_cast< task_base * >( m_queue->allocate( alloc_size ) );
if ( f.m_task ) {
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
new( f.m_task ) task_base();
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = alloc_size ;
f.m_task->m_dep_count = narg ;
f.m_task->m_task_type = task_base::Aggregate ;
// Assign dependences, reference counts were already incremented
task_base * volatile * const dep =
f.m_task->aggregate_dependences();
for ( int i = 0 ; i < narg ; ++i ) {
const input_type arg_f = func(i);
if ( 0 != arg_f.m_task ) {
if ( m_queue != static_cast< queue_type * >( arg_f.m_task->m_queue ) ) {
Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
}
// Increment reference count to track subsequent assignment.
Kokkos::atomic_increment( &(arg_f.m_task->m_ref_count) );
dep[i] = arg_f.m_task ;
}
}
Kokkos::memory_fence();
m_queue->schedule_aggregate( f.m_task );
// this when_all may be processed at any moment
}
return f ;
}
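A hedged usage sketch of this generator form of when_all; the scheduler 'sched', the 'deps' array, and 'narg' are assumptions. Each func(i) must return a future from the same scheduler, as the abort check above enforces:
auto all_done = sched.when_all( narg ,
  [=] ( int i ) { return deps[i] ; } );
// 'all_done' becomes ready only after every deps[i] has completed.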
//----------------------------------------
KOKKOS_INLINE_FUNCTION

View File

@@ -2429,6 +2429,7 @@ template < class ValueType >
struct CommonViewAllocProp< void, ValueType >
{
using value_type = ValueType;
using scalar_array_type = ValueType;
template < class ... Views >
CommonViewAllocProp( const Views & ... ) {}

View File

@@ -0,0 +1,439 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <hc.hpp>
//#include <hsa_atomic.h>
#ifdef KOKKOS_ENABLE_ROCM_ATOMICS
namespace Kokkos {
//ROCm can do:
//Types int/unsigned int
//variants: atomic_exchange/compare_exchange/fetch_add/fetch_sub/fetch_max/fetch_min/fetch_and/fetch_or/fetch_xor/fetch_inc/fetch_dec
KOKKOS_INLINE_FUNCTION
int atomic_exchange(int* dest, const int& val) {
return hc::atomic_exchange_int(dest, val);
}
KOKKOS_INLINE_FUNCTION
unsigned int atomic_exchange(unsigned int* dest, const unsigned int& val) {
return hc::atomic_exchange_unsigned(dest, val);
}
KOKKOS_INLINE_FUNCTION
int64_t atomic_exchange(int64_t* dest, const int64_t& val) {
return (int64_t)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
uint64_t atomic_exchange(uint64_t* dest, const uint64_t& val) {
return hc::atomic_exchange_uint64(dest, val);
}
KOKKOS_INLINE_FUNCTION
long long atomic_exchange(long long* dest, const long long& val) {
return (long long)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
unsigned long long atomic_exchange(unsigned long long* dest, const unsigned long long& val) {
return (unsigned long long)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
float atomic_exchange(float* dest, const float& val) {
union U {
int i ;
float f ;
KOKKOS_INLINE_FUNCTION U() {};
} idest,ival;
idest.f = *dest;
ival.f = val;
idest.i = hc::atomic_exchange_int((int*)dest, ival.i);
return idest.f;
}
KOKKOS_INLINE_FUNCTION
double atomic_exchange(double* dest, const double& val) {
union U {
uint64_t i ;
double d ;
KOKKOS_INLINE_FUNCTION U() {};
} idest,ival;
idest.d = *dest;
ival.d = val;
idest.i = hc::atomic_exchange_uint64((uint64_t*)dest, ival.i);
return idest.d;
}
KOKKOS_INLINE_FUNCTION
int atomic_compare_exchange(int* dest, int compare, const int& val);
KOKKOS_INLINE_FUNCTION
int64_t atomic_compare_exchange(int64_t* dest, int64_t compare, const int64_t& val);
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
union U {
int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
assume.i = oldval.i ;
newval.t = val ;
atomic_compare_exchange( reinterpret_cast<int*>(dest) , assume.i, newval.i );
return oldval.t ;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
union U {
uint64_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
assume.i = oldval.i ;
newval.t = val ;
atomic_compare_exchange( (int64_t*)(dest) , assume.i, newval.i );
return oldval.t ;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(int64_t), const T&>::type val) {
return val;
}
KOKKOS_INLINE_FUNCTION
int atomic_compare_exchange(int* dest, int compare, const int& val) {
return hc::atomic_compare_exchange_int(dest, compare, val);
}
KOKKOS_INLINE_FUNCTION
unsigned int atomic_compare_exchange(unsigned int* dest, unsigned int compare, const unsigned int& val) {
return hc::atomic_compare_exchange_unsigned(dest, compare, val);
}
KOKKOS_INLINE_FUNCTION
int64_t atomic_compare_exchange(int64_t* dest, int64_t compare, const int64_t& val) {
return (int64_t) hc::atomic_compare_exchange_uint64((uint64_t*)dest, (uint64_t)compare, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
uint64_t atomic_compare_exchange(uint64_t* dest, uint64_t compare, const uint64_t& val) {
return hc::atomic_compare_exchange_uint64(dest, compare, val);
}
KOKKOS_INLINE_FUNCTION
long long atomic_compare_exchange(long long* dest, long long compare, const long long& val) {
return (long long)hc::atomic_compare_exchange_uint64((uint64_t*)(dest), (uint64_t)(compare), (const uint64_t&)(val));
}
KOKKOS_INLINE_FUNCTION
float atomic_compare_exchange(float* dest, float compare, const float& val) {
union U {
int i ;
float f ;
KOKKOS_INLINE_FUNCTION U() {};
} idest,icompare,ival;
idest.f = *dest;
icompare.f = compare;
ival.f = val;
idest.i = hc::atomic_compare_exchange_int(reinterpret_cast<int*>(dest), icompare.i, ival.i);
return idest.f;
}
KOKKOS_INLINE_FUNCTION
double atomic_compare_exchange(double* dest, double compare, const double& val) {
union U {
uint64_t i ;
double d ;
KOKKOS_INLINE_FUNCTION U() {};
} idest,icompare,ival;
idest.d = *dest;
icompare.d = compare;
ival.d = val;
idest.i = hc::atomic_compare_exchange_uint64(reinterpret_cast<uint64_t*>(dest), icompare.i, ival.i);
return idest.d;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
union U {
int i ;
T f ;
KOKKOS_INLINE_FUNCTION U() {};
} idest,icompare,ival;
idest.f = *dest;
icompare.f = compare;
ival.f = val;
idest.i = hc::atomic_compare_exchange_int((int*)(dest), icompare.i, ival.i);
return idest.f;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<sizeof(T) == sizeof(int64_t), const T&>::type val) {
union U {
uint64_t i ;
T f ;
KOKKOS_INLINE_FUNCTION U() {};
} idest,icompare,ival;
idest.f = *dest;
icompare.f = compare;
ival.f = val;
idest.i = hc::atomic_compare_exchange_uint64((uint64_t*)(dest), icompare.i, ival.i);
return idest.f;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<(sizeof(T) != sizeof(int32_t)) && (sizeof(T) != sizeof(int64_t)), const T&>::type val) {
return val;
}
KOKKOS_INLINE_FUNCTION
int atomic_fetch_add (volatile int * dest, const int& val) {
return hc::atomic_fetch_add((int *)dest, val);
}
KOKKOS_INLINE_FUNCTION
unsigned int atomic_fetch_add(unsigned int* dest, const unsigned int& val) {
return hc::atomic_fetch_add(dest, val);
}
KOKKOS_INLINE_FUNCTION
unsigned long atomic_fetch_add(volatile unsigned long* dest, const unsigned long& val) {
return (unsigned long)hc::atomic_fetch_add((uint64_t *)dest, (const uint64_t)val);
}
KOKKOS_INLINE_FUNCTION
int64_t atomic_fetch_add(volatile int64_t* dest, const int64_t& val) {
return (int64_t)hc::atomic_fetch_add((uint64_t *)dest, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
char atomic_fetch_add(volatile char * dest, const char& val) {
unsigned int oldval,newval,assume;
oldval = *(int *)dest ;
do {
assume = oldval ;
newval = ( assume & 0xffffff00 ) | ( ( ( assume & 0xff ) + val ) & 0xff ) ; // keep upper bytes, wrap low byte
oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
} while ( assume != oldval );
return oldval ;
}
KOKKOS_INLINE_FUNCTION
short atomic_fetch_add(volatile short * dest, const short& val) {
unsigned int oldval,newval,assume;
oldval = *(int *)dest ;
do {
assume = oldval ;
newval = ( assume & 0xffff0000 ) | ( ( ( assume & 0xffff ) + val ) & 0xffff ) ; // keep upper halfword, wrap low halfword
oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
} while ( assume != oldval );
return oldval ;
}
KOKKOS_INLINE_FUNCTION
long long atomic_fetch_add(volatile long long * dest, const long long& val) {
return (long long)hc::atomic_fetch_add((uint64_t*)dest, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
int atomic_fetch_sub (volatile int * dest, const int& val) {
return hc::atomic_fetch_sub((int *)dest, val);
}
KOKKOS_INLINE_FUNCTION
unsigned int atomic_fetch_sub(volatile unsigned int* dest, const unsigned int& val) {
return hc::atomic_fetch_sub((unsigned int *)dest, val);
}
KOKKOS_INLINE_FUNCTION
int64_t atomic_fetch_sub(int64_t* dest, const int64_t& val) {
return (int64_t)hc::atomic_fetch_add((uint64_t *)dest, -(const uint64_t&)val);
// return (int64_t)hc::atomic_fetch_sub_uint64((uint64_t*)dest, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
char atomic_fetch_sub(volatile char * dest, const char& val) {
unsigned int oldval,newval,assume;
oldval = *(int *)dest ;
do {
assume = oldval ;
newval = ( assume & 0xffffff00 ) | ( ( ( assume & 0xff ) - val ) & 0xff ) ; // keep upper bytes, wrap low byte
oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
} while ( assume != oldval );
return oldval ;
}
KOKKOS_INLINE_FUNCTION
short atomic_fetch_sub(volatile short * dest, const short& val) {
unsigned int oldval,newval,assume;
oldval = *(int *)dest ;
do {
assume = oldval ;
newval = (assume & 0xffff0000) | (((assume & 0xffff) - val) & 0xffff) ;
oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
} while ( assume != oldval );
return oldval ;
}
KOKKOS_INLINE_FUNCTION
long long atomic_fetch_sub(volatile long long * dest, const long long& val) {
return (long long)hc::atomic_fetch_add((uint64_t*)dest, (uint64_t)(-val));
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_fetch_add(volatile T* dest, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
union U {
unsigned int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = atomic_compare_exchange( (unsigned int*)(dest) , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_fetch_add(volatile T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
union U {
uint64_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = atomic_compare_exchange( (uint64_t*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
//WORKAROUND
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_fetch_add(volatile T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(int64_t), const T&>::type val) {
return val ;
}
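// Note: like the CAS fallback above, this workaround silently returns 'val'
// for sizes with no 32/64-bit path; *dest is not updated.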
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_fetch_sub(volatile T* dest, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
union U {
int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_fetch_sub(volatile T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
union U {
int64_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = atomic_compare_exchange( (int64_t*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
}
#endif

View File

@ -0,0 +1,51 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef GUARD_CORE_KOKKOS_ROCM_CONFIG_HPP
#define GUARD_CORE_KOKKOS_ROCM_CONFIG_HPP
#ifndef KOKKOS_ROCM_HAS_WORKAROUNDS
#define KOKKOS_ROCM_HAS_WORKAROUNDS 1
#endif
#endif

View File

@ -0,0 +1,133 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCMEXEC_HPP
#define KOKKOS_ROCMEXEC_HPP
#include <algorithm>
#include <typeinfo>
#include <Kokkos_Macros.hpp>
//#include <ROCm/Kokkos_ROCmExec.hpp>
#include <hc.hpp>
#define ROCM_SPACE_ATOMIC_MASK 0x1FFFF
#define ROCM_SPACE_ATOMIC_XOR_MASK 0x15A39
#define ROCM_CONCURRENCY 20480
//#define ROCM_CONCURRENCY 81920 // for Fiji
namespace Kokkos {
static int rocm_space_atomic_locks[ROCM_SPACE_ATOMIC_MASK+1];
static int rocm_space_scratch_locks[ROCM_CONCURRENCY];
static int rocm_space_threadid_locks[ROCM_CONCURRENCY];
namespace Impl {
// TODO: mimic the CUDA implementation, add dGPU capability
void init_rocm_atomic_lock_array() {
static int is_initialized = 0;
if(!is_initialized)
{
for(int i = 0; i < ROCM_SPACE_ATOMIC_MASK+1; i++)
rocm_space_atomic_locks[i] = 0;
is_initialized = 1;
}
}
void init_rocm_scratch_lock_array() {
static int is_initialized = 0;
if(!is_initialized)
{
for(int i = 0; i < ROCM_CONCURRENCY; i++)
rocm_space_scratch_locks[i] = 0;
is_initialized = 1;
}
}
void init_rocm_threadid_lock_array() {
static int is_initialized = 0;
if(!is_initialized)
{
for(int i = 0; i < ROCM_CONCURRENCY; i++)
rocm_space_threadid_locks[i] = 0;
is_initialized = 1;
}
}
void init_lock_arrays_rocm_space() {
init_rocm_atomic_lock_array();
// init_rocm_scratch_lock_array();
// init_rocm_threadid_lock_array();
}
}
} // namespace Kokkos
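// Illustration only: these lock arrays back atomics on types too large for a
// native path. A caller hashes the destination address to a slot, spins until
// it acquires the slot, applies the update, then releases it. A hypothetical
// sketch of that pattern (not part of this commit):
#if 0
template<class T, class Op>
T rocm_locked_atomic_op(volatile T* dest, const T& val, Op op) {
  while( !Kokkos::Impl::lock_address_rocm_space((void*)dest) ) {}
  T old = *dest;
  *dest = op(old, val);   // arbitrary read-modify-write under the lock
  Kokkos::Impl::unlock_address_rocm_space((void*)dest);
  return old;
}
#endif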
#if 0
namespace Kokkos {
namespace Impl {
KOKKOS_INLINE_FUNCTION
bool lock_address_rocm_space(void* ptr) {
#if 0
return(Kokkos::Impl::lock_address_host_space(ptr));
#else
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & ROCM_SPACE_ATOMIC_MASK;
return (0 == hc::atomic_compare_exchange(&rocm_space_atomic_locks[offset],0,1));
#endif
}
KOKKOS_INLINE_FUNCTION
void unlock_address_rocm_space(void* ptr) {
#if 0
Kokkos::Impl::unlock_address_host_space(ptr) ;
#else
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & ROCM_SPACE_ATOMIC_MASK;
hc::atomic_exchange( &rocm_space_atomic_locks[ offset ], 0);
#endif
}
}
} // namespace Kokkos
#endif
#endif /* #ifndef KOKKOS_ROCMEXEC_HPP */

View File

@ -0,0 +1,137 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCMEXEC_HPP
#define KOKKOS_ROCMEXEC_HPP
#include <algorithm>
#include <typeinfo>
#if defined(__HCC_ACCELERATOR__)
#define printf(...)
#endif
namespace Kokkos {
namespace Impl {
struct ROCmTraits {
// TODO: determine if needed
enum { WavefrontSize = 64 /* 64 */ };
enum { WorkgroupSize = 64 /* 64 */ };
enum { WavefrontIndexMask = 0x003f /* Mask for lane index within a 64-wide wavefront */ };
enum { WavefrontIndexShift = 6 /* WavefrontSize == 1 << WavefrontIndexShift */ };
enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ };
enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ };
enum { UpperBoundExtentCount = 65535 /* Hard upper bound */ };
#if 0
KOKKOS_INLINE_FUNCTION static
ROCmSpace::size_type wavefront_count( ROCmSpace::size_type i )
{ return ( i + WavefrontIndexMask ) >> WavefrontIndexShift ; }
KOKKOS_INLINE_FUNCTION static
ROCmSpace::size_type wavefront_align( ROCmSpace::size_type i )
{
enum { Mask = ~ROCmSpace::size_type( WavefrontIndexMask ) };
return ( i + WavefrontIndexMask ) & Mask ;
}
#endif
};
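// Illustration only: typical use of the mask/shift pair, rounding a thread
// count up to whole wavefronts (mirrors the #if 0 helpers inside the struct):
#if 0
KOKKOS_INLINE_FUNCTION static
size_t demo_wavefront_count( size_t i )
{ return ( i + ROCmTraits::WavefrontIndexMask ) >> ROCmTraits::WavefrontIndexShift ; }
#endif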
size_t rocm_internal_cu_count();
size_t rocm_internal_maximum_workgroup_count();
size_t * rocm_internal_scratch_flags( const size_t size );
size_t * rocm_internal_scratch_space( const size_t size );
}
} // namespace Kokkos
#define ROCM_SPACE_ATOMIC_MASK 0x1FFFF
#define ROCM_SPACE_ATOMIC_XOR_MASK 0x15A39
//int rocm_space_atomic_locks[ROCM_SPACE_ATOMIC_MASK+1];
extern int
*rocm_space_atomic_locks;
namespace Kokkos {
namespace Impl {
void init_lock_arrays_rocm_space();
void* rocm_resize_scratch_space(size_t bytes, bool force_shrink = false);
// TODO: determine if needed
KOKKOS_INLINE_FUNCTION
bool lock_address_rocm_space(void* ptr) {
#if 0
return(Kokkos::Impl::lock_address_host_space(ptr));
#else
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & ROCM_SPACE_ATOMIC_MASK;
return (0 == hc::atomic_compare_exchange(&rocm_space_atomic_locks[offset],0,1));
#endif
}
KOKKOS_INLINE_FUNCTION
void unlock_address_rocm_space(void* ptr) {
#if 0
Kokkos::Impl::unlock_address_host_space(ptr) ;
#else
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & ROCM_SPACE_ATOMIC_MASK;
hc::atomic_exchange( &rocm_space_atomic_locks[ offset ], 0);
#endif
}
}
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
//extern
//KOKKOS_INLINE_FUNCTION
//void init_lock_arrays_rocm_space();
}
} // namespace Kokkos
#endif /* #ifndef KOKKOS_ROCMEXEC_HPP */

View File

@ -0,0 +1,753 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/*--------------------------------------------------------------------------*/
/* Kokkos interfaces */
#include <Kokkos_Core.hpp>
/* only compile this file if ROCM is enabled for Kokkos */
#ifdef KOKKOS_ENABLE_ROCM
//#include <ROCm/Kokkos_ROCm_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Kokkos_ROCmSpace.hpp>
#include <ROCm/Kokkos_ROCm_Exec.hpp>
/*--------------------------------------------------------------------------*/
/* Standard 'C' libraries */
#include <stdlib.h>
/* Standard 'C++' libraries */
#include <vector>
#include <iostream>
#include <sstream>
#include <string>
//KOKKOS_INLINE_FUNCTION
// Kokkos::Impl::ROCmLockArraysStruct kokkos_impl_rocm_lock_arrays ;
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
#if 0
namespace {
__global__
void query_rocm_kernel_arch( int * d_arch )
{
#if defined( __HCC_ACCELERATOR__ )
*d_arch = __ROCM_ARCH__ ;
#else
*d_arch = 0 ;
#endif
}
/** Query what compute capability is actually launched to the device: */
int rocm_kernel_arch()
{
int * d_arch = 0 ;
rocmMalloc( (void **) & d_arch , sizeof(int) );
query_rocm_kernel_arch<<<1,1>>>( d_arch );
int arch = 0 ;
rocmMemcpy( & arch , d_arch , sizeof(int) , rocmMemcpyDefault );
rocmFree( d_arch );
return arch ;
}
bool rocm_launch_blocking()
{
const char * env = getenv("ROCM_LAUNCH_BLOCKING");
if (env == 0) return false;
return atoi(env);
}
}
#endif
// true device memory allocation, not visible from host
void * rocm_device_allocate(int size)
{
void * ptr;
hc::accelerator acc;
ptr = hc::am_alloc(size,acc,0);
return ptr;
}
// host pinned allocation
// flag = 1, non-coherent, host resident, but with gpu address space pointer
// flag = 2, coherent, host resident, but with host address space pointer
void * rocm_hostpinned_allocate(int size)
{
void * ptr;
hc::accelerator acc;
ptr = hc::am_alloc(size,acc,2);
return ptr;
}
// same free used by all rocm memory allocations
void rocm_device_free(void * ptr)
{
hc::am_free(ptr);
}
KOKKOS_INLINE_FUNCTION
void rocm_device_synchronize()
{
hc::accelerator_view av = hc::accelerator().get_default_view();
hc::completion_future fut = av.create_marker();
fut.wait();
}
void rocm_internal_error_throw( const char * name, const char * file, const int line )
{
#if 0
std::ostringstream out ;
out << name << " error( " << rocmGetErrorName(e) << "): " << rocmGetErrorString(e);
if (file) {
out << " " << file << ":" << line;
}
throw_runtime_exception( out.str() );
#endif
}
//----------------------------------------------------------------------------
// Some significant rocm device properties:
//
// rocmDeviceProp::name : Text label for device
// rocmDeviceProp::major : Device major number
// rocmDeviceProp::minor : Device minor number
// rocmDeviceProp::workgroupSize : number of threads per workgroup
// rocmDeviceProp::multiProcessorCount : number of multiprocessors
// rocmDeviceProp::sharedMemPerWavefront : capacity of shared memory per wavefront
// rocmDeviceProp::totalConstMem : capacity of constant memory
// rocmDeviceProp::totalGlobalMem : capacity of global memory
// rocmDeviceProp::maxGridSize[3] : maximum grid size
//
//
// the data we have available from a ROCm accelerator
// std::wstring get_device_path()
// std::wstring get_description()
// unsigned int get_version()
// bool get_has_display()
// size_t get_dedicated_memory()
// bool get_supports_double_precision()
// bool get_supports_limited_double_precision()
// bool get_is_debug()
// bool get_supports_cpu_shared_memory()
// size_t get_max_tile_static_size()
// unsigned int get_cu_count()
// bool has_cpu_accessible_am()
struct rocmDeviceProp {
char name[256];
char description[256];
unsigned int version;
int device_type;
int device_ordinal;
int major;
int minor;
size_t totalGlobalMem;
size_t sharedMemPerWavefront;
int WavefrontSize;
int WorkgroupSize;
int MaxTileCount;
int maxThreadsPerWorkgroup;
int multiProcessorCount;
int canMapHostMemory;
bool APU;
};
void rocmGetDeviceProperties(struct rocmDeviceProp* devProp, int device)
{
std::wstring s;
int i,n;
hc::accelerator acc;
std::vector<hc::accelerator> accv = acc.get_all() ;
hc::accelerator a = accv[device];
s=a.get_device_path();
i = 0;
for(wchar_t c: s)
if((n=std::wctomb(&devProp->name[i],c))>0)
i+=n;
devProp->name[i] = '\0'; // wctomb does not NUL-terminate the buffer
/* assume a CPU */
devProp->version = a.get_version();
devProp->major = a.get_version()>>16; // for CPU, these are meaningless
devProp->minor = a.get_version()&0xff;
devProp->device_ordinal = 0;
/* is this an AMD graphics card */
if((devProp->name[0]=='g') && (devProp->name[1]=='f')
&& (devProp->name[2]=='x')) {
/* for AMD cards, the name has the format gfxMmmO */
devProp->device_type = ((devProp->name[3]-0x30)<<16)
+ ((devProp->name[4]-0x30)<<8)
+ (devProp->name[5]-0x30);
devProp->device_ordinal = devProp->name[6]-0x30;
devProp->major = devProp->name[3]-0x30;
devProp->minor = devProp->name[5]-0x30;
}
s=a.get_description();
i = 0;
for(wchar_t c: s)
if((n=std::wctomb(&devProp->description[i],c))>0)
i+=n;
devProp->description[i] = '\0';
devProp->totalGlobalMem = a.get_dedicated_memory();
devProp->sharedMemPerWavefront = a.get_max_tile_static_size();
devProp->WavefrontSize = 64;
devProp->WorkgroupSize = 256; // preferred
devProp->MaxTileCount = 409600; // as defined in /opt/rocm/hcc-lc/include/hsa_new.h
devProp->maxThreadsPerWorkgroup = 1024;
devProp->multiProcessorCount = a.get_cu_count();
devProp->canMapHostMemory = a.get_supports_cpu_shared_memory();
// Kaveri has 64KB L2 per CU, 16KB L1, 64KB Vector Regs/SIMD, or 128 regs/thread
// GCN has 64KB LDS per CU
//Kaveri APU is 7:0:0
//Carrizo APU is 8:0:1
devProp->APU = ((devProp->major==7)&&(devProp->minor==0)) ||
((devProp->major==8)&&(devProp->minor==1));
}
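// Illustration only: enumerating the visible accelerators with the query
// above (demo_list_devices is hypothetical, not part of this commit):
#if 0
void demo_list_devices()
{
  hc::accelerator acc;
  const int count = (int) acc.get_all().size();
  for ( int i = 0 ; i < count ; ++i ) {
    rocmDeviceProp prop ;
    rocmGetDeviceProperties( & prop , i );
    std::cout << i << " : " << prop.name
              << " CUs: " << prop.multiProcessorCount << std::endl ;
  }
}
#endif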
namespace {
class ROCmInternalDevices {
public:
enum { MAXIMUM_DEVICE_COUNT = 64 };
struct rocmDeviceProp m_rocmProp[ MAXIMUM_DEVICE_COUNT ] ;
int m_rocmDevCount ;
ROCmInternalDevices();
static const ROCmInternalDevices & singleton();
};
ROCmInternalDevices::ROCmInternalDevices()
{
hc::accelerator acc;
std::vector<hc::accelerator> accv = acc.get_all() ;
m_rocmDevCount = accv.size();
if(m_rocmDevCount > MAXIMUM_DEVICE_COUNT) {
Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
}
for ( int i = 0 ; i < m_rocmDevCount ; ++i ) {
rocmGetDeviceProperties( m_rocmProp + i , i );
}
}
const ROCmInternalDevices & ROCmInternalDevices::singleton()
{
static ROCmInternalDevices* self = nullptr;
if (!self) {
self = new ROCmInternalDevices();
}
return *self;
}
}
//----------------------------------------------------------------------------
class ROCmInternal {
private:
ROCmInternal( const ROCmInternal & );
ROCmInternal & operator = ( const ROCmInternal & );
public:
typedef Kokkos::Experimental::ROCm::size_type size_type ;
int m_rocmDev ;
int m_rocmArch ;
unsigned m_multiProcCount ;
unsigned m_maxWorkgroup ;
unsigned m_maxSharedWords ;
size_type m_scratchSpaceCount ;
size_type m_scratchFlagsCount ;
size_type * m_scratchSpace ;
size_type * m_scratchFlags ;
static int was_finalized;
static ROCmInternal & singleton();
int verify_is_initialized( const char * const label ) const ;
int is_initialized() const
{ return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
void initialize( int rocm_device_id );
void finalize();
void print_configuration( std::ostream & ) const ;
~ROCmInternal();
ROCmInternal()
: m_rocmDev( -1 )
, m_rocmArch( -1 )
, m_multiProcCount( 0 )
, m_maxWorkgroup( 0 )
, m_maxSharedWords( 0 )
, m_scratchSpaceCount( 0 )
, m_scratchFlagsCount( 0 )
, m_scratchSpace( 0 )
, m_scratchFlags( 0 )
{}
size_type * scratch_space( const size_type size );
size_type * scratch_flags( const size_type size );
};
int ROCmInternal::was_finalized = 0;
//----------------------------------------------------------------------------
void ROCmInternal::print_configuration( std::ostream & s ) const
{
const ROCmInternalDevices & dev_info = ROCmInternalDevices::singleton();
#if defined( KOKKOS_ENABLE_ROCM )
s << "macro KOKKOS_ENABLE_ROCM : defined" << std::endl ;
#endif
#if defined( __hcc_version__ )
s << "macro __hcc_version__ = " << __hcc_version__
<< std::endl ;
#endif
for ( int i = 0 ; i < dev_info.m_rocmDevCount ; ++i ) {
s << "Kokkos::Experimental::ROCm[ " << i << " ] "
<< dev_info.m_rocmProp[i].name
<< " version " << (dev_info.m_rocmProp[i].major) << "." << dev_info.m_rocmProp[i].minor
<< ", Total Global Memory: " << human_memory_size(dev_info.m_rocmProp[i].totalGlobalMem)
<< ", Shared Memory per Wavefront: " << human_memory_size(dev_info.m_rocmProp[i].sharedMemPerWavefront);
if ( m_rocmDev == i ) s << " : Selected" ;
s << std::endl ;
}
}
//----------------------------------------------------------------------------
ROCmInternal::~ROCmInternal()
{
if ( m_scratchSpace ||
m_scratchFlags ) {
std::cerr << "Kokkos::Experimental::ROCm ERROR: Failed to call Kokkos::Experimental::ROCm::finalize()"
<< std::endl ;
std::cerr.flush();
}
m_rocmDev = -1 ;
m_rocmArch = -1 ;
m_multiProcCount = 0 ;
m_maxWorkgroup = 0 ;
m_maxSharedWords = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
}
int ROCmInternal::verify_is_initialized( const char * const label ) const
{
if ( m_rocmDev < 0 ) {
std::cerr << "Kokkos::Experimental::ROCm::" << label << " : ERROR device not initialized" << std::endl ;
}
return 0 <= m_rocmDev ;
}
ROCmInternal & ROCmInternal::singleton()
{
static ROCmInternal* self = nullptr ;
if (!self) {
self = new ROCmInternal();
}
return *self ;
}
void ROCmInternal::initialize( int rocm_device_id )
{
if ( was_finalized ) Kokkos::abort("Calling ROCm::initialize after ROCm::finalize is illegal\n");
if ( is_initialized() ) return;
enum { WordSize = sizeof(size_type) };
if ( ! HostSpace::execution_space::is_initialized() ) {
const std::string msg("ROCm::initialize ERROR : HostSpace::execution_space is not initialized");
throw_runtime_exception( msg );
}
const ROCmInternalDevices & dev_info = ROCmInternalDevices::singleton();
const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ;
const bool ok_id = 1 <= rocm_device_id &&
rocm_device_id < dev_info.m_rocmDevCount ;
// Need at least a GPU device
const bool ok_dev = ok_id &&
( 1 <= dev_info.m_rocmProp[ rocm_device_id ].major &&
0 <= dev_info.m_rocmProp[ rocm_device_id ].minor );
if ( ok_init && ok_dev ) {
const struct rocmDeviceProp & rocmProp =
dev_info.m_rocmProp[ rocm_device_id ];
m_rocmDev = rocm_device_id ;
// rocmSetDevice( m_rocmDev ) );
Kokkos::Impl::rocm_device_synchronize();
/*
// Query what compute capability architecture a kernel executes:
m_rocmArch = rocm_kernel_arch();
if ( m_rocmArch != rocmProp.major * 100 + rocmProp.minor * 10 ) {
std::cerr << "Kokkos::Experimental::ROCm::initialize WARNING: running kernels compiled for compute capability "
<< ( m_rocmArch / 100 ) << "." << ( ( m_rocmArch % 100 ) / 10 )
<< " on device with compute capability "
<< rocmProp.major << "." << rocmProp.minor
<< " , this will likely reduce potential performance."
<< std::endl ;
}
*/
// number of multiprocessors
m_multiProcCount = rocmProp.multiProcessorCount ;
//----------------------------------
// Maximum number of wavefronts,
// at most one workgroup per thread in a workgroup for reduction.
m_maxSharedWords = rocmProp.sharedMemPerWavefront/ WordSize ;
//----------------------------------
// Maximum number of Workgroups:
m_maxWorkgroup = 5*rocmProp.multiProcessorCount; //TODO: confirm usage and value
//----------------------------------
// Multiblock reduction uses scratch flags for counters
// and scratch space for partial reduction values.
// Allocate some initial space. This will grow as needed.
{
const unsigned reduce_block_count = m_maxWorkgroup * Impl::ROCmTraits::WorkgroupSize ;
(void) scratch_flags( reduce_block_count * 2 * sizeof(size_type) );
(void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
}
//----------------------------------
}
else {
std::ostringstream msg ;
msg << "Kokkos::Experimental::ROCm::initialize(" << rocm_device_id << ") FAILED" ;
if ( ! ok_init ) {
msg << " : Already initialized" ;
}
if ( ! ok_id ) {
msg << " : Device identifier out of range "
<< "[0.." << (dev_info.m_rocmDevCount-1) << "]" ;
}
else if ( ! ok_dev ) {
msg << " : Device " ;
msg << dev_info.m_rocmProp[ rocm_device_id ].major ;
msg << "." ;
msg << dev_info.m_rocmProp[ rocm_device_id ].minor ;
msg << " Need at least a GPU" ;
msg << std::endl;
}
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
// Init the array for used for arbitrarily sized atomics
Kokkos::Impl::init_lock_arrays_rocm_space();
// Kokkos::Impl::ROCmLockArraysStruct locks;
// locks.atomic = atomic_lock_array_rocm_space_ptr(false);
// locks.scratch = scratch_lock_array_rocm_space_ptr(false);
// locks.threadid = threadid_lock_array_rocm_space_ptr(false);
// rocmMemcpyToSymbol( kokkos_impl_rocm_lock_arrays , & locks , sizeof(ROCmLockArraysStruct) );
}
//----------------------------------------------------------------------------
typedef Kokkos::Experimental::ROCm::size_type ScratchGrain[ Impl::ROCmTraits::WorkgroupSize ] ;
enum { sizeScratchGrain = sizeof(ScratchGrain) };
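// One ScratchGrain is one size_type per workgroup lane: with WorkgroupSize = 64
// and, for example, an 8-byte size_type, a grain is 512 bytes, so a request of
// 1000 bytes rounds up to (1000 + 511) / 512 = 2 grains in scratch_flags /
// scratch_space below.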
void rocmMemset( Kokkos::Experimental::ROCm::size_type * ptr , Kokkos::Experimental::ROCm::size_type value , Kokkos::Experimental::ROCm::size_type size)
{
char * mptr = (char * ) ptr;
#if 0
parallel_for_each(hc::extent<1>(size),
[=, &ptr]
(hc::index<1> idx) __HC__
{
int i = idx[0];
ptr[i] = value;
}).wait();
#else
for (int i= 0; i<size ; i++)
{
mptr[i] = (char) value;
}
#endif
}
Kokkos::Experimental::ROCm::size_type *
ROCmInternal::scratch_flags( const Kokkos::Experimental::ROCm::size_type size )
{
if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::HostSpace()
, "InternalScratchFlags"
, ( sizeScratchGrain * m_scratchFlagsCount ) );
Record::increment( r );
m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
rocmMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain );
}
return m_scratchFlags ;
}
Kokkos::Experimental::ROCm::size_type *
ROCmInternal::scratch_space( const Kokkos::Experimental::ROCm::size_type size )
{
if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::HostSpace()
, "InternalScratchSpace"
, ( sizeScratchGrain * m_scratchSpaceCount ) );
Record::increment( r );
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
}
return m_scratchSpace ;
}
//----------------------------------------------------------------------------
void ROCmInternal::finalize()
{
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
// atomic_lock_array_rocm_space_ptr(false);
// scratch_lock_array_rocm_space_ptr(false);
// threadid_lock_array_rocm_space_ptr(false);
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< HostSpace > RecordROCm ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace > RecordHost ;
RecordROCm::decrement( RecordROCm::get_record( m_scratchFlags ) );
RecordROCm::decrement( RecordROCm::get_record( m_scratchSpace ) );
m_rocmDev = -1 ;
m_multiProcCount = 0 ;
m_maxWorkgroup = 0 ;
m_maxSharedWords = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
}
}
//----------------------------------------------------------------------------
Kokkos::Experimental::ROCm::size_type rocm_internal_cu_count()
{ return ROCmInternal::singleton().m_multiProcCount ; }
Kokkos::Experimental::ROCm::size_type rocm_internal_maximum_extent_size()
{ return ROCmInternal::singleton().m_maxWorkgroup ; }
Kokkos::Experimental::ROCm::size_type rocm_internal_maximum_shared_words()
{ return ROCmInternal::singleton().m_maxSharedWords ; }
Kokkos::Experimental::ROCm::size_type * rocm_internal_scratch_space( const Kokkos::Experimental::ROCm::size_type size )
{ return ROCmInternal::singleton().scratch_space( size ); }
Kokkos::Experimental::ROCm::size_type * rocm_internal_scratch_flags( const Kokkos::Experimental::ROCm::size_type size )
{ return ROCmInternal::singleton().scratch_flags( size ); }
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
//ROCm::size_type ROCm::detect_device_count()
//{ return Impl::ROCmInternalDevices::singleton().m_rocmDevCount ; }
int ROCm::concurrency() {
#if defined(KOKKOS_ARCH_KAVERI)
return 8*64*40; // 20480 kaveri
#else
return 32*64*40; // 81920 fiji and hawaii
#endif
}
int ROCm::is_initialized()
{ return Kokkos::Impl::ROCmInternal::singleton().is_initialized(); }
void ROCm::initialize( const ROCm::SelectDevice config )
{
Kokkos::Impl::ROCmInternal::singleton().initialize( config.rocm_device_id );
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
#if 0
std::vector<unsigned>
ROCm::detect_device_arch()
{
const Impl::ROCmInternalDevices & s = Impl::ROCmInternalDevices::singleton();
std::vector<unsigned> output( s.m_rocmDevCount );
for ( int i = 0 ; i < s.m_rocmDevCount ; ++i ) {
output[i] = s.m_rocmProp[i].major * 100 + s.m_rocmProp[i].minor ;
}
return output ;
}
ROCm::size_type ROCm::device_arch()
{
return 1 ;
}
#endif
void ROCm::finalize()
{
Kokkos::Impl::ROCmInternal::singleton().finalize();
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
ROCm::ROCm()
: m_device( Kokkos::Impl::ROCmInternal::singleton().m_rocmDev )
{
Kokkos::Impl::ROCmInternal::singleton().verify_is_initialized( "ROCm instance constructor" );
}
bool ROCm::isAPU(int device) {
const Kokkos::Impl::ROCmInternalDevices & dev_info =
Kokkos::Impl::ROCmInternalDevices::singleton();
return (dev_info.m_rocmProp[device].APU);
}
bool ROCm::isAPU() {
return ROCm::isAPU(rocm_device());
}
//ROCm::ROCm( const int instance_id )
// : m_device( Impl::ROCmInternal::singleton().m_rocmDev )
//{}
void ROCm::print_configuration( std::ostream & s , const bool )
{ Kokkos::Impl::ROCmInternal::singleton().print_configuration( s ); }
bool ROCm::sleep() { return false ; }
bool ROCm::wake() { return true ; }
void ROCm::fence()
{
Kokkos::Impl::rocm_device_synchronize();
}
const char* ROCm::name() { return "ROCm"; }
} // namespace Experimental
} // namespace Kokkos
#endif // KOKKOS_ENABLE_ROCM
//----------------------------------------------------------------------------

View File

@ -0,0 +1,138 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <type_traits>
#include <Kokkos_Macros.hpp>
#if !defined( KOKKOS_ROCM_INVOKE_H )
#define KOKKOS_ROCM_INVOKE_H
namespace Kokkos {
namespace Impl {
template<class Tag, class F, class... Ts, typename std::enable_if<(!std::is_void<Tag>()), int>::type = 0>
KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs)
{
f(Tag(), static_cast<Ts&&>(xs)...);
}
template<class Tag, class F, class... Ts, typename std::enable_if<(std::is_void<Tag>()), int>::type = 0>
KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs)
{
f(static_cast<Ts&&>(xs)...);
}
template<class F, class Tag=void>
struct rocm_invoke_fn
{
F* f;
rocm_invoke_fn(F& f_) : f(&f_)
{}
template<class... Ts>
KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const
{
rocm_invoke<Tag>(*f, static_cast<Ts&&>(xs)...);
}
};
template<class Tag, class F>
KOKKOS_INLINE_FUNCTION rocm_invoke_fn<F, Tag> make_rocm_invoke_fn(F& f)
{
return {f};
}
template<class T>
KOKKOS_INLINE_FUNCTION T& rocm_unwrap(T& x)
{
return x;
}
template<class T>
KOKKOS_INLINE_FUNCTION T& rocm_unwrap(std::reference_wrapper<T> x)
{
return x;
}
template<class F, class T>
struct rocm_capture_fn
{
F f;
T data;
KOKKOS_INLINE_FUNCTION rocm_capture_fn(F f_, T x)
: f(f_), data(x)
{}
template<class... Ts>
KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const
{
f(rocm_unwrap(data), static_cast<Ts&&>(xs)...);
}
};
template<class F, class T>
KOKKOS_INLINE_FUNCTION rocm_capture_fn<F, T> rocm_capture(F f, T x)
{
return {f, x};
}
template<class F, class T, class U, class... Ts>
KOKKOS_INLINE_FUNCTION auto rocm_capture(F f, T x, U y, Ts... xs) -> decltype(rocm_capture(rocm_capture(f, x), y, xs...))
{
return rocm_capture(rocm_capture(f, x), y, xs...);
}
struct rocm_apply_op
{
template<class F, class... Ts>
KOKKOS_INLINE_FUNCTION void operator()(F&& f, Ts&&... xs) const
{
f(static_cast<Ts&&>(xs)...);
}
};
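// Illustration only: composing the helpers above. rocm_capture pre-binds
// leading arguments one at a time, and rocm_invoke inserts the Tag argument
// only when Tag is non-void. DemoTag and DemoFunctor are hypothetical:
#if 0
struct DemoTag {};
struct DemoFunctor {
  KOKKOS_INLINE_FUNCTION void operator()(DemoTag, int i, int j) const {}
};
KOKKOS_INLINE_FUNCTION void demo_invoke() {
  DemoFunctor f;
  auto g = make_rocm_invoke_fn<DemoTag>(f); // prepends DemoTag()
  auto h = rocm_capture(g, 1);              // binds i = 1
  h(2);                                     // calls f(DemoTag(), 1, 2)
}
#endif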
}}
#endif

View File

@ -0,0 +1,72 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if !defined( KOKKOS_ROCM_JOIN_H )
#define KOKKOS_ROCM_JOIN_H
namespace Kokkos {
namespace Impl {
// Adaptor to use ValueJoin with standard algorithms
template<class Joiner, class F>
struct join_operator
{
const F* fp;
template<class T, class U>
T operator()(T x, const U& y) const
{
Joiner::join(*fp, &x, &y);
return x;
}
};
template<class Joiner, class F>
join_operator<Joiner, F> make_join_operator(const F& f)
{
return join_operator<Joiner, F>{&f};
}
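// Illustration only: with some ValueJoin-style 'Joiner' for a functor F, the
// adaptor plugs partial results into standard algorithms (assumes <numeric>
// is available; demo_accumulate is hypothetical):
#if 0
template<class Joiner, class F, class T>
T demo_accumulate(const F& f, const T* first, const T* last, T init)
{
  return std::accumulate(first, last, init, make_join_operator<Joiner>(f));
}
#endif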
}}
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,193 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
///////////////////////////////////////////////////////////////////////////////
// AMP REDUCE
//////////////////////////////////////////////////////////////////////////////
#if !defined( KOKKOS_ROCM_AMP_REDUCE_INL )
#define KOKKOS_ROCM_AMP_REDUCE_INL
#include <iostream>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <type_traits>
#include <ROCm/Kokkos_ROCm_Tile.hpp>
#include <ROCm/Kokkos_ROCm_Invoke.hpp>
#include <ROCm/Kokkos_ROCm_Join.hpp>
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
namespace Kokkos {
namespace Impl {
template<class T>
T* reduce_value(T* x, std::true_type) [[hc]]
{
return x;
}
template<class T>
T& reduce_value(T* x, std::false_type) [[hc]]
{
return *x;
}
#if KOKKOS_ROCM_HAS_WORKAROUNDS
struct always_true
{
template<class... Ts>
bool operator()(Ts&&...) const
{
return true;
}
};
#endif
template< class Tag, class F, class ReducerType, class Invoker, class T >
void reduce_enqueue(
const int szElements, // size of the extent
const F & f,
const ReducerType& reducer,
Invoker invoke,
T * const output_result,
int const output_length,
const int team_size=64,
const int vector_size=1,
int const shared_size=0)
{
using namespace hc ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, F, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , Tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , Tag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , Tag > ValueJoin ;
typedef Kokkos::Impl::FunctorFinal< ReducerTypeFwd , Tag > ValueFinal ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
if (output_length < 1) return;
assert(output_result != nullptr);
const auto td = get_tile_desc<T>(szElements,output_length,team_size,vector_size, shared_size);
// allocate host and device memory for the results from each team
std::vector<T> result_cpu(td.num_tiles*output_length);
hc::array<T> result(td.num_tiles*output_length);
auto fut = tile_for<T[]>(td, [=,&result](hc::tiled_index<1> t_idx, tile_buffer<T[]> buffer) [[hc]]
{
const auto local = t_idx.local[0];
const auto global = t_idx.global[0];
const auto tile = t_idx.tile[0];
buffer.action_at(local, [&](T* state)
{
ValueInit::init(ReducerConditional::select(f, reducer), state);
invoke(make_rocm_invoke_fn<Tag>(f), t_idx, td, reduce_value(state, std::is_pointer<reference_type>()));
});
t_idx.barrier.wait();
// Reduce within a tile using multiple threads.
// even though buffer.size is always 64, the value 64 must be hard coded below
// due to a compiler bug
// for(std::size_t s = 1; s < buffer.size(); s *= 2)
for(std::size_t s = 1; s < 64; s *= 2)
{
const std::size_t index = 2 * s * local;
// if (index < buffer.size())
if (index < 64)
{
buffer.action_at(index, index + s, [&](T* x, T* y)
{
ValueJoin::join(ReducerConditional::select(f, reducer), x, y);
});
}
t_idx.barrier.wait();
}
// Store the tile result in the global memory.
if (local == 0)
{
#if KOKKOS_ROCM_HAS_WORKAROUNDS
// Workaround for assigning from LDS memory: std::copy should work
// directly
buffer.action_at(0, [&](T* x)
{
#if ROCM15
// new ROCM 15 address space changes aren't implemented in std algorithms yet
auto * src = reinterpret_cast<char *>(x);
auto * dest = reinterpret_cast<char *>(result.data()+tile*output_length);
for(int i=0; i<(int)(sizeof(T)*output_length); i++) dest[i] = src[i]; // copy all output_length values, matching the std::copy_if path below
#else
// Workaround: copy_if used to avoid memmove
std::copy_if(x, x+output_length, result.data()+tile*output_length, always_true{} );
#endif
});
#else
std::copy(buffer, buffer+output_length, result.data()+tile*output_length);
#endif
}
});
ValueInit::init(ReducerConditional::select(f, reducer), output_result);
fut.wait();
copy(result,result_cpu.data());
for(std::size_t i=0;i<td.num_tiles;i++)
ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
}
}} //end of namespace Kokkos::Impl
#endif /* #if !defined( KOKKOS_ROCM_AMP_REDUCE_INL ) */

View File

@ -0,0 +1,605 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCM_REDUCESCAN_HPP
#define KOKKOS_ROCM_REDUCESCAN_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if ROCM is enabled for Kokkos */
#if defined( __HCC__ ) && defined( KOKKOS_ENABLE_ROCM )
//#include <utility>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Error.hpp>
#include <ROCm/Kokkos_ROCm_Vectorization.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl( T & out , T const & in , int lane ,
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
*reinterpret_cast<int*>(&out) =
__shfl( *reinterpret_cast<int const *>(&in) , lane , width );
}
template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl( T & out , T const & in , int lane ,
typename std::enable_if
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
, int >::type width )
{
enum : int { N = sizeof(T) / sizeof(int) };
for ( int i = 0 ; i < N ; ++i ) {
reinterpret_cast<int*>(&out)[i] =
__shfl( reinterpret_cast<int const *>(&in)[i] , lane , width );
}
}
//----------------------------------------------------------------------------
template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl_down( T & out , T const & in , int delta ,
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
*reinterpret_cast<int*>(&out) =
__shfl_down( *reinterpret_cast<int const *>(&in) , delta , width );
}
template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl_down( T & out , T const & in , int delta ,
typename std::enable_if
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
, int >::type width )
{
enum : int { N = sizeof(T) / sizeof(int) };
for ( int i = 0 ; i < N ; ++i ) {
reinterpret_cast<int*>(&out)[i] =
__shfl_down( reinterpret_cast<int const *>(&in)[i] , delta , width );
}
}
//----------------------------------------------------------------------------
template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl_up( T & out , T const & in , int delta ,
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
*reinterpret_cast<int*>(&out) =
__shfl_up( *reinterpret_cast<int const *>(&in) , delta , width );
}
template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl_up( T & out , T const & in , int delta ,
typename std::enable_if
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
, int >::type width )
{
enum : int { N = sizeof(T) / sizeof(int) };
for ( int i = 0 ; i < N ; ++i ) {
reinterpret_cast<int*>(&out)[i] =
__shfl_up( reinterpret_cast<int const *>(&in)[i] , delta , width );
}
}
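// Illustration only: the overloads above move any type whose size is a
// multiple of 4 bytes through the lane-shuffle network one int at a time,
// e.g. a double (two int-sized shuffles under the hood):
#if 0
KOKKOS_INLINE_FUNCTION
double demo_shfl_double(double x, int lane) {
  double y;
  rocm_shfl(y, x, lane, 64);
  return y;
}
#endif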
#if 0
//----------------------------------------------------------------------------
/** \brief Reduce within a workgroup over team.vector_length(), the "vector" dimension.
*
* This will be called within a nested, intra-team parallel operation.
* Use shuffle operations to avoid conflicts with shared memory usage.
*
* Requires:
* team.vector_length() is power of 2
* team.vector_length() <= 32 (one workgroup)
*
* Cannot use "butterfly" pattern because floating point
* addition is non-associative. Therefore, must broadcast
* the final result.
*/
template< class Reducer >
KOKKOS_INLINE_FUNCTION
void rocm_intra_workgroup_vector_reduce( Reducer const & reducer )
{
static_assert(
std::is_reference< typename Reducer::reference_type >::value , "" );
if ( 1 < team.vector_length() ) {
typename Reducer::value_type tmp ;
for ( int i = team.vector_length() ; ( i >>= 1 ) ; ) {
rocm_shfl_down( tmp , reducer.reference() , i , team.vector_length() );
if ( team.vector_rank() < i ) { reducer.join( reducer.data() , & tmp ); }
}
// Broadcast from root "lane" to all other "lanes"
rocm_shfl( reducer.reference() , reducer.reference() , 0 , team.vector_length() );
}
}
/** \brief Inclusive scan over team.vector_length(), the "vector" dimension.
*
* This will be called within a nested, intra-team parallel operation.
* Use shuffle operations to avoid conflicts with shared memory usage.
*
* Algorithm is concurrent bottom-up reductions in triangular pattern
* where each ROCM thread is the root of a reduction tree from the
* zeroth ROCM thread to itself.
*
* Requires:
* team.vector_length() is power of 2
* team.vector_length() <= 32 (one workgroup)
*/
template< typename ValueType >
KOKKOS_INLINE_FUNCTION
void rocm_intra_workgroup_vector_inclusive_scan( ValueType & local )
{
ValueType tmp ;
// Bottom up:
// [t] += [t-1] if t >= 1
// [t] += [t-2] if t >= 2
// [t] += [t-4] if t >= 4
// ...
for ( int i = 1 ; i < team.vector_length() ; i <<= 1 ) {
rocm_shfl_up( tmp , local , i , team.vector_length() );
if ( i <= team.vector_rank() ) { local += tmp ; }
}
}
#endif
//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) threads with same team.team_rank() have same value
* (b) team.vector_length() == power of two
* (c) blockDim.z == 1
*/
template< class ValueType , class JoinOp>
KOKKOS_INLINE_FUNCTION
void rocm_intra_workgroup_reduction( const ROCmTeamMember& team,
ValueType& result,
const JoinOp& join) {
unsigned int shift = 1;
int max_active_thread = team.team_size();
//Reduce over values from threads with different team.team_rank()
while(team.vector_length() * shift < 32 ) {
const ValueType tmp = shfl_down(result, team.vector_length()*shift,32u);
//Only join if upper thread is active (this allows a non-power-of-two team.team_size())
if(team.team_rank() + shift < max_active_thread)
join(result , tmp);
shift*=2;
}
result = shfl(result,0,32);
}
template< class ValueType , class JoinOp>
KOKKOS_INLINE_FUNCTION
void rocm_inter_workgroup_reduction( const ROCmTeamMember& team,
ValueType& value,
const JoinOp& join) {
#define STEP_WIDTH 4
tile_static ValueType sh_result[256];
int max_active_thread = team.team_size();
ValueType* result = (ValueType*) & sh_result;
const unsigned step = 256 / team.vector_length();
unsigned shift = STEP_WIDTH;
const int id = team.team_rank()%step==0?team.team_rank()/step:65000;
if(id < STEP_WIDTH ) {
result[id] = value;
}
team.team_barrier();
while (shift<=max_active_thread/step) {
if(shift<=id && shift+STEP_WIDTH>id && team.vector_rank()==0) {
join(result[id%STEP_WIDTH],value);
}
team.team_barrier();
shift+=STEP_WIDTH;
}
value = result[0];
for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
join(value,result[i]);
}
#if 0
template< class ValueType , class JoinOp>
KOKKOS_INLINE_FUNCTION
void rocm_intra_block_reduction( ROCmTeamMember& team,
ValueType& value,
const JoinOp& join,
const int max_active_thread) {
rocm_intra_workgroup_reduction(team,value,join,max_active_thread);
rocm_inter_workgroup_reduction(team,value,join,max_active_thread);
}
template< class FunctorType , class JoinOp , class ArgTag = void >
KOKKOS_INLINE_FUNCTION
bool rocm_inter_block_reduction( ROCmTeamMember& team,
typename FunctorValueTraits< FunctorType , ArgTag >::reference_type value,
typename FunctorValueTraits< FunctorType , ArgTag >::reference_type neutral,
const JoinOp& join,
ROCm::size_type * const m_scratch_space,
typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
ROCm::size_type * const m_scratch_flags,
const int max_active_thread) {
#ifdef __ROCM_ARCH__
typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
//Do the intra-block reduction with shfl operations and static shared memory
rocm_intra_block_reduction(value,join,max_active_thread);
const unsigned id = team.team_rank()*team.vector_length() + team.vector_rank();
//One thread in the block writes block result to global scratch_memory
if(id == 0 ) {
pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
*global = value;
}
//One workgroup of last block performs inter block reduction through loading the block values from global scratch_memory
bool last_block = false;
team.team_barrier();
if ( id < 32 ) {
ROCm::size_type count;
//Figure out whether this is the last block
if(id == 0)
count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
count = Kokkos::shfl(count,0,32);
//Last block does the inter block reduction
if( count == gridDim.x - 1) {
//set flag back to zero
if(id == 0)
*m_scratch_flags = 0;
last_block = true;
value = neutral;
pointer_type const volatile global = (pointer_type) m_scratch_space ;
//Reduce all global values with splitting work over threads in one workgroup
const int step_size = team.vector_length()*team.team_size() < 32 ? team.vector_length()*team.team_size() : 32;
for(int i=id; i<gridDim.x; i+=step_size) {
value_type tmp = global[i];
join(value, tmp);
}
      //Perform shfl reductions within the workgroup; only join if the contribution is valid (allows gridDim.x to be non-power-of-two and < 32)
if (team.vector_length()*team.team_size() > 1) {
value_type tmp = Kokkos::shfl_down(value, 1,32);
if( id + 1 < gridDim.x )
join(value, tmp);
}
if (team.vector_length()*team.team_size() > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < gridDim.x )
join(value, tmp);
}
if (team.vector_length()*team.team_size() > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < gridDim.x )
join(value, tmp);
}
if (team.vector_length()*team.team_size() > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < gridDim.x )
join(value, tmp);
}
if (team.vector_length()*team.team_size() > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < gridDim.x )
join(value, tmp);
}
}
}
//The last block has in its thread=0 the global reduction value through "value"
return last_block;
#else
return true;
#endif
}
#endif
#if 0
//----------------------------------------------------------------------------
// See section B.17 of ROCm C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize shared memory and minimize L1 cache:
// rocmFuncSetCacheConfig(MyKernel, rocmFuncCachePreferShared );
// For 2.0 capability: 48 KB shared and 16 KB L1
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) team.team_size() is a power of two
* (b) team.team_size() <= 512
* (c) team.vector_length() == blockDim.z == 1
*/
template< bool DoScan , class FunctorType , class ArgTag >
KOKKOS_INLINE_FUNCTION
void rocm_intra_block_reduce_scan( const FunctorType & functor ,
const typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type base_data )
{
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
const unsigned value_count = ValueTraits::value_count( functor );
const unsigned BlockSizeMask = team.team_size() - 1 ;
// Must have power of two thread count
  if ( BlockSizeMask & team.team_size() ) { Kokkos::abort("ROCm::rocm_intra_block_reduce_scan requires power-of-two blockDim"); }
#define BLOCK_REDUCE_STEP( R , TD , S ) \
if ( ! ( R & ((1<<(S+1))-1) ) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S)) ); }
#define BLOCK_SCAN_STEP( TD , N , S ) \
if ( N == (1<<S) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S))); }
const unsigned rtid_intra = team.team_rank() ^ BlockSizeMask ;
const pointer_type tdata_intra = base_data + value_count * team.team_rank() ;
{ // Intra-workgroup reduction:
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
}
team.team_barrier(); // Wait for all workgroups to reduce
{ // Inter-workgroup reduce-scan by a single workgroup to avoid extra synchronizations
const unsigned rtid_inter = ( team.team_rank() ^ BlockSizeMask ) << ROCmTraits::WarpIndexShift ;
if ( rtid_inter < team.team_size() ) {
const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
if ( (1<<5) < BlockSizeMask ) { BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
if ( DoScan ) {
int n = ( rtid_inter & 32 ) ? 32 : (
( rtid_inter & 64 ) ? 64 : (
( rtid_inter & 128 ) ? 128 : (
( rtid_inter & 256 ) ? 256 : 0 )));
if ( ! ( rtid_inter + n < team.team_size() ) ) n = 0 ;
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,8)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,7)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,6)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,5)
}
}
}
team.team_barrier(); // Wait for inter-workgroup reduce-scan to complete
if ( DoScan ) {
int n = ( rtid_intra & 1 ) ? 1 : (
( rtid_intra & 2 ) ? 2 : (
( rtid_intra & 4 ) ? 4 : (
( rtid_intra & 8 ) ? 8 : (
( rtid_intra & 16 ) ? 16 : 0 ))));
if ( ! ( rtid_intra + n < team.team_size() ) ) n = 0 ;
#ifdef KOKKOS_IMPL_ROCM_CLANG_WORKAROUND
BLOCK_SCAN_STEP(tdata_intra,n,4) team.team_barrier();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,3) team.team_barrier();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,2) team.team_barrier();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,1) team.team_barrier();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,0) team.team_barrier();
#else
BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,0) __threadfence_block();
#endif
}
#undef BLOCK_SCAN_STEP
#undef BLOCK_REDUCE_STEP
}
//----------------------------------------------------------------------------
/**\brief Input value-per-thread starting at 'shared_data'.
* Reduction value at last thread's location.
*
* If 'DoScan' then write blocks' scan values and block-groups' scan values.
*
 *  Global reduce result is in the last thread's 'shared_data' location.
*/
template< bool DoScan , class FunctorType , class ArgTag >
KOKKOS_INLINE_FUNCTION
bool rocm_single_inter_block_reduce_scan( const FunctorType & functor ,
const ROCm::size_type block_id ,
const ROCm::size_type block_count ,
ROCm::size_type * const shared_data ,
ROCm::size_type * const global_data ,
ROCm::size_type * const global_flags )
{
typedef ROCm::size_type size_type ;
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ;
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename ValueTraits::value_type value_type ;
  // '__ffs' = position of the least significant bit set to 1.
  // 'team.team_size()' is guaranteed to be a power of two so this
  // is the integral shift value that can replace an integral divide,
  // e.g. team.team_size() == 64 gives BlockSizeShift == 6 and (i >> 6) == i / 64.
const unsigned BlockSizeShift = __ffs( team.team_size() ) - 1 ;
const unsigned BlockSizeMask = team.team_size() - 1 ;
// Must have power of two thread count
if ( BlockSizeMask & team.team_size() ) { Kokkos::abort("ROCm::rocm_single_inter_block_reduce_scan requires power-of-two blockDim"); }
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( functor ) / sizeof(size_type) );
// Reduce the accumulation for the entire block.
rocm_intra_block_reduce_scan<false,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
{
// Write accumulation total to global scratch space.
// Accumulation total is the last thread's data.
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
size_type * const global = global_data + word_count.value * block_id ;
#if (__ROCM_ARCH__ < 500)
for ( size_type i = team.team_rank() ; i < word_count.value ; i += team.team_size() ) { global[i] = shared[i] ; }
#else
for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
#endif
}
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
// If this block is not the last block to contribute to this group then the block is done.
team.team_barrier();
  const bool is_last_block =
    ! team.team_reduce( team.team_rank() ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) ,Impl::JoinAdd<value_type>());
if ( is_last_block ) {
const size_type b = ( long(block_count) * long(team.team_rank()) ) >> BlockSizeShift ;
const size_type e = ( long(block_count) * long( team.team_rank() + 1 ) ) >> BlockSizeShift ;
{
void * const shared_ptr = shared_data + word_count.value * team.team_rank() ;
reference_type shared_value = ValueInit::init( functor , shared_ptr );
for ( size_type i = b ; i < e ; ++i ) {
ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
}
}
rocm_intra_block_reduce_scan<DoScan,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
if ( DoScan ) {
size_type * const shared_value = shared_data + word_count.value * ( team.team_rank() ? team.team_rank() - 1 : team.team_size() );
if ( ! team.team_rank() ) { ValueInit::init( functor , shared_value ); }
// Join previous inclusive scan value to each member
for ( size_type i = b ; i < e ; ++i ) {
size_type * const global_value = global_data + word_count.value * i ;
ValueJoin::join( functor , shared_value , global_value );
ValueOps ::copy( functor , global_value , shared_value );
}
}
}
return is_last_block ;
}
// Size in bytes required for inter block reduce or scan
template< bool DoScan , class FunctorType , class ArgTag >
inline
unsigned rocm_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize )
{
return ( BlockSize + 2 ) * Impl::FunctorValueTraits< FunctorType , ArgTag >::value_size( functor );
}
#endif
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( __ROCMCC__ ) */
#endif /* KOKKOS_ROCM_REDUCESCAN_HPP */

View File

@ -0,0 +1,157 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <ROCm/Kokkos_ROCm_Invoke.hpp>
#include <ROCm/Kokkos_ROCm_Join.hpp>
namespace Kokkos {
namespace Impl {
template< class Tag, class F, class TransformIndex>
void scan_enqueue(
const int len,
const F & f,
TransformIndex transform_index)
{
typedef Kokkos::Impl::FunctorValueTraits< F, Tag> ValueTraits;
typedef Kokkos::Impl::FunctorValueInit< F, Tag> ValueInit;
typedef Kokkos::Impl::FunctorValueJoin< F, Tag> ValueJoin;
typedef Kokkos::Impl::FunctorValueOps< F, Tag> ValueOps;
typedef typename ValueTraits::value_type value_type;
typedef typename ValueTraits::pointer_type pointer_type;
typedef typename ValueTraits::reference_type reference_type;
const auto td = get_tile_desc<value_type>(len);
std::vector<value_type> result_cpu(td.num_tiles);
hc::array<value_type> result(td.num_tiles);
hc::array<value_type> scratch(len);
tile_for<value_type>(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]]
{
const auto local = t_idx.local[0];
const auto global = t_idx.global[0];
const auto tile = t_idx.tile[0];
// Join tile buffer elements
const auto join = [&](std::size_t i, std::size_t j)
{
buffer.action_at(i, j, [&](value_type& x, const value_type& y)
{
ValueJoin::join(f, &x, &y);
});
};
// Copy into tile
buffer.action_at(local, [&](value_type& state)
{
ValueInit::init(f, &state);
if (global < len) rocm_invoke<Tag>(f, transform_index(t_idx, td.tile_size, td.num_tiles), state, false);
});
t_idx.barrier.wait();
// Up sweep phase
for(std::size_t d=1;d<buffer.size();d*=2)
{
auto d2 = 2*d;
auto i = local*d2;
if(i<len)
{
auto j = i + d - 1;
auto k = i + d2 - 1;
// join(k, j); // no longer needed with ROCm 1.6
ValueJoin::join(f, &buffer[k], &buffer[j]);
}
}
t_idx.barrier.wait();
result[tile] = buffer[buffer.size()-1];
buffer[buffer.size()-1] = 0;
// Down sweep phase
for(std::size_t d=buffer.size()/2;d>0;d/=2)
{
auto d2 = 2*d;
auto i = local*d2;
if(i<len)
{
auto j = i + d - 1;
auto k = i + d2 - 1;
auto t = buffer[k];
// join(k, j); // no longer needed with ROCm 1.6
ValueJoin::join(f, &buffer[k], &buffer[j]);
buffer[j] = t;
}
t_idx.barrier.wait();
}
// Copy tiles into global memory
if (global < len) scratch[global] = buffer[local];
}).wait();
copy(result,result_cpu.data());
  // std::partial_sum was segfaulting here, even though this is CPU code:
  // if(td.num_tiles>1)
  //   std::partial_sum(result_cpu.data(), result_cpu.data()+(td.num_tiles-1)*sizeof(value_type), result_cpu.data(), make_join_operator<ValueJoin>(f));
  // Join the per-tile totals with an explicit loop instead.
for(int i=1; i<td.num_tiles; i++)
ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]);
copy(result_cpu.data(),result);
hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]]
{
// const auto local = t_idx.local[0];
const auto global = t_idx.global[0];
const auto tile = t_idx.tile[0];
if (global < len)
{
auto final_state = scratch[global];
      // ValueJoin::join was locking up (at least with ROCm 1.6), so use += directly.
      if (tile != 0) final_state += result[tile-1];
      // if (tile != 0) ValueJoin::join(f, &final_state, &result[tile-1]);
rocm_invoke<Tag>(f, transform_index(t_idx, td.tile_size, td.num_tiles), final_state, true);
}
}).wait();
}
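// The tile kernel above follows the classic Blelloch up-sweep/down-sweep
// pattern. A minimal CPU sketch of that pattern (illustrative only; assumes
// a power-of-two length and '+' as the join):
#if 0
inline void blelloch_exclusive_scan( int * data , std::size_t n )
{
  // Up-sweep: build partial sums in place.
  for ( std::size_t d = 1 ; d < n ; d *= 2 )
    for ( std::size_t i = 0 ; i + 2*d <= n ; i += 2*d )
      data[ i + 2*d - 1 ] += data[ i + d - 1 ];
  // Clear the total, then down-sweep to distribute exclusive prefixes.
  data[ n - 1 ] = 0 ;
  for ( std::size_t d = n/2 ; d > 0 ; d /= 2 )
    for ( std::size_t i = 0 ; i + 2*d <= n ; i += 2*d ) {
      const int t = data[ i + d - 1 ];
      data[ i + d - 1 ]     = data[ i + 2*d - 1 ];
      data[ i + 2*d - 1 ]  += t ;
    }
}
// e.g. {1,2,3,4} -> {0,1,3,6}
#endif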
} // namespace Impl
} // namespace Kokkos

View File

@ -0,0 +1,726 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <algorithm>
#include <atomic>
#include <Kokkos_Macros.hpp>
/* only compile this file if ROCM is enabled for Kokkos */
#ifdef KOKKOS_ENABLE_ROCM
#include <Kokkos_Core.hpp>
#include <Kokkos_ROCm.hpp>
#include <Kokkos_ROCmSpace.hpp>
#include <impl/Kokkos_Error.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
#define ROCM_SAFE_CALL
namespace Kokkos {
namespace Impl {
using namespace hc;
DeepCopy<Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<HostSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<HostSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
hc::completion_future DeepCopyAsyncROCm( void * dst , const void * src , size_t n) {
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
return(av.copy_async( src , dst , n));
}
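// Usage sketch (illustrative only; 'n' is a hypothetical extent): these
// specializations are what back Kokkos::deep_copy between ROCm and host
// memory, e.g.
#if 0
Kokkos::View< double * , Kokkos::Experimental::ROCmSpace > dev( "dev" , n );
Kokkos::View< double * , Kokkos::HostSpace >               host( "host" , n );
Kokkos::deep_copy( host , dev ); // dispatches to DeepCopy< HostSpace , ROCmSpace , ROCm >
#endif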
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
void Experimental::ROCmSpace::access_error()
{
const std::string msg("Kokkos::Experimental::ROCmSpace::access_error attempt to execute Experimental::ROCm function from non-ROCm space" );
Kokkos::Impl::throw_runtime_exception( msg );
}
void Experimental::ROCmSpace::access_error( const void * const )
{
const std::string msg("Kokkos::Experimental::ROCmSpace::access_error attempt to execute Experimental::ROCm function from non-ROCm space" );
Kokkos::Impl::throw_runtime_exception( msg );
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
ROCmSpace::ROCmSpace()
: m_device( ROCm().rocm_device() )
{
}
ROCmHostPinnedSpace::ROCmHostPinnedSpace()
{
}
void * ROCmSpace::allocate( const size_t arg_alloc_size ) const
{
void * ptr = Kokkos::Impl::rocm_device_allocate( arg_alloc_size );
return ptr ;
}
void * Experimental::ROCmHostPinnedSpace::allocate( const size_t arg_alloc_size ) const
{
void * ptr = Kokkos::Impl::rocm_hostpinned_allocate( arg_alloc_size );
return ptr ;
}
void ROCmSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
Kokkos::Impl::rocm_device_free(arg_alloc_ptr);
}
void Experimental::ROCmHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
Kokkos::Impl::rocm_device_free(arg_alloc_ptr);
}
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::s_root_record ;
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::s_root_record ;
std::string
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_label() const
{
SharedAllocationHeader header ;
Kokkos::Impl::DeepCopy< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace >( & header , RecordBase::head() , sizeof(SharedAllocationHeader) );
return std::string( header.m_label );
}
std::string
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::get_label() const
{
return std::string( RecordBase::head()->m_label );
}
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
allocate( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
}
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
allocate( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
}
void
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
delete static_cast<SharedAllocationRecord*>(arg_rec);
}
void
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
delete static_cast<SharedAllocationRecord*>(arg_rec);
}
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
~SharedAllocationRecord()
{
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
SharedAllocationHeader header ;
Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace>( & header , RecordBase::m_alloc_ptr , sizeof(SharedAllocationHeader) );
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::ROCmSpace::name()),header.m_label,
data(),size());
}
#endif
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
}
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
~SharedAllocationRecord()
{
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::ROCmHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label,
data(),size());
}
#endif
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
}
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
SharedAllocationRecord( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
)
// Pass through allocated [ SharedAllocationHeader , user_memory ]
// Pass through deallocation function
: SharedAllocationRecord< void , void >
( & SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::s_root_record
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
, sizeof(SharedAllocationHeader) + arg_alloc_size
, arg_dealloc
)
, m_space( arg_space )
{
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
#endif
SharedAllocationHeader header ;
// Fill in the Header information
header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
strncpy( header.m_label
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
// Copy to device memory
Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace>( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) );
}
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
SharedAllocationRecord( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
)
// Pass through allocated [ SharedAllocationHeader , user_memory ]
// Pass through deallocation function
: SharedAllocationRecord< void , void >
( & SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::s_root_record
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
, sizeof(SharedAllocationHeader) + arg_alloc_size
, arg_dealloc
)
, m_space( arg_space )
{
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
#endif
// Fill in the Header information, directly accessible via host pinned memory
RecordBase::m_alloc_ptr->m_record = this ;
strncpy( RecordBase::m_alloc_ptr->m_label
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
}
//----------------------------------------------------------------------------
void * SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
allocate_tracked( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_alloc_label
, const size_t arg_alloc_size )
{
if ( ! arg_alloc_size ) return (void *) 0 ;
SharedAllocationRecord * const r =
allocate( arg_space , arg_alloc_label , arg_alloc_size );
RecordBase::increment( r );
return r->data();
}
void SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
if ( arg_alloc_ptr != 0 ) {
SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
RecordBase::decrement( r );
}
}
void * SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size )
{
SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCmSpace>( r_new->data() , r_old->data()
, std::min( r_old->size() , r_new->size() ) );
RecordBase::increment( r_new );
RecordBase::decrement( r_old );
return r_new->data();
}
#if 0
void * SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
allocate_tracked( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_alloc_label
, const size_t arg_alloc_size )
{
if ( ! arg_alloc_size ) return (void *) 0 ;
SharedAllocationRecord * const r =
allocate( arg_space , arg_alloc_label , arg_alloc_size );
RecordBase::increment( r );
return r->data();
}
void SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
if ( arg_alloc_ptr != 0 ) {
SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
RecordBase::decrement( r );
}
}
void * SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size )
{
SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
Kokkos::Impl::DeepCopy<Experimental::ROCmHostPinnedSpace,Experimental::ROCmHostPinnedSpace>( r_new->data() , r_old->data()
, std::min( r_old->size() , r_new->size() ) );
RecordBase::increment( r_new );
RecordBase::decrement( r_old );
return r_new->data();
}
#endif
//----------------------------------------------------------------------------
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordBase = SharedAllocationRecord< void , void > ;
using RecordROCm = SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > ;
#if 0
// Copy the header from the allocation
Header head ;
Header const * const head_rocm = alloc_ptr ? Header::get_header( alloc_ptr ) : (Header*) 0 ;
if ( alloc_ptr ) {
Kokkos::Impl::DeepCopy<HostSpace,Experimental::ROCmSpace>( & head , head_rocm , sizeof(SharedAllocationHeader) );
}
RecordROCm * const record = alloc_ptr ? static_cast< RecordROCm * >( head.m_record ) : (RecordROCm *) 0 ;
if ( ! alloc_ptr || record->m_alloc_ptr != head_rocm ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_record ERROR" ) );
}
#else
// Iterate the list to search for the record among all allocations
// requires obtaining the root of the list and then locking the list.
RecordROCm * const record = static_cast< RecordROCm * >( RecordBase::find( & s_root_record , alloc_ptr ) );
if ( record == 0 ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_record ERROR" ) );
}
#endif
return record ;
}
#if 0
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordROCm = SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void > ;
Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::get_record ERROR" ) );
}
return static_cast< RecordROCm * >( h->m_record );
}
#endif
// Iterate records to print orphaned memory ...
void
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
print_records( std::ostream & s , const Kokkos::Experimental::ROCmSpace & space , bool detail )
{
SharedAllocationRecord< void , void > * r = & s_root_record ;
char buffer[256] ;
SharedAllocationHeader head ;
if ( detail ) {
do {
if ( r->m_alloc_ptr ) {
Kokkos::Impl::DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace>( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
}
else {
head.m_label[0] = 0 ;
}
//Formatting dependent on sizeof(uintptr_t)
const char * format_string;
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
format_string = "ROCm addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
}
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
format_string = "ROCm addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
}
snprintf( buffer , 256
, format_string
, reinterpret_cast<uintptr_t>( r )
, reinterpret_cast<uintptr_t>( r->m_prev )
, reinterpret_cast<uintptr_t>( r->m_next )
, reinterpret_cast<uintptr_t>( r->m_alloc_ptr )
, r->m_alloc_size
, r->m_count
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, head.m_label
);
std::cout << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
else {
do {
if ( r->m_alloc_ptr ) {
Kokkos::Impl::DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace>( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
//Formatting dependent on sizeof(uintptr_t)
const char * format_string;
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
format_string = "ROCm [ 0x%.12lx + %ld ] %s\n";
}
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
format_string = "ROCm [ 0x%.12llx + %ld ] %s\n";
}
snprintf( buffer , 256
, format_string
, reinterpret_cast< uintptr_t >( r->data() )
, r->size()
, head.m_label
);
}
else {
snprintf( buffer , 256 , "ROCm [ 0 + 0 ]\n" );
}
std::cout << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
}
#if 0
void
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
print_records( std::ostream & s , const Kokkos::Experimental::ROCmHostPinnedSpace & space , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "ROCmHostPinned" , & s_root_record , detail );
}
#endif
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace {
#if 0
KOKKOS_INLINE_FUNCTION void init_lock_array_kernel_atomic() {
unsigned i = tindex()*team_size() + lindex();
if(i<ROCM_SPACE_ATOMIC_MASK+1)
kokkos_impl_rocm_lock_arrays.atomic[i] = 0;
}
KOKKOS_INLINE_FUNCTION void init_lock_array_kernel_scratch_threadid(int N) {
unsigned i = tindex()*team_size() + lindex();
if(i<N) {
kokkos_impl_rocm_lock_arrays.scratch[i] = 0;
kokkos_impl_rocm_lock_arrays.threadid[i] = 0;
}
}
}
namespace Impl {
int* atomic_lock_array_rocm_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
rocmFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
rocmMalloc(&ptr,sizeof(int)*(ROCM_SPACE_ATOMIC_MASK+1));
return ptr;
}
int* scratch_lock_array_rocm_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
rocmFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
rocmMalloc(&ptr,sizeof(int)*(ROCm::concurrency()));
return ptr;
}
int* threadid_lock_array_rocm_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
rocmFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
rocmMalloc(&ptr,sizeof(int)*(ROCm::concurrency()));
return ptr;
}
void init_lock_arrays_rocm_space() {
static int is_initialized = 0;
if(! is_initialized) {
Kokkos::Impl::ROCmLockArraysStruct locks;
locks.atomic = atomic_lock_array_rocm_space_ptr(false);
locks.scratch = scratch_lock_array_rocm_space_ptr(false);
locks.threadid = threadid_lock_array_rocm_space_ptr(false);
am_copyToSymbol( kokkos_impl_rocm_lock_arrays , & locks , sizeof(ROCmLockArraysStruct) );
init_lock_array_kernel_atomic<<<(ROCM_SPACE_ATOMIC_MASK+255)/256,256>>>();
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Experimental::ROCm::concurrency()+255)/256,256>>>(Kokkos::Experimental::ROCm::concurrency());
}
}
#endif
void* rocm_resize_scratch_space(size_t bytes, bool force_shrink) {
static void* ptr = NULL;
static size_t current_size = 0;
if(current_size == 0) {
current_size = bytes;
ptr = Kokkos::kokkos_malloc<Kokkos::Experimental::ROCmSpace>("ROCmSpace::ScratchMemory",current_size);
}
if(bytes > current_size) {
current_size = bytes;
ptr = Kokkos::kokkos_realloc<Kokkos::Experimental::ROCmSpace>(ptr,current_size);
}
if((bytes < current_size) && (force_shrink)) {
current_size = bytes;
Kokkos::kokkos_free<Kokkos::Experimental::ROCmSpace>(ptr);
ptr = Kokkos::kokkos_malloc<Kokkos::Experimental::ROCmSpace>("ROCmSpace::ScratchMemory",current_size);
}
return ptr;
}
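// Usage sketch (illustrative only; 'n_blocks' and 'value_size' are
// hypothetical): callers grow the shared device scratch allocation on demand,
// e.g. before launching a reduction:
#if 0
void * const scratch = rocm_resize_scratch_space( n_blocks * value_size , false );
#endif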
}
}
#endif // KOKKOS_ENABLE_ROCM

View File

@ -0,0 +1,174 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_ENABLE_ROCM ) && defined( KOKKOS_ENABLE_TASKDAG )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::Experimental::ROCm > ;
//----------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
void TaskQueueSpecialization< Kokkos::Experimental::ROCm >::driver
( TaskQueueSpecialization< Kokkos::Experimental::ROCm >::queue_type * const queue,
hc::tiled_index<3> threadIdx )
{
using Member = TaskExec< Kokkos::Experimental::ROCm > ;
using Queue = TaskQueue< Kokkos::Experimental::ROCm > ;
using task_root_type = TaskBase< void , void , void > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec( 1, threadIdx );
Member team_exec( threadIdx.tile_dim[0], threadIdx );
const int wavefront_lane = threadIdx.local[0] + threadIdx.local[1]* threadIdx.tile_dim[0] ;
union {
task_root_type * ptr ;
int raw[2] ;
} task ;
// Loop until all queues are empty and no tasks in flight
do {
// Each team lead attempts to acquire either a thread team task
// or collection of single thread tasks for the team.
if ( 0 == wavefront_lane ) {
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
}
}
#if 0
printf("TaskQueue<ROCm>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
, uintptr_t(task.ptr));
#endif
}
// shuffle broadcast
task.raw[0] = hc::__shfl( task.raw[0] , 0 );
task.raw[1] = hc::__shfl( task.raw[1] , 0 );
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
if ( end != task.ptr ) {
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
// Thread Team Task
(*task.ptr->m_apply)( task.ptr , & team_exec );
}
else if ( 0 == threadIdx.local[1] ) {
// Single Thread Task
(*task.ptr->m_apply)( task.ptr , & single_exec );
}
if ( 0 == wavefront_lane ) {
queue->complete( task.ptr );
}
}
} while(1);
}
#if 0
namespace {
KOKKOS_INLINE_FUNCTION
void rocm_task_queue_execute( TaskQueue< Kokkos::Experimental::ROCm > * queue,
hc::tiled_index<3> threadIdx )
{ TaskQueueSpecialization< Kokkos::Experimental::ROCm >::driver( queue, threadIdx ); }
}
#endif
void TaskQueueSpecialization< Kokkos::Experimental::ROCm >::execute
( TaskQueue< Kokkos::Experimental::ROCm > * const queue )
{
const int workgroups_per_wavefront = 4 ;
const int wavefront_size = Kokkos::Impl::ROCmTraits::WavefrontSize ;
const int cu_count = Kokkos::Impl::rocm_internal_cu_count();
// const dim3 grid( Kokkos::Impl::rocm_internal_cu_count() , 1 , 1 );
// const dim3 block( 1 , Kokkos::Impl::ROCmTraits::WorkGroupSize , workgroups_per_wavefront );
// Query the stack size, in bytes:
// If not large enough then set the stack size, in bytes:
  // Adapted from the CUDA code. TODO: not at all sure that this is the proper
  // way to map the CUDA grid/blocks/3D tiling to HCC
#if 0
hc::extent< 3 > flat_extent( cu_count,
wavefront_size, workgroups_per_wavefront );
hc::tiled_extent< 3 > team_extent = flat_extent.tile(1,
wavefront_size,workgroups_per_wavefront);
hc::parallel_for_each( team_extent , [&](hc::tiled_index<3> idx) [[hc]]
{
TaskQueueSpecialization< Kokkos::Experimental::ROCm >::driver( queue,idx );
}).wait();
#endif
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_ROCM ) && defined( KOKKOS_ENABLE_TASKDAG ) */

View File

@ -0,0 +1,458 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_ROCM_TASK_HPP
#define KOKKOS_IMPL_ROCM_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKDAG )
#include <ROCm/Kokkos_ROCm_Vectorization.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class > class TaskExec ;
template<>
class TaskQueueSpecialization< Kokkos::Experimental::ROCm >
{
public:
using execution_space = Kokkos::Experimental::ROCm ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using member_type = TaskExec< execution_space > ;
// Must specify memory space
using memory_space = Kokkos::HostSpace ;
static
void iff_single_thread_recursive_execute( queue_type * const ) {}
KOKKOS_INLINE_FUNCTION
static void driver( queue_type * const, hc::tiled_index<3> );
// Must provide task queue execution function
static void execute( queue_type * const );
// Must provide mechanism to set function pointer in
// execution space from the host process.
template< typename FunctorType >
static
void proc_set_apply( typename TaskBase< Kokkos::Experimental::ROCm
, typename FunctorType::value_type
, FunctorType
>::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::Experimental::ROCm
, typename FunctorType::value_type
, FunctorType
> ;
hc::extent< 1 > flat_extent( 1 );
hc::tiled_extent< 1 > team_extent = flat_extent.tile( 1);
hc::parallel_for_each( team_extent , [&](hc::tiled_index<1> idx) [[hc]]
{
*ptr = TaskType::apply ;
}).wait();
}
};
/*template<>
KOKKOS_FUNCTION
void TaskQueue<Kokkos::Experimental::ROCm>::decrement( typename TaskQueue<Kokkos::Experimental::ROCm>::task_root_type *
) {}
*/
extern template class TaskQueue< Kokkos::Experimental::ROCm > ;
//----------------------------------------------------------------------------
/**\brief Impl::TaskExec<ROCm> is the TaskScheduler<ROCm>::member_type
* passed to tasks running in a ROCm space.
*
* ROCm thread blocks for tasking are dimensioned:
* idx.tile_dim[0] == vector length
* idx.tile_dim[1] == team size
* idx.tile_dim[2] == number of teams
* where
* idx.tile_dim[0] * idx.tile_dim[1] == WavefrontSize
*
 *   Both single thread and thread team tasks are run by a full ROCm wavefront.
 *   A single thread task is called by wavefront lane #0 and the remaining
 *   lanes of the wavefront are idle.
*/
template<>
class TaskExec< Kokkos::Experimental::ROCm >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
friend class Kokkos::Impl::TaskQueue< Kokkos::Experimental::ROCm > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Experimental::ROCm > ;
int m_team_size ;
hc::tiled_index<3> m_idx;
// KOKKOS_INLINE_FUNCTION TaskExec( int arg_team_size ) //TODO: tile_dim[0]
// : m_team_size( arg_team_size ) {}
KOKKOS_INLINE_FUNCTION TaskExec( int arg_team_size,
hc::tiled_index<3> tidx)
: m_team_size( arg_team_size),
m_idx( tidx ) {}
public:
// const auto local = t_idx.local[0];
// const auto global = t_idx.global[0];
// const auto tile = t_idx.tile[0];
hc::tiled_index<3> idx() const { return m_idx;}
#if defined( __HCC_ACCELERATOR__ )
KOKKOS_INLINE_FUNCTION void team_barrier() { /* __threadfence_block(); */ }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_idx.local[1] ; } // t_idx.tile[0];
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
#else
KOKKOS_INLINE_FUNCTION void team_barrier() {}
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return 0 ; }
#endif
};
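// A sketch of the tiling described in the comment block above (illustrative
// only; the concrete sizes are assumptions, with WavefrontSize taken as 64):
#if 0
constexpr int vector_length = 8 ; // idx.tile_dim[0]
constexpr int team_size     = 8 ; // idx.tile_dim[1]; vector_length * team_size == WavefrontSize
constexpr int num_teams     = 4 ; // idx.tile_dim[2]
hc::extent<3>       flat( vector_length , team_size , num_teams );
hc::tiled_extent<3> tiled = flat.tile( vector_length , team_size , num_teams );
#endif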
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >
TeamThreadRange
( Impl::TaskExec< Kokkos::Experimental::ROCm > & thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >(thread,count);
}
template<typename iType1, typename iType2>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::TaskExec< Kokkos::Experimental::ROCm > >
TeamThreadRange
( Impl:: TaskExec< Kokkos::Experimental::ROCm > & thread, const iType1 & begin, const iType2 & end )
{
typedef typename std::common_type<iType1, iType2>::type iType;
return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::TaskExec< Kokkos::Experimental::ROCm > >(thread, begin, end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >
ThreadVectorRange
( Impl::TaskExec< Kokkos::Experimental::ROCm > & thread
, const iType & count )
{
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >(thread,count);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
 *  The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries
, const Lambda& lambda
)
{
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
}
}
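// Usage sketch (illustrative only; 'member' and 'N' are placeholders for the
// TaskExec<ROCm>& handed to a task body and a hypothetical extent):
#if 0
Kokkos::parallel_for( Kokkos::TeamThreadRange( member , N ) ,
  [&]( const int i ) { /* each team thread handles a strided slice of 0..N-1 */ } );
#endif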
// reduce across corresponding lanes between team members within workgroup
// assume stride*team_size == workgroup_size
template< typename ValueType >
KOKKOS_INLINE_FUNCTION
void strided_shfl_workgroup_reduction
  (void f(ValueType&, const ValueType&),
ValueType& val,
int team_size,
int stride)
{
for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
f(val, Kokkos::shfl_down(val, lane_delta, team_size*stride));
}
}
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void strided_shfl_workgroup_reduction
(const JoinType& join,
ValueType& val,
int team_size,
int stride)
{
for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
join(val, shfl_down(val, lane_delta, team_size*stride));
}
}
// multiple within-workgroup non-strided reductions
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void multi_shfl_workgroup_reduction
(const JoinType& join,
ValueType& val,
int vec_length)
{
for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) {
join(val, shfl_down(val, lane_delta, vec_length));
}
}
// broadcast within workgroup
template< class ValueType >
KOKKOS_INLINE_FUNCTION
ValueType shfl_workgroup_broadcast
(ValueType& val,
int src_lane,
int width)
{
return shfl(val, src_lane, width);
}
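// Combined usage sketch (illustrative only; 'team_size' and 'stride' are
// hypothetical values): a strided wavefront sum followed by a broadcast so
// every lane sees the result, assuming team_size * stride == WavefrontSize:
#if 0
const int team_size = 8 , stride = 8 ; // team_size * stride == WavefrontSize (assumed 64)
double v = 1.0 ;                       // per-lane value
strided_shfl_workgroup_reduction(
  [] ( double & a , const double & b ) { a += b ; } , v , team_size , stride );
v = shfl_workgroup_broadcast( v , 0 , Impl::ROCmTraits::WavefrontSize );
#endif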
// all-reduce across corresponding vector lanes between team members within workgroup
// assume vec_length*team_size == workgroup_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template<typename iType, class Lambda, typename ValueType>
KOKKOS_INLINE_FUNCTION
void parallel_reduce
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries
, const Lambda& lambda
, ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
initialized_result = result;
strided_shfl_workgroup_reduction(
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
initialized_result,
loop_boundaries.thread.team_size(),
idx.tile_dim[0]);
initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, idx.local[0], Impl::ROCmTraits::WavefrontSize );
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
strided_shfl_workgroup_reduction<ValueType, JoinType>(
join,
initialized_result,
loop_boundaries.thread.team_size(),
idx.tile_dim[0]);
initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, idx.local[0], Impl::ROCmTraits::WavefrontSize );
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
ValueType result = initialized_result;
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
//initialized_result = multi_shfl_workgroup_reduction(
multi_shfl_workgroup_reduction(
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
initialized_result,
idx.tile_dim[0]);
initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, 0, idx.tile_dim[0] );
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
multi_shfl_workgroup_reduction<ValueType, JoinType>(join, initialized_result, idx.tile_dim[0]);
initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, 0, idx.tile_dim[0] );
}
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
const Lambda & lambda)
{
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
ValueType accum = 0 ;
ValueType val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
// INCLUSIVE scan of 'val' across team members, strided by the vector
// length idx.tile_dim[0]; 'accum' carries the running total from
// previous iterations of the outer loop
for( int offset = idx.tile_dim[0] ; offset < Impl::ROCmTraits::WavefrontSize ; offset <<= 1 ) {
y = shfl_up(val, offset, Impl::ROCmTraits::WavefrontSize);
if(idx.local[1]*idx.tile_dim[0] >= offset) { val += y; }
}
// broadcast the team total (last lane of the inclusive scan) to all threads
local_total = shfl_workgroup_broadcast<ValueType>(val,
idx.local[0]+Impl::ROCmTraits::WavefrontSize-idx.tile_dim[0],
Impl::ROCmTraits::WavefrontSize);
// make EXCLUSIVE scan by shifting values over one
val = shfl_up(val, idx.tile_dim[0], Impl::ROCmTraits::WavefrontSize);
if ( idx.local[1] == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
accum += local_total;
}
}
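// Illustrative contract sketch (hedged): with per-iteration
// contributions counts = {1,2,3,4}, an exclusive scan passes
// val = {0,1,3,6} on the final == true pass ('member', 'offsets' and
// 'counts' are hypothetical names):
//
//   parallel_scan( Kokkos::TeamThreadRange( member, 4 ),
//                  [&]( const int i, int & val, const bool final )
//                  {
//                    if ( final ) offsets(i) = val; // exclusive prefix
//                    val += counts(i);
//                  } );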
// scan across vector lanes within a single team member
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
const Lambda & lambda)
{
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
ValueType accum = 0 ;
ValueType val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
// INCLUSIVE scan of 'val' across the idx.tile_dim[0] vector lanes;
// 'accum' carries the running total from previous iterations of the
// outer loop
for( int offset = 1 ; offset < idx.tile_dim[0] ; offset <<= 1 ) {
y = shfl_up(val, offset, idx.tile_dim[0]);
if(idx.local[0] >= offset) { val += y; }
}
// broadcast the lane total (last lane of the inclusive scan) to all lanes
local_total = shfl_workgroup_broadcast<ValueType>(val, idx.tile_dim[0]-1,
idx.tile_dim[0]);
// make EXCLUSIVE scan by shifting values over one
val = shfl_up(val, 1, idx.tile_dim[0]);
if ( idx.local[0] == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
accum += local_total;
}
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_ROCM_TASK_HPP */

View File

@ -0,0 +1,518 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <hc.hpp>
#include <type_traits>
#include <vector>
#include <memory>
#include <ROCm/Kokkos_ROCm_Config.hpp>
#if !defined( KOKKOS_ROCM_TILE_H )
#define KOKKOS_ROCM_TILE_H
// Macro to abstract out the enable_if craziness
#define KOKKOS_ROCM_REQUIRES(...) \
bool KokkosROCmRequiresBool ## __LINE__ = true, typename std::enable_if<KokkosROCmRequiresBool ## __LINE__ && (__VA_ARGS__), int>::type = 0
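// Illustrative use of the macro (hypothetical function):
//
//   template< class T, KOKKOS_ROCM_REQUIRES( std::is_integral<T>::value ) >
//   T twice( T x ) { return x + x; }   // participates only for integral T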
// Compiler builds with a workweek newer than 17160 belong to the ROCm 1.5 release.
#if __hcc_workweek__ > 17160
#define ROCM15 1
#endif
namespace Kokkos {
namespace Impl {
template<class T>
#if defined(ROCM15)
using lds_t = T;
#else
// prior to 1.5, needed to decorate LDS addresses
using lds_t = __attribute__((address_space(3))) T;
#endif
#define KOKKOS_ROCM_TILE_RESTRIC_CPU restrict(cpu, amp)
// a set of routines to replace the std:: routines
// so that they operate on address-space-3 (LDS) types
#if defined(ROCM15)
// 1.5 can't use std::copy et al for LDS access, so we define our own
// set of routines
template<class I, class O>
void rcopy(I first, I last, O out) [[hc]]
{
while (first != last) *out++ = *first++;
}
template<class I,class F>
void rfor_each(I first, I last, F f) [[hc]]
{
for(;first!=last;++first) f(*first);
}
template<class I,class O,class F>
void rtransform(I first, I last, O out, F f) [[hc]]
{
while(first!=last) *out++ = f(*first++);
}
#endif
inline std::size_t get_max_tile_size() KOKKOS_ROCM_TILE_RESTRIC_CPU
{
return hc::accelerator().get_max_tile_static_size() - 1024;
}
inline std::size_t get_max_tile_thread() KOKKOS_ROCM_TILE_RESTRIC_CPU
{
return 64;
}
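// Round x up to the next power of two by smearing its highest set bit
// into all lower bits and adding one; e.g. next_pow_2(33) == 64,
// next_pow_2(64) == 64, and next_pow_2(0) == 0.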
inline int next_pow_2(int x) restrict(cpu, amp)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return x+1;
}
template<class T>
inline std::size_t get_tile_size(std::size_t n = 1,
std::size_t team = 64,
std::size_t vector = 1)
KOKKOS_ROCM_TILE_RESTRIC_CPU
{
const auto size = sizeof(T) * n;
const auto group_size = get_max_tile_size();
if (size == 0 || size > group_size) return 0;
// Assume that thread size is a power of 2
auto thread_size = std::min(team*vector,4*get_max_tile_thread());
// ensure that we have enough tile static memory to keep
// threadsize * size elements for reductions
while(size > (group_size / thread_size) && thread_size > 2)
{
thread_size /= 2;
}
return thread_size;
}
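// Worked example (hedged, assuming the common 64KB of LDS so that
// get_max_tile_size() == 64*1024 - 1024 == 64512): for T == double and
// n == 256 the per-thread footprint is 8*256 == 2048 bytes, and the
// starting thread_size is min(64*1, 4*64) == 64. The loop then halves
// thread_size twice (64512/64 == 1008 < 2048, 64512/32 == 2016 < 2048)
// and returns 16, since 64512/16 == 4032 >= 2048.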
template<class T>
struct array_view
{
T* x;
std::size_t n;
array_view(T* xp, std::size_t np) [[hc]] [[cpu]]
: x(xp), n(np)
{}
array_view(T* xp, T* yp) [[hc]] [[cpu]]
: x(xp), n(yp-xp)
{}
T& operator[](std::size_t i) const [[hc]] [[cpu]]
{
return x[i];
}
std::size_t size() const [[hc]] [[cpu]]
{
return this->n;
}
T* data() const [[hc]] [[cpu]]
{
return x;
}
T* begin() const [[hc]] [[cpu]]
{
return x;
}
T* end() const [[hc]] [[cpu]]
{
return x+this->size();
}
};
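// Illustrative usage sketch (hedged):
//
//   double buf[8];
//   array_view<double> v( buf, 8 );
//   for ( auto & x : v ) x = 0.0;   // begin()/end() enable range-for
//   assert( v.size() == 8 && v.data() == buf );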
template<class T>
struct rocm_char
{ using type=char; };
template<class T>
struct rocm_char<const T>
: std::add_const<typename rocm_char<T>::type>
{};
#if !defined(ROCM15)
// earlier compilers required explicit address space decorations
template<class T>
struct rocm_char<__attribute__((address_space(3))) T>
{ using type = __attribute__((address_space(3))) typename rocm_char<T>::type; };
template<class T>
struct rocm_char<const __attribute__((address_space(3))) T>
{ using type = const __attribute__((address_space(3))) typename rocm_char<T>::type; };
#endif
template<class T, class Char=typename rocm_char<T>::type>
Char* rocm_byte_cast(T& x) restrict(cpu, amp)
{
return reinterpret_cast<Char*>(&x);
}
template<class T, class U>
void rocm_raw_assign(T& x, const U& y) restrict(cpu, amp)
{
auto * src = rocm_byte_cast(y);
auto * dest = rocm_byte_cast(x);
#if defined (ROCM15)
rcopy(src, src+sizeof(T), dest);
#else
std::copy(src, src+sizeof(T), dest);
#endif
}
template<class T, class U>
void rocm_assign_impl(T& x, const U& y, std::true_type) restrict(cpu, amp)
{
rocm_raw_assign(x, y);
}
template<class T, class U>
void rocm_assign_impl(T& x, const U& y, std::false_type) restrict(cpu, amp)
{
x = y;
}
// Workaround for assigning in and out of LDS memory
template<class T, class U>
void rocm_assign(T& x, const U& y) restrict(cpu, amp)
{
rocm_assign_impl(x, y, std::integral_constant<bool, (
sizeof(T) == sizeof(U)
)>());
}
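// Illustrative usage sketch (hedged): copy between an LDS-qualified
// element and a plain register value without tripping over the address
// space mismatch ('buffer' and 'lane' are hypothetical):
//
//   lds_t<double> & shared = buffer[lane];
//   double local;
//   rocm_assign( local, shared );   // LDS -> register, bytewise
//   rocm_assign( shared, local );   // register -> LDS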
// Compute the address space of tile
template<class T>
struct tile_type
{
#if defined (ROCM15)
typedef T type;
#else
typedef __attribute__((address_space(3))) T type;
#endif
};
#if !defined (ROCM15)
template<class T, class Body>
void lds_for(__attribute__((address_space(3))) T& value, Body b) [[hc]]
{
T state = value;
b(state);
value = state;
}
#endif
template<class T, class Body>
void lds_for(T& value, Body b) [[hc]]
{
b(value);
}
constexpr std::size_t get_max_tile_array_size()
{
return 24;
}
template<class Derived, class T>
struct single_action
{
template<class Action>
void action_at(std::size_t i, Action a) [[hc]]
{
auto& value = static_cast<Derived&>(*this)[i];
#if KOKKOS_ROCM_HAS_WORKAROUNDS
T state = value;
a(state);
value = state;
#else
a(value);
#endif
}
template<class Action>
void action_at(std::size_t i, std::size_t j, Action a) [[hc]]
{
static_cast<Derived&>(*this).action_at(i, [&](T& x)
{
static_cast<Derived&>(*this).action_at(j, [&](T& y)
{
a(x, y);
});
});
}
};
template<class T>
struct tile_buffer
: array_view<typename tile_type<T>::type>, single_action<tile_buffer<T>, T>
{
typedef typename tile_type<T>::type element_type;
typedef array_view<element_type> base;
using base::base;
tile_buffer(element_type* xp, std::size_t np, std::size_t) [[hc]] [[cpu]]
: base(xp, np)
{}
tile_buffer(T* xp, T* yp, std::size_t) [[hc]] [[cpu]]
: base(xp, yp)
{}
};
template<class T>
struct tile_buffer<T[]>
{
typedef typename tile_type<T>::type element_type;
typedef typename tile_type<char>::type tchar_type;
element_type* element_data;
std::size_t n, m;
tile_buffer(element_type* xp, std::size_t np, std::size_t mp) [[hc]] [[cpu]]
: element_data(xp), n(np), m(mp)
{}
tile_buffer(element_type* xp, element_type* yp, std::size_t mp) [[hc]] [[cpu]]
: element_data(xp), n(yp-xp), m(mp)
{}
element_type* operator[](std::size_t i) const [[hc]] [[cpu]]
{
return element_data+i*m;
}
template<class Action, class Q = T>
typename Impl::enable_if< (sizeof(Q) <= 8) , void >::type
action_at(std::size_t i, Action a) [[hc]]
{
element_type* value = (*this)[i];
#if defined (ROCM15)
a(value);
#else
#if KOKKOS_ROCM_HAS_WORKAROUNDS
if (m > get_max_tile_array_size()) return;
T state[get_max_tile_array_size()];
// std::copy(value, value+m, state);
// Workaround for assigning from LDS memory
std::transform(value, value+m, state, [](element_type& x)
{
T result;
rocm_assign(result, x);
return result;
});
a(state);
std::copy(state, state+m, value);
#endif
#endif
}
template<class Action, class Q = T>
typename Impl::enable_if< !(sizeof(Q) <= 8) , void >::type
action_at(std::size_t i, Action a) [[hc]]
{
element_type* value = (*this)[i];
#if defined (ROCM15)
a(value);
#else
//#if KOKKOS_ROCM_HAS_WORKAROUNDS
if (m > get_max_tile_array_size()) return;
T state[get_max_tile_array_size()];
// std::copy(value, value+m, state);
// Workaround for assigning from LDS memory
std::transform(value, value+m, state, [](element_type& x)
{
T result;
rocm_assign(result, x);
return result;
});
a(state);
// this workaround is required when T is larger than 8 bytes
tile_static char tv[64*sizeof(T)];
size_t sT = sizeof(T);
for (int j = 0; j<sT; j++) tv[i*sT+j] = ((char *)state)[j];
for (int j = 0; j<sT; j++) ((tchar_type *)value)[j] = tv[i*sT+j];
#endif
}
template<class Action>
void action_at(std::size_t i, std::size_t j, Action a) [[hc]]
{
this->action_at(i, [&](T* x)
{
this->action_at(j, [&](T* y)
{
a(x, y);
});
});
}
std::size_t size() const [[hc]] [[cpu]]
{
return this->n;
}
element_type* data() const [[hc]] [[cpu]]
{
return element_data;
}
};
// Zero initialize LDS memory
struct zero_init_f
{
template<class T>
#if defined (ROCM15)
void operator()(T& x, std::size_t=1) const [[hc]]
{
auto * start = reinterpret_cast<char*>(&x);
for(int i=0; i<sizeof(T);i++) start[i] = 0;
rocm_raw_assign(x, T());
}
#else
void operator()(__attribute__((address_space(3))) T& x, std::size_t=1) const [[hc]]
{
auto * start = reinterpret_cast<__attribute__((address_space(3))) char*>(&x);
std::fill(start, start+sizeof(T), 0);
rocm_raw_assign(x, T());
}
#endif
template<class T>
#if defined (ROCM15)
void operator()(T* x, std::size_t size) const [[hc]]
{
rfor_each(x, x+size, *this);
}
#else
void operator()(__attribute__((address_space(3))) T* x, std::size_t size) const [[hc]]
{
std::for_each(x, x+size, *this);
}
#endif
};
static constexpr zero_init_f zero_init = {};
struct tile_desc
{
// Number of work items, or size of extent
std::size_t elements;
// number of threads in team
std::size_t team_size;
// vector length of team
std::size_t vector_length;
// Size of tile
std::size_t tile_size;
// Size of array
std::size_t array_size;
// Number of tiles
std::size_t num_tiles;
// Per team reserved LDS memory, used for reduction
std::size_t reduce_size;
// Per-team shared memory in LDS; this is in addition to the reduction scratch above
std::size_t shared_size;
std::size_t size;
};
template<class T>
tile_desc get_tile_desc(std::size_t size,
std::size_t array_size=1,
std::size_t team_size=64,
std::size_t vector_size=1,
std::size_t shared_size=0)
{
tile_desc result;
result.elements = size;
result.array_size = array_size;
result.vector_length = vector_size;
result.team_size = team_size;
result.tile_size = get_tile_size<T>(array_size,team_size,vector_size);
result.num_tiles = std::ceil(1.0 * size / result.tile_size);
result.reduce_size = result.tile_size * sizeof(T) * array_size;
result.shared_size = shared_size;
result.size = result.tile_size * result.num_tiles;
return result;
}
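// Worked example (hedged, again assuming 64KB of LDS): for
// get_tile_desc<double>( 10000 ) the defaults give tile_size == 64,
// num_tiles == ceil(10000/64) == 157, reduce_size == 64*8 == 512 bytes
// per team, and size == 157*64 == 10048 padded work items.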
template<class U, class F, class T=typename std::remove_extent<U>::type>
hc::completion_future tile_for(tile_desc td, F f)
{
assert(td.array_size <= get_max_tile_array_size() && "Exceeds max array size");
assert(((td.size % td.tile_size) == 0) && "Extent must be divisible by tile size");
auto grid = hc::extent<1>(td.size).tile_with_dynamic(
td.tile_size, td.reduce_size + td.shared_size);
// grid.set_dynamic_group_segment_size(td.reduce_size + td.shared_size);
return parallel_for_each(grid, [=](hc::tiled_index<1> t_idx) [[hc]]
{
#if defined (ROCM15)
typedef T group_t;
#else
typedef __attribute__((address_space(3))) T group_t;
#endif
group_t * buffer = (group_t *)hc::get_dynamic_group_segment_base_pointer();
tile_buffer<U> tb(buffer, td.tile_size, td.array_size);
zero_init(tb[t_idx.local[0]], td.array_size);
f(t_idx, tb);
});
}
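// Illustrative dispatch sketch (hedged; 'n' and the lambda body are
// hypothetical):
//
//   auto td = get_tile_desc<double>( n );
//   tile_for<double>( td, [=]( hc::tiled_index<1> t_idx,
//                              tile_buffer<double> & buf ) [[hc]]
//   {
//     // buf[t_idx.local[0]] is this thread's zero-initialized LDS slot
//   }).wait();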
}}
#endif

View File

@ -0,0 +1,346 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCM_VECTORIZATION_HPP
#define KOKKOS_ROCM_VECTORIZATION_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if ROCM is enabled for Kokkos */
#ifdef KOKKOS_ENABLE_ROCM
#include <Kokkos_ROCm.hpp>
namespace Kokkos {
using namespace hc;
// Shuffle only makes sense on >= Fiji GPUs; it doesn't work on CPUs
// or other GPUs. We provide a generic definition (which is trivial
// and doesn't do what it claims to do) because we don't actually use
// this function unless we are on a suitable GPU, with a suitable
// Scalar type. (For example, in the mat-vec, the "ThreadsPerRow"
// internal parameter depends both on the ExecutionSpace and the Scalar type,
// and it controls whether shfl_down() gets called.)
namespace Impl {
template< typename Scalar >
struct shfl_union {
enum {n = sizeof(Scalar)/4};
float fval[n];
KOKKOS_INLINE_FUNCTION
Scalar value() {
return *(Scalar*) fval;
}
KOKKOS_INLINE_FUNCTION
void operator= (Scalar& value_) {
float* const val_ptr = (float*) &value_;
for(int i=0; i<n ; i++) {
fval[i] = val_ptr[i];
}
}
KOKKOS_INLINE_FUNCTION
void operator= (const Scalar& value_) {
float* const val_ptr = (float*) &value_;
for(int i=0; i<n ; i++) {
fval[i] = val_ptr[i];
}
}
};
}
#ifdef __HCC_ACCELERATOR__
KOKKOS_INLINE_FUNCTION
int __long2loint(const long val ) {
union {
long l;
int i[2];
} u;
u.l = val;
return u.i[0];
}
KOKKOS_INLINE_FUNCTION
int __long2hiint(const long val ) {
union {
long l;
int i[2];
} u;
u.l = val;
return u.i[1];
}
KOKKOS_INLINE_FUNCTION
int __double2loint(const double val ) {
union {
double d;
int i[2];
} u;
u.d = val;
return u.i[0];
}
KOKKOS_INLINE_FUNCTION
int __double2hiint(const double val ) {
union {
double d;
int i[2];
} u;
u.d = val;
return u.i[1];
}
KOKKOS_INLINE_FUNCTION
long __hiloint2long(const int hi, const int lo ) {
union {
long l;
int i[2];
} u;
u.i[0] = lo;
u.i[1] = hi;
return u.l;
}
KOKKOS_INLINE_FUNCTION
double __hiloint2double(const int hi, const int lo ) {
union {
double d;
int i[2];
} u;
u.i[0] = lo;
u.i[1] = hi;
return u.d;
}
KOKKOS_INLINE_FUNCTION
int shfl(const int &val, const int& srcLane, const int& width ) {
return __shfl(val,srcLane,width);
}
KOKKOS_INLINE_FUNCTION
float shfl(const float &val, const int& srcLane, const int& width ) {
return __shfl(val,srcLane,width);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width
) {
Scalar tmp1 = val;
float tmp = *reinterpret_cast<float*>(&tmp1);
tmp = __shfl(tmp,srcLane,width);
return *reinterpret_cast<Scalar*>(&tmp);
}
KOKKOS_INLINE_FUNCTION
double shfl(const double &val, const int& srcLane, const int& width) {
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl(lo,srcLane,width);
hi = __shfl(hi,srcLane,width);
return __hiloint2double(hi,lo);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) {
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl(lo,srcLane,width);
hi = __shfl(hi,srcLane,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const Scalar*>(&tmp));
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) {
Impl::shfl_union<Scalar> s_val;
Impl::shfl_union<Scalar> r_val;
s_val = val;
for(int i = 0; i<s_val.n; i++)
r_val.fval[i] = __shfl(s_val.fval[i],srcLane,width);
return r_val.value();
}
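// Illustrative usage sketch (hedged): broadcast lane 0's value across a
// 64-wide wavefront ('val' is hypothetical):
//
//   double lane0 = shfl( val, 0, 64 );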
KOKKOS_INLINE_FUNCTION
int shfl_down(const int &val, const int& delta, const int& width) {
return __shfl_down(val,delta,width);
}
KOKKOS_INLINE_FUNCTION
float shfl_down(const float &val, const int& delta, const int& width) {
return __shfl_down(val,delta,width);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
Scalar tmp1 = val;
float tmp = *reinterpret_cast<float*>(&tmp1);
tmp = __shfl_down(tmp,delta,width);
return *reinterpret_cast<Scalar*>(&tmp);
}
KOKKOS_INLINE_FUNCTION
long shfl_down(const long &val, const int& delta, const int& width) {
int lo = __long2loint(val);
int hi = __long2hiint(val);
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
return __hiloint2long(hi,lo);
}
KOKKOS_INLINE_FUNCTION
double shfl_down(const double &val, const int& delta, const int& width) {
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
return __hiloint2double(hi,lo);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const Scalar*>(&tmp));
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
Impl::shfl_union<Scalar> s_val;
Impl::shfl_union<Scalar> r_val;
s_val = val;
for(int i = 0; i<s_val.n; i++)
r_val.fval[i] = __shfl_down(s_val.fval[i],delta,width);
return r_val.value();
}
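// Illustrative usage sketch (hedged): a classic wavefront sum via
// shfl_down, leaving the total in lane 0 ('val' is hypothetical):
//
//   double sum = val;
//   for ( int offset = 32 ; offset > 0 ; offset >>= 1 )
//     sum += shfl_down( sum, offset, 64 );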
KOKKOS_INLINE_FUNCTION
int shfl_up(const int &val, const int& delta, const int& width ) {
return __shfl_up(val,delta,width);
}
KOKKOS_INLINE_FUNCTION
float shfl_up(const float &val, const int& delta, const int& width ) {
return __shfl_up(val,delta,width);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
Scalar tmp1 = val;
float tmp = *reinterpret_cast<float*>(&tmp1);
tmp = __shfl_up(tmp,delta,width);
return *reinterpret_cast<Scalar*>(&tmp);
}
KOKKOS_INLINE_FUNCTION
double shfl_up(const double &val, const int& delta, const int& width ) {
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_up(lo,delta,width);
hi = __shfl_up(hi,delta,width);
return __hiloint2double(hi,lo);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl_up(lo,delta,width);
hi = __shfl_up(hi,delta,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const Scalar*>(&tmp));
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
Impl::shfl_union<Scalar> s_val;
Impl::shfl_union<Scalar> r_val;
s_val = val;
for(int i = 0; i<s_val.n; i++)
r_val.fval[i] = __shfl_up(s_val.fval[i],delta,width);
return r_val.value();
}
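// Illustrative usage sketch (hedged): an inclusive wavefront prefix sum
// via shfl_up ('val' and 'lane' are hypothetical):
//
//   for ( int offset = 1 ; offset < 64 ; offset <<= 1 ) {
//     double y = shfl_up( val, offset, 64 );
//     if ( lane >= offset ) val += y;
//   }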
#else
template<typename Scalar>
inline
Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<8.0.");
return val;
}
template<typename Scalar>
inline
Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<8.0.");
return val;
}
template<typename Scalar>
inline
Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<8.0.");
return val;
}
#endif
}
#endif // KOKKOS_ENABLE_ROCM
#endif

View File

@ -0,0 +1,367 @@
#pragma once
#include "hc.hpp"
#include <cmath>
// Math functions with integer overloads will be converted to
// this floating point type.
#define HC_IMPLICIT_FLOAT_CONV double
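// e.g. with the wrappers below, sqrt(4) resolves to the integral
// overload and computes sqrt(static_cast<double>(4)), returning double.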
#ifdef __KALMAR_ACCELERATOR__
#define HC_MATH_WRAPPER_1(function, arg1) \
template<typename T> \
inline T function(T arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1); \
}
#define KALMAR_MATH_WRAPPER_1(function, arg1) HC_MATH_WRAPPER_1(function, arg1)
#define HC_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) \
template<typename T> \
inline \
typename std::enable_if<std::is_integral<T>::value,HC_IMPLICIT_FLOAT_CONV>::type \
function(T arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
} \
template<typename T> \
inline \
typename std::enable_if<std::is_floating_point <T>::value,T>::type \
function(T arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1); \
}
#define KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) HC_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1)
#define HC_MATH_WRAPPER_2(function, arg1, arg2) \
template<typename T> \
inline T function(T arg1, T arg2) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1, arg2); \
}
#define HC_MATH_ALIAS_2(alias, function, arg1, arg2) \
template<typename T> \
inline T alias(T arg1, T arg2) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1, arg2); \
}
#define HC_MATH_WRAPPER_3(function, arg1, arg2, arg3) \
template<typename T> \
inline T function(T arg1, T arg2, T arg3) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1, arg2, arg3); \
}
#define HC_MATH_WRAPPER_TQ(function, arg1) \
template<typename T, typename Q> \
inline T function(Q arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1); \
}
#define HC_MATH_WRAPPER_FP_OVERLOAD_TQ(function, T, arg1) \
template<typename Q> \
inline \
typename std::enable_if<std::is_integral<Q>::value,T>::type \
function(Q arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
}\
template<typename Q> \
inline \
typename std::enable_if<std::is_floating_point<Q>::value,T>::type \
function(Q arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1); \
}
#define HC_MATH_WRAPPER_TTQ(function, arg1, arg2) \
template<typename T, typename Q> \
inline T function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1, arg2); \
}
#define HC_MATH_WRAPPER_FP_OVERLOAD_TTQ(function, arg1, arg2) \
template<typename T, typename Q> \
inline \
typename std::enable_if<std::is_integral<T>::value||std::is_integral<Q>::value,HC_IMPLICIT_FLOAT_CONV>::type \
function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1),static_cast<HC_IMPLICIT_FLOAT_CONV>(arg2)); \
}\
template<typename T, typename Q> \
inline \
typename std::enable_if<std::is_floating_point<T>::value&&std::is_floating_point<Q>::value,T>::type \
function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1,arg2); \
}
#define HC_MATH_WRAPPER_TTTQ(function, arg1, arg2, arg3) \
template<typename T, typename Q> \
inline T function(T arg1, T arg2, Q arg3) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1, arg2, arg3); \
}
#define HC_MATH_WRAPPER_VTQQ(function, arg1, arg2, arg3) \
template<typename T, typename Q> \
inline void function(T arg1, Q arg2, Q arg3) __attribute__((hc,cpu)) { \
hc::precise_math::function(arg1, arg2, arg3); \
}
#else
#define HC_MATH_WRAPPER_1(function, arg1) \
template<typename T> \
inline T function(T arg1) __attribute__((hc,cpu)) { \
return std::function(arg1); \
}
#define KALMAR_MATH_WRAPPER_1(function, arg1) \
template<typename T> \
inline T function(T arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1); \
}
#define HC_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) \
template<typename T> \
inline \
typename std::enable_if<std::is_integral<T>::value,HC_IMPLICIT_FLOAT_CONV>::type \
function(T arg1) __attribute__((hc,cpu)) { \
return ::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
} \
template<typename T> \
inline \
typename std::enable_if<std::is_floating_point <T>::value,T>::type \
function(T arg1) __attribute__((hc,cpu)) { \
return std::function(arg1); \
}
#define KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) \
template<typename T> \
inline \
typename std::enable_if<std::is_integral<T>::value,HC_IMPLICIT_FLOAT_CONV>::type \
function(T arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
} \
template<typename T> \
inline \
typename std::enable_if<std::is_floating_point <T>::value,T>::type \
function(T arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1); \
}
#define HC_MATH_WRAPPER_2(function, arg1, arg2) \
template<typename T> \
inline T function(T arg1, T arg2) __attribute__((hc,cpu)) { \
return std::function(arg1, arg2); \
}
#define HC_MATH_ALIAS_2(alias, function, arg1, arg2) \
template<typename T> \
inline T alias(T arg1, T arg2) __attribute__((hc,cpu)) { \
return std::function(arg1, arg2); \
}
#define HC_MATH_WRAPPER_3(function, arg1, arg2, arg3) \
template<typename T> \
inline T function(T arg1, T arg2, T arg3) __attribute__((hc,cpu)) { \
return std::function(arg1, arg2, arg3); \
}
#define HC_MATH_WRAPPER_TQ(function, arg1) \
template<typename T, typename Q> \
inline T function(Q arg1) __attribute__((hc,cpu)) { \
return std::function(arg1); \
}
#define HC_MATH_WRAPPER_FP_OVERLOAD_TQ(function, T, arg1) \
template<typename Q> \
inline \
typename std::enable_if<std::is_integral<Q>::value,T>::type \
function(Q arg1) __attribute__((hc)) { \
return std::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
}\
template<typename Q> \
inline \
typename std::enable_if<std::is_floating_point<Q>::value,T>::type \
function(Q arg1) __attribute__((hc)) { \
return std::function(arg1); \
}
#define HC_MATH_WRAPPER_TTQ(function, arg1, arg2) \
template<typename T, typename Q> \
inline T function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
return std::function(arg1, arg2); \
}
#define HC_MATH_WRAPPER_FP_OVERLOAD_TTQ(function, arg1, arg2) \
template<typename T, typename Q> \
inline \
typename std::enable_if<std::is_integral<T>::value||std::is_integral<Q>::value,HC_IMPLICIT_FLOAT_CONV>::type \
function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
return std::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1),static_cast<HC_IMPLICIT_FLOAT_CONV>(arg2)); \
}\
template<typename T, typename Q> \
inline \
typename std::enable_if<std::is_floating_point<T>::value&&std::is_floating_point<Q>::value,T>::type \
function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
return std::function(arg1,arg2); \
}
#define HC_MATH_WRAPPER_TTTQ(function, arg1, arg2, arg3) \
template<typename T, typename Q> \
inline T function(T arg1, T arg2, Q arg3) __attribute__((hc,cpu)) { \
return std::function(arg1, arg2, arg3); \
}
#define HC_MATH_WRAPPER_VTQQ(function, arg1, arg2, arg3) \
template<typename T, typename Q> \
inline void function(T arg1, Q arg2, Q arg3) __attribute__((hc,cpu)) { \
std::function(arg1, arg2, arg3); \
}
#endif
// override global math functions
namespace std {
// The following math functions are NOT available because they lack a GPU implementation
//
// erfinv
// erfcinv
// fpclassify
//
// The following math functions are NOT available because they lack a CPU implementation
//
// cospif
// cospi
// rsqrtf
// rsqrt
// sinpif
// sinpi
// tanpi
//
HC_MATH_WRAPPER_TQ(ilogbf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(ilogb, int, x)
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isfinite, bool, x)
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isinf, bool, x)
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isnan, bool, x)
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isnormal, bool, x)
HC_MATH_WRAPPER_TQ(nanf, tagp)
HC_MATH_WRAPPER_TQ(nan, tagp)
//HC_MATH_WRAPPER_TQ(signbitf, x)
HC_MATH_WRAPPER_TQ(signbit, x)
HC_MATH_WRAPPER_TTQ(frexpf, x, exp)
HC_MATH_WRAPPER_TTQ(frexp, x, exp)
HC_MATH_WRAPPER_TTQ(ldexpf, x, exp)
HC_MATH_WRAPPER_TTQ(ldexp, x, exp)
HC_MATH_WRAPPER_TTQ(lgammaf, x, exp)
HC_MATH_WRAPPER_TTQ(lgamma, x, exp)
HC_MATH_WRAPPER_TTQ(modff, x, exp)
HC_MATH_WRAPPER_TTQ(modf, x, exp)
HC_MATH_WRAPPER_TTQ(scalbnf, x, exp)
HC_MATH_WRAPPER_TTQ(scalbn, x, exp)
HC_MATH_WRAPPER_TTTQ(remquof, x, y, quo)
HC_MATH_WRAPPER_TTTQ(remquo, x, y, quo)
HC_MATH_WRAPPER_VTQQ(sincosf, x, s, c)
HC_MATH_WRAPPER_VTQQ(sincos, x, s, c)
HC_MATH_WRAPPER_1(acosf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(acos, x)
HC_MATH_WRAPPER_1(acoshf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(acosh, x)
HC_MATH_WRAPPER_1(asinf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(asin, x)
HC_MATH_WRAPPER_1(asinhf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(asinh, x)
HC_MATH_WRAPPER_1(atanf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(atan, x)
HC_MATH_WRAPPER_1(atanhf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(atanh, x)
HC_MATH_WRAPPER_2(atan2f, x, y)
HC_MATH_WRAPPER_2(atan2, x, y)
HC_MATH_WRAPPER_1(cbrtf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(cbrt, x)
HC_MATH_WRAPPER_1(ceilf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(ceil, x)
HC_MATH_WRAPPER_2(copysignf, x, y)
HC_MATH_WRAPPER_2(copysign, x, y)
HC_MATH_WRAPPER_1(cosf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(cos, x)
HC_MATH_WRAPPER_1(coshf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(cosh, x)
KALMAR_MATH_WRAPPER_1(cospif, x)
KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(cospi, x)
HC_MATH_WRAPPER_1(erff, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(erf, x)
HC_MATH_WRAPPER_1(erfcf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(erfc, x)
HC_MATH_WRAPPER_1(expf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(exp, x)
HC_MATH_WRAPPER_1(exp2f, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(exp2, x)
HC_MATH_WRAPPER_1(exp10f, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(exp10, x)
HC_MATH_WRAPPER_1(expm1f, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(expm1, x)
HC_MATH_WRAPPER_1(fabsf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(fabs, x)
HC_MATH_WRAPPER_2(fdimf, x, y)
HC_MATH_WRAPPER_2(fdim, x, y)
HC_MATH_WRAPPER_1(floorf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(floor, x)
HC_MATH_WRAPPER_3(fmaf, x, y, z)
HC_MATH_WRAPPER_3(fma, x, y, z)
HC_MATH_WRAPPER_2(fmaxf, x, y)
HC_MATH_WRAPPER_2(fmax, x, y)
HC_MATH_WRAPPER_2(fminf, x, y)
HC_MATH_WRAPPER_2(fmin, x, y)
HC_MATH_WRAPPER_2(fmodf, x, y)
HC_MATH_WRAPPER_2(fmod, x, y)
HC_MATH_WRAPPER_2(hypotf, x, y)
HC_MATH_WRAPPER_2(hypot, x, y)
HC_MATH_WRAPPER_1(logf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(log, x)
HC_MATH_WRAPPER_1(log10f, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(log10, x)
HC_MATH_WRAPPER_1(log2f, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(log2, x)
HC_MATH_WRAPPER_1(log1pf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(log1p, x)
HC_MATH_WRAPPER_1(logbf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(logb, x)
HC_MATH_WRAPPER_1(nearbyintf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(nearbyint, x)
HC_MATH_WRAPPER_2(nextafterf, x, y)
HC_MATH_WRAPPER_2(nextafter, x, y)
HC_MATH_WRAPPER_2(powf, x, y)
HC_MATH_WRAPPER_FP_OVERLOAD_TTQ(pow,x,y)
//HC_MATH_WRAPPER_1(rcbrtf, x)
//HC_MATH_WRAPPER_1(rcbrt, x)
HC_MATH_WRAPPER_2(remainderf, x, y)
HC_MATH_WRAPPER_2(remainder, x, y)
HC_MATH_WRAPPER_1(roundf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(round, x)
KALMAR_MATH_WRAPPER_1(rsqrtf, x)
KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(rsqrt, x)
HC_MATH_WRAPPER_2(scalbf, x, exp)
HC_MATH_WRAPPER_2(scalb, x, exp)
HC_MATH_WRAPPER_1(sinf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(sin, x)
HC_MATH_WRAPPER_1(sinhf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(sinh, x)
KALMAR_MATH_WRAPPER_1(sinpif, x)
KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(sinpi, x)
HC_MATH_WRAPPER_1(sqrtf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(sqrt, x)
HC_MATH_WRAPPER_1(tgammaf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(tgamma, x)
HC_MATH_WRAPPER_1(tanf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(tan, x)
HC_MATH_WRAPPER_1(tanhf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(tanh, x)
HC_MATH_WRAPPER_1(truncf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(trunc, x)
//HC_MATH_ALIAS_2(min, fmin, x, y)
//HC_MATH_ALIAS_2(max, fmax, x, y)
} // namespace std

View File

@ -125,6 +125,7 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
//----------------------------------------------------------------------------
// GCC native CAS supports int, long, unsigned int, unsigned long.
// Intel native CAS supports int and long with the same interface as GCC.
#if !defined(KOKKOS_ENABLE_ROCM_ATOMICS)
#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
@ -280,6 +281,7 @@ T atomic_compare_exchange( volatile T * const dest, const T compare, const T val
#endif
#endif
#endif // !defined ROCM_ATOMICS
template <typename T>
KOKKOS_INLINE_FUNCTION

View File

@ -158,6 +158,7 @@ T atomic_fetch_add( volatile T * const dest ,
#endif
#endif
//----------------------------------------------------------------------------
#if !defined(KOKKOS_ENABLE_ROCM_ATOMICS)
#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
@ -355,6 +356,7 @@ T atomic_fetch_add( volatile T * const dest , const T val )
#endif
#endif
#endif // !defined ROCM_ATOMICS
//----------------------------------------------------------------------------
// Simpler version of atomic_fetch_add without the fetch

View File

@ -135,6 +135,7 @@ T atomic_fetch_sub( volatile T * const dest ,
#endif
#endif
//----------------------------------------------------------------------------
#if !defined(KOKKOS_ENABLE_ROCM_ATOMICS)
#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
@ -263,6 +264,8 @@ T atomic_fetch_sub( volatile T * const dest , const T val )
#endif
#endif
#endif // !defined ROCM_ATOMICS
// Simpler version of atomic_fetch_sub without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION

View File

@ -238,7 +238,7 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
*dest = Oper::apply(return_val, val);
Impl::unlock_address_host_space( (void*) dest );
return return_val;
#else
#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
// This is a way to (hopefully) avoid dead lock in a warp
T return_val;
int done = 0;
@ -277,7 +277,7 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
*dest = return_val;
Impl::unlock_address_host_space( (void*) dest );
return return_val;
#else
#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
T return_val;
// This is a way to (hopefully) avoid dead lock in a warp
int done = 0;

View File

@ -62,6 +62,8 @@ int bit_first_zero( unsigned i ) noexcept
#if defined( __CUDA_ARCH__ )
return full != i ? __ffs( ~i ) - 1 : -1 ;
#elif defined( __HCC_ACCELERATOR__ )
return full != i ? (int)hc::__firstbit_u32_u32(~i) : -1 ;
#elif defined( KOKKOS_COMPILER_INTEL )
return full != i ? _bit_scan_forward( ~i ) : -1 ;
#elif defined( KOKKOS_COMPILER_IBM )
@ -82,6 +84,8 @@ int bit_scan_forward( unsigned i )
{
#if defined( __CUDA_ARCH__ )
return __ffs(i) - 1;
#elif defined( __HCC_ACCELERATOR__ )
return (int)hc::__firstbit_u32_u32(i);
#elif defined( KOKKOS_COMPILER_INTEL )
return _bit_scan_forward(i);
#elif defined( KOKKOS_COMPILER_IBM )
@ -106,6 +110,8 @@ int bit_scan_reverse( unsigned i )
enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
#if defined( __CUDA_ARCH__ )
return shift - __clz(i);
#elif defined( __HCC_ACCELERATOR__ )
return (int)hc::__firstbit_u32_u32(i);
#elif defined( KOKKOS_COMPILER_INTEL )
return _bit_scan_reverse(i);
#elif defined( KOKKOS_COMPILER_IBM )
@ -130,6 +136,8 @@ int bit_count( unsigned i )
{
#if defined( __CUDA_ARCH__ )
return __popc(i);
#elif defined( __HCC_ACCELERATOR__ )
return (int)hc::__popcount_u32_b32(i);
#elif defined ( __INTEL_COMPILER )
return _popcnt32(i);
#elif defined( KOKKOS_COMPILER_IBM )

View File

@ -72,6 +72,10 @@ uint64_t clock_tic(void) noexcept
return clock64();
#elif defined(__HCC_ACCELERATOR__)
// Get clock register
return hc::__clock_u64();
#elif defined( __i386__ ) || defined( __x86_64 )
// Return value of 64-bit hi-res clock register.

View File

@ -80,7 +80,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
const int num_threads = args.num_threads;
const int use_numa = args.num_numa;
#endif // defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS )
#if defined( KOKKOS_ENABLE_CUDA )
#if defined( KOKKOS_ENABLE_CUDA ) || defined( KOKKOS_ENABLE_ROCM )
const int use_gpu = args.device_id;
#endif // defined( KOKKOS_ENABLE_CUDA ) || defined( KOKKOS_ENABLE_ROCM )
@ -162,6 +162,18 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
}
#endif
#if defined( KOKKOS_ENABLE_ROCM )
if( std::is_same< Kokkos::Experimental::ROCm , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {
if (use_gpu > -1) {
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice( use_gpu ) );
}
else {
Kokkos::Experimental::ROCm::initialize();
}
std::cout << "Kokkos::initialize() fyi: ROCm enabled and initialized" << std::endl ;
}
#endif
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
@ -181,6 +193,13 @@ void finalize_internal( const bool all_spaces = false )
}
#endif
#if defined( KOKKOS_ENABLE_ROCM )
if( std::is_same< Kokkos::Experimental::ROCm , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
if(Kokkos::Experimental::ROCm::is_initialized())
Kokkos::Experimental::ROCm::finalize();
}
#endif
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
if( std::is_same< Kokkos::Experimental::OpenMPTarget , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
if(Kokkos::Experimental::OpenMPTarget::is_initialized())
@ -225,6 +244,12 @@ void fence_internal()
}
#endif
#if defined( KOKKOS_ENABLE_ROCM )
if( std::is_same< Kokkos::Experimental::ROCm , Kokkos::DefaultExecutionSpace >::value ) {
Kokkos::Experimental::ROCm::fence();
}
#endif
#if defined( KOKKOS_ENABLE_OPENMP )
if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {

View File

@ -75,7 +75,7 @@ void abort( const char * const message ) {
#ifdef __CUDA_ARCH__
Kokkos::Impl::cuda_abort(message);
#else
#ifndef KOKKOS_ENABLE_OPENMPTARGET
#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(__HCC_ACCELERATOR__)
Kokkos::Impl::host_abort(message);
#endif
#endif

View File

@ -275,7 +275,9 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer
for ( int i = 0 ; i < end ; ++i ) {
((int8_t*) & value )[i] = int8_t( step );
}
// Do not remove this store fence: without it the spin-wait below
// can hang on GCC with more than 8 threads.
store_fence();
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value );
}

View File

@ -53,6 +53,8 @@ void memory_fence()
{
#if defined( __CUDA_ARCH__ )
__threadfence();
#elif defined( KOKKOS_ENABLE_ROCM_ATOMICS )
amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"mfence" ::: "memory"

View File

@ -140,12 +140,6 @@
#endif
#endif
#ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
#ifndef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
#endif
#endif
#ifdef KOKKOS_HAVE_CXX1Z
#ifndef KOKKOS_ENABLE_CXX1Z
#define KOKKOS_ENABLE_CXX1Z KOKKOS_HAVE_CXX1Z

View File

@ -456,10 +456,11 @@ void TaskQueue< ExecSpace >::schedule_aggregate
// task->m_next == member of linked list (queue)
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
, task->m_dep_count
, task->m_task_type
, task->m_priority
, task->m_ref_count );
@ -597,7 +598,6 @@ void TaskQueue< ExecSpace >::complete
, task->m_task_type
, task->m_priority
, task->m_ref_count );
fflush( stdout );
#endif
task_root_type volatile & t = *task ;

View File

@ -1015,8 +1015,13 @@ struct ViewOffset< Dimension , Kokkos::LayoutLeft
constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs )
: m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
{
static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
, "ViewOffset LayoutLeft and LayoutRight are only compatible when rank == 1" );
static_assert(
( DimRHS::rank == 0 &&
dimension_type::rank == 0 ) ||
( DimRHS::rank == 1 &&
dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1 )
, "ViewOffset LayoutLeft and LayoutRight are only compatible when rank <= 1" );
}
template< class DimRHS >
@ -1024,8 +1029,13 @@ struct ViewOffset< Dimension , Kokkos::LayoutLeft
ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs )
: m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
{
static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
, "ViewOffset LayoutLeft and LayoutStride are only compatible when rank == 1" );
static_assert(
( DimRHS::rank == 0 &&
dimension_type::rank == 0 ) ||
( DimRHS::rank == 1 &&
dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1 )
, "ViewOffset LayoutLeft and LayoutStride are only compatible when rank <= 1" );
if ( rhs.m_stride.S0 != 1 ) {
Kokkos::abort("Kokkos::Impl::ViewOffset assignment of LayoutLeft from LayoutStride requires stride == 1" );
}
@ -1493,8 +1503,13 @@ struct ViewOffset< Dimension , Kokkos::LayoutRight
constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs )
: m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
{
static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
, "ViewOffset LayoutRight and LayoutLeft are only compatible when rank == 1" );
static_assert(
( DimRHS::rank == 0 &&
dimension_type::rank == 0 ) ||
( DimRHS::rank == 1 &&
dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1 )
, "ViewOffset LayoutRight and LayoutLeft are only compatible when rank <= 1" );
}
template< class DimRHS >
@ -1502,8 +1517,13 @@ struct ViewOffset< Dimension , Kokkos::LayoutRight
ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs )
: m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
{
static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
, "ViewOffset LayoutLeft/Right and LayoutStride are only compatible when rank == 1" );
static_assert(
( DimRHS::rank == 0 &&
dimension_type::rank == 0 ) ||
( DimRHS::rank == 1 &&
dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1 )
, "ViewOffset LayoutRight and LayoutString are only compatible when rank <= 1" );
if ( rhs.m_stride.S0 != 1 ) {
Kokkos::abort("Kokkos::Impl::ViewOffset assignment of LayoutLeft/Right from LayoutStride requires stride == 1" );
}

View File

@ -143,7 +143,7 @@ public:
//----------------------------------------
~ViewOffset() = default ;
KOKKOS_FUNCTION_DEFAULTED ~ViewOffset() = default ;
KOKKOS_INLINE_FUNCTION ViewOffset() = default ;
KOKKOS_INLINE_FUNCTION ViewOffset( const ViewOffset & ) = default ;
KOKKOS_INLINE_FUNCTION ViewOffset & operator = ( const ViewOffset & ) = default ;