Update Kokkos library to v2.03.13

Stan Moore
2017-07-31 10:34:21 -06:00
parent 6d0a228624
commit 13f2d39f55
171 changed files with 44039 additions and 2807 deletions

File diff suppressed because it is too large

View File

@ -53,6 +53,7 @@
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -125,53 +126,12 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
#endif
namespace Kokkos {
namespace Impl {
struct CudaLockArraysStruct {
int* atomic;
int* scratch;
int* threadid;
int n;
};
}
}
__device__ __constant__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
extern
#endif
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
namespace Kokkos {
namespace Impl {
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
}
}
namespace Kokkos {
namespace Impl {
__device__ inline
bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
}
__device__ inline
void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
}
}
}
template< typename T >
inline
__device__
@ -192,7 +152,7 @@ namespace Impl {
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template< class DriverType >
template< class DriverType>
__global__
static void cuda_parallel_launch_constant_memory()
{
@ -202,19 +162,39 @@ static void cuda_parallel_launch_constant_memory()
driver();
}
template< class DriverType >
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType>
__global__
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template < class DriverType ,
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template < class DriverType
, class LaunchBounds = Kokkos::LaunchBounds<>
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;
template < class DriverType >
struct CudaParallelLaunch< DriverType , true > {
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
inline
CudaParallelLaunch( const DriverType & driver
@ -238,26 +218,19 @@ struct CudaParallelLaunch< DriverType , true > {
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ) );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ) );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>();
cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
@ -267,8 +240,8 @@ struct CudaParallelLaunch< DriverType , true > {
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType , false > {
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
inline
CudaParallelLaunch( const DriverType & driver
@ -284,22 +257,15 @@ struct CudaParallelLaunch< DriverType , false > {
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared ) );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 ) );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
}
#endif
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
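// Illustrative sketch, not part of this diff: the LaunchBounds policy trait
// threaded through CudaParallelLaunch above is what carries maxTperB /
// minBperSM into the new __launch_bounds__ kernel wrappers.  A caller would
// request it roughly like this (the values 256 and 4 are examples only, and
// "example_launch_bounds" is a hypothetical helper, not Kokkos API):
inline void example_launch_bounds( const int N )
{
  Kokkos::parallel_for(
    Kokkos::RangePolicy< Kokkos::Cuda , Kokkos::LaunchBounds< 256 , 4 > >( 0 , N ) ,
    KOKKOS_LAMBDA ( const int i ) { (void) i ; } );
}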

View File

@ -230,18 +230,6 @@ void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t
} catch(...) {}
}
constexpr const char* CudaSpace::name() {
return m_name;
}
constexpr const char* CudaUVMSpace::name() {
return m_name;
}
constexpr const char* CudaHostPinnedSpace::name() {
return m_name;
}
} // namespace Kokkos
//----------------------------------------------------------------------------
@ -655,11 +643,12 @@ reallocate_tracked( void * const arg_alloc_ptr
SharedAllocationRecord< Kokkos::CudaSpace , void > *
SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordBase = SharedAllocationRecord< void , void > ;
using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ;
#if 0
using Header = SharedAllocationHeader ;
// Copy the header from the allocation
Header head ;
@ -812,83 +801,6 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
}
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace {
__global__ void init_lock_array_kernel_atomic() {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<CUDA_SPACE_ATOMIC_MASK+1)
kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
}
__global__ void init_lock_array_kernel_scratch_threadid(int N) {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<N) {
kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
}
}
}
namespace Impl {
int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1));
return ptr;
}
int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
void init_lock_arrays_cuda_space() {
static int is_initialized = 0;
if(! is_initialized) {
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
}
}
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
static void* ptr = NULL;
static std::int64_t current_size = 0;
@ -908,8 +820,8 @@ void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
return ptr;
}
}
}
} // namespace Impl
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
#endif // KOKKOS_ENABLE_CUDA

View File

@ -51,6 +51,7 @@
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
@ -69,9 +70,6 @@
__device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
__device__ __constant__
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
#endif
/*--------------------------------------------------------------------------*/
@ -103,6 +101,7 @@ int cuda_kernel_arch()
return arch ;
}
#ifdef KOKKOS_ENABLE_CUDA_UVM
bool cuda_launch_blocking()
{
const char * env = getenv("CUDA_LAUNCH_BLOCKING");
@ -111,16 +110,13 @@ bool cuda_launch_blocking()
return atoi(env);
}
#endif
}
void cuda_device_synchronize()
{
// static const bool launch_blocking = cuda_launch_blocking();
// if (!launch_blocking) {
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
// }
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
}
void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
@ -240,6 +236,7 @@ public:
unsigned m_maxWarpCount ;
unsigned m_maxBlock ;
unsigned m_maxSharedWords ;
uint32_t m_maxConcurrency ;
size_type m_scratchSpaceCount ;
size_type m_scratchFlagsCount ;
size_type m_scratchUnifiedCount ;
@ -248,6 +245,7 @@ public:
size_type * m_scratchSpace ;
size_type * m_scratchFlags ;
size_type * m_scratchUnified ;
uint32_t * m_scratchConcurrentBitset ;
cudaStream_t * m_stream ;
static int was_initialized;
@ -274,6 +272,7 @@ public:
, m_maxWarpCount( 0 )
, m_maxBlock( 0 )
, m_maxSharedWords( 0 )
, m_maxConcurrency( 0 )
, m_scratchSpaceCount( 0 )
, m_scratchFlagsCount( 0 )
, m_scratchUnifiedCount( 0 )
@ -282,6 +281,7 @@ public:
, m_scratchSpace( 0 )
, m_scratchFlags( 0 )
, m_scratchUnified( 0 )
, m_scratchConcurrentBitset( 0 )
, m_stream( 0 )
{}
@ -327,7 +327,8 @@ CudaInternal::~CudaInternal()
if ( m_stream ||
m_scratchSpace ||
m_scratchFlags ||
m_scratchUnified ) {
m_scratchUnified ||
m_scratchConcurrentBitset ) {
std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
<< std::endl ;
std::cerr.flush();
@ -339,6 +340,7 @@ CudaInternal::~CudaInternal()
m_maxWarpCount = 0 ;
m_maxBlock = 0 ;
m_maxSharedWords = 0 ;
m_maxConcurrency = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchUnifiedCount = 0 ;
@ -347,6 +349,7 @@ CudaInternal::~CudaInternal()
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
m_scratchUnified = 0 ;
m_scratchConcurrentBitset = 0 ;
m_stream = 0 ;
}
@ -485,6 +488,33 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
(void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
}
//----------------------------------
// Concurrent bitset for obtaining unique tokens from within
// an executing kernel.
{
const unsigned max_threads_per_sm = 2048 ; // up to capability 7.0
m_maxConcurrency =
max_threads_per_sm * cudaProp.multiProcessorCount ;
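// Illustrative arithmetic, not in the original: on a 56-SM device
// (e.g. P100) this gives 2048 * 56 = 114688 concurrency slots to track.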
const int32_t buffer_bound =
Kokkos::Impl::concurrent_bitset::buffer_bound( m_maxConcurrency );
// Allocate and initialize uint32_t[ buffer_bound ]
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchBitset"
, sizeof(uint32_t) * buffer_bound );
Record::increment( r );
m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>( r->data() );
CUDA_SAFE_CALL( cudaMemset( m_scratchConcurrentBitset , 0 , sizeof(uint32_t) * buffer_bound ) );
}
//----------------------------------
if ( stream_count ) {
m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
@ -543,16 +573,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_arrays_cuda_space();
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
Impl::initialize_host_cuda_lock_arrays();
}
//----------------------------------------------------------------------------
@ -635,9 +656,7 @@ void CudaInternal::finalize()
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
atomic_lock_array_cuda_space_ptr(true);
scratch_lock_array_cuda_space_ptr(true);
threadid_lock_array_cuda_space_ptr(true);
Impl::finalize_host_cuda_lock_arrays();
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
@ -653,6 +672,7 @@ void CudaInternal::finalize()
RecordCuda::decrement( RecordCuda::get_record( m_scratchFlags ) );
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
RecordCuda::decrement( RecordCuda::get_record( m_scratchConcurrentBitset ) );
m_cudaDev = -1 ;
m_multiProcCount = 0 ;
@ -666,6 +686,7 @@ void CudaInternal::finalize()
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
m_scratchUnified = 0 ;
m_scratchConcurrentBitset = 0 ;
m_stream = 0 ;
}
}
@ -713,9 +734,8 @@ namespace Kokkos {
Cuda::size_type Cuda::detect_device_count()
{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
int Cuda::concurrency() {
return 131072;
}
int Cuda::concurrency()
{ return Impl::CudaInternal::singleton().m_maxConcurrency ; }
int Cuda::is_initialized()
{ return Impl::CudaInternal::singleton().is_initialized(); }
@ -798,7 +818,22 @@ void Cuda::fence()
const char* Cuda::name() { return "Cuda"; }
} // namespace Kokkos
namespace Kokkos {
namespace Experimental {
UniqueToken< Kokkos::Cuda , Kokkos::Experimental::UniqueTokenScope::Global >::
UniqueToken( Kokkos::Cuda const & )
: m_buffer( Kokkos::Impl::CudaInternal::singleton().m_scratchConcurrentBitset )
, m_count( Kokkos::Impl::CudaInternal::singleton().m_maxConcurrency )
{}
} // namespace Experimental
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {}
#endif // KOKKOS_ENABLE_CUDA
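// Illustrative sketch, not part of this diff: with the constructor above, the
// new concurrent bitset backs Kokkos::Experimental::UniqueToken, letting a
// thread inside a kernel claim an index that is unique among concurrently
// executing threads (e.g. to pick a private scratch slot).  The helper name
// and the "slots" view below are hypothetical.
inline void example_unique_token( const int N )
{
  Kokkos::Experimental::UniqueToken<
    Kokkos::Cuda , Kokkos::Experimental::UniqueTokenScope::Global >
      token( Kokkos::Cuda() );
  Kokkos::View< int * , Kokkos::CudaSpace > slots( "slots" , token.size() );
  Kokkos::parallel_for( N , KOKKOS_LAMBDA ( const int i ) {
    const int id = token.acquire() ;  // unique among live threads, < token.size()
    slots( id ) += 1 ;                // no other live thread holds this id
    token.release( id ) ;
    (void) i ;
  } );
}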

View File

@ -0,0 +1,119 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Kokkos_Cuda.hpp>
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
namespace Kokkos {
namespace Impl {
__device__ __constant__
CudaLockArrays g_device_cuda_lock_arrays = { nullptr, nullptr, 0 };
}
}
#endif
namespace Kokkos {
namespace {
__global__ void init_lock_array_kernel_atomic() {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<CUDA_SPACE_ATOMIC_MASK+1) {
Kokkos::Impl::g_device_cuda_lock_arrays.atomic[i] = 0;
}
}
__global__ void init_lock_array_kernel_threadid(int N) {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<(unsigned)N) {
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[i] = 0;
}
}
} // namespace
namespace Impl {
CudaLockArrays g_host_cuda_lock_arrays = { nullptr, nullptr, 0 };
void initialize_host_cuda_lock_arrays() {
if (g_host_cuda_lock_arrays.atomic != nullptr) return;
CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1)));
CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
sizeof(int)*(Cuda::concurrency())));
CUDA_SAFE_CALL(cudaDeviceSynchronize());
g_host_cuda_lock_arrays.n = Cuda::concurrency();
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+1+255)/256,256>>>();
init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
void finalize_host_cuda_lock_arrays() {
if (g_host_cuda_lock_arrays.atomic == nullptr) return;
cudaFree(g_host_cuda_lock_arrays.atomic);
g_host_cuda_lock_arrays.atomic = nullptr;
cudaFree(g_host_cuda_lock_arrays.scratch);
g_host_cuda_lock_arrays.scratch = nullptr;
g_host_cuda_lock_arrays.n = 0;
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
#endif
}
} // namespace Impl
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_CUDA_CUDA_LOCKS_PREVENT_LINK_ERROR() {}
#endif

View File

@ -0,0 +1,166 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_LOCKS_HPP
#define KOKKOS_CUDA_LOCKS_HPP
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <cstdint>
#include <Cuda/Kokkos_Cuda_Error.hpp>
namespace Kokkos {
namespace Impl {
struct CudaLockArrays {
std::int32_t* atomic;
std::int32_t* scratch;
std::int32_t n;
};
/// \brief This global variable in Host space is the central definition
/// of these arrays.
extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays ;
/// \brief After this call, the g_host_cuda_lock_arrays variable has
/// valid, initialized arrays.
///
/// This call is idempotent.
void initialize_host_cuda_lock_arrays();
/// \brief After this call, the g_host_cuda_lock_arrays variable has
/// all null pointers, and all array memory has been freed.
///
/// This call is idempotent.
void finalize_host_cuda_lock_arrays();
} // namespace Impl
} // namespace Kokkos
#if defined( __CUDACC__ )
namespace Kokkos {
namespace Impl {
/// \brief This global variable in CUDA space is what kernels use
/// to get access to the lock arrays.
///
/// When relocatable device code is enabled, there can be one single
/// instance of this global variable for the entire executable,
/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
/// here must then be extern).
/// This one instance will be initialized by initialize_host_cuda_lock_arrays
/// and need not be modified afterwards.
///
/// When relocatable device code is disabled, an instance of this variable
/// will be created in every translation unit that sees this header file
/// (we make this clear by marking it static, meaning no other translation
/// unit can link to it).
/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
/// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels
/// that will use it.
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
__device__ __constant__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
extern
#endif
Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays ;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
/// \brief Acquire a lock for the address
///
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
__device__ inline
bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
return (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset],0,1));
}
/// \brief Release lock for the address
///
/// This function releases the lock for the hash value derived
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address.
__device__ inline
void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch( &Kokkos::Impl::g_device_cuda_lock_arrays.atomic[ offset ], 0);
}
} // namespace Impl
} // namespace Kokkos
/* Dan Ibanez: it is critical that this code be a macro, so that it will
capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
Putting this in an inline function will NOT do the right thing! */
#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
{ \
CUDA_SAFE_CALL(cudaMemcpyToSymbol( \
Kokkos::Impl::g_device_cuda_lock_arrays , \
& Kokkos::Impl::g_host_cuda_lock_arrays , \
sizeof(Kokkos::Impl::CudaLockArrays) ) ); \
}
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
#endif
#endif /* defined( __CUDACC__ ) */
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
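// Illustrative sketch, not part of this diff: these hashed per-address locks
// are what Kokkos' lock-based device atomics fall back on for types with no
// native CUDA atomic.  A guarded update looks roughly like the following
// (simplified; the real code lives in the Kokkos atomic implementation
// headers, and "example_locked_update" is a hypothetical name):
template< typename T , class Op >
__device__ inline
void example_locked_update( volatile T * const dest , const Op & op , const T val )
{
  bool done = false ;
  while ( ! done ) {
    if ( Kokkos::Impl::lock_address_cuda_space( (void*) dest ) ) {
      *dest = op( *dest , val ) ;
      Kokkos::Impl::unlock_address_cuda_space( (void*) dest ) ;
      done = true ;
    }
  }
}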

View File

@ -58,6 +58,7 @@
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Kokkos_Vectorization.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
@ -65,6 +66,8 @@
#include <typeinfo>
#endif
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -318,6 +321,7 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::launch_bounds LaunchBounds ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -363,7 +367,7 @@ public:
const dim3 block( 1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1);
const dim3 grid( std::min( ( nwork + block.y - 1 ) / block.y , cuda_internal_maximum_grid_count() ) , 1 , 1);
CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
ParallelFor( const FunctorType & arg_functor ,
@ -373,6 +377,115 @@ public:
{ }
};
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
using RP = Policy;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
typedef typename Policy::launch_bounds LaunchBounds;
const FunctorType m_functor ;
const Policy m_rp ;
public:
inline
__device__
void operator()(void) const
{
Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
}
inline
void execute() const
{
const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
if ( RP::rank == 2 )
{
const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
const dim3 grid(
std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
, std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
, 1
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 3 )
{
const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
const dim3 grid(
std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
, std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
, std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 4 )
{
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
const dim3 grid(
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
, static_cast<index_type>(maxblocks) )
, std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
, std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 5 )
{
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
const dim3 grid(
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
, static_cast<index_type>(maxblocks) )
, std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
, static_cast<index_type>(maxblocks) )
, std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 6 )
{
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
const dim3 grid(
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
, static_cast<index_type>(maxblocks) )
, std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
, static_cast<index_type>(maxblocks) )
, std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] )
, static_cast<index_type>(maxblocks) )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else
{
printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
Kokkos::abort("Aborting");
}
} //end execute
// inline
ParallelFor( const FunctorType & arg_functor
, Policy arg_policy )
: m_functor( arg_functor )
, m_rp( arg_policy )
{}
};
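// Illustrative sketch, not part of this diff: a rank-2 use of the
// MDRangePolicy specialization above.  The tile extents {8,8}, the view "A",
// and the helper name are all hypothetical.
inline void example_mdrange_for( Kokkos::View< double ** , Kokkos::CudaSpace > A ,
                                 const int M , const int N )
{
  typedef Kokkos::Experimental::MDRangePolicy<
            Kokkos::Cuda , Kokkos::Experimental::Rank<2> > policy_type ;
  Kokkos::parallel_for( policy_type( { 0 , 0 } , { M , N } , { 8 , 8 } ) ,
    KOKKOS_LAMBDA ( const int i , const int j ) { A( i , j ) = i + j ; } );
}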
template< class FunctorType , class ... Properties >
class ParallelFor< FunctorType
, Kokkos::TeamPolicy< Properties ... >
@ -384,6 +497,7 @@ private:
typedef TeamPolicyInternal< Kokkos::Cuda , Properties ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::launch_bounds LaunchBounds ;
public:
@ -430,15 +544,15 @@ public:
if ( m_scratch_size[1]>0 ) {
__shared__ int base_thread_id;
if (threadIdx.x==0 && threadIdx.y==0 ) {
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
int done = 0;
while (!done) {
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
if(!done) {
threadid += blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
}
}
base_thread_id = threadid;
@ -448,7 +562,8 @@ public:
}
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
const int int_league_size = (int)m_league_size;
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >(
typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
@ -462,7 +577,7 @@ public:
if ( m_scratch_size[1]>0 ) {
__syncthreads();
if (threadIdx.x==0 && threadIdx.y==0 )
kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
}
}
@ -473,7 +588,7 @@ public:
const dim3 grid( int(m_league_size) , 1 , 1 );
const dim3 block( int(m_vector_size) , int(m_team_size) , 1 );
CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_size_total ); // copy to device and execute
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
}
@ -529,6 +644,7 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
@ -563,6 +679,7 @@ private:
typedef int DummySHMEMReductionType;
public:
// Make the exec_range calls dispatch to Reduce::DeviceIterateTile
template< class TagType >
__device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
@ -686,7 +803,7 @@ public:
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
@ -737,6 +854,232 @@ public:
{ }
};
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::launch_bounds LaunchBounds;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::value_type value_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints: blockSize is a power of two AND blockDim.y == blockDim.z == 1
const FunctorType m_functor ;
const Policy m_policy ; // used for workrange and nwork
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit)
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
// Some crutch to do function overloading
private:
typedef double DummyShflReductionType;
typedef int DummySHMEMReductionType;
public:
inline
__device__
void
exec_range( reference_type update ) const
{
Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
}
inline
__device__
void operator() (void) const {
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
}
__device__ inline
void run(const DummySHMEMReductionType& ) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
{
reference_type value =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
this-> exec_range( value );
}
// Reduce with final value at blockDim.y - 1 location.
// Problem: non power-of-two blockDim
if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>(
ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
// This is the final block with the final result at the final threads' location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
}
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
}
}
__device__ inline
void run(const DummyShflReductionType&) const
{
value_type value;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const Member work_part =
( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion of tiles handled by each block
this-> exec_range( value );
pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y;
max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
value_type init;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTag>
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
if(id==0) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
*result = value;
}
}
}
// Determine block size constrained by shared memory:
static inline
unsigned local_block_size( const FunctorType & f )
{
unsigned n = CudaTraits::WarpSize * 8 ;
while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
return n ;
}
inline
void execute()
{
const int nwork = m_policy.m_num_tiles;
if ( nwork ) {
int block_size = m_policy.m_prod_tile_dims;
// CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
// Nearest power of two
int exponent_pow_two = std::ceil( std::log2(block_size) );
block_size = std::pow(2, exponent_pow_two);
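// Illustrative arithmetic, not in the original: a 5x10 tile gives
// m_prod_tile_dims = 50, exponent_pow_two = ceil(log2(50)) = 6, and the
// rounded block_size = 64, which is then compared to suggested_blocksize below.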
int suggested_blocksize = local_block_size( m_functor );
block_size = (block_size > suggested_blocksize) ? block_size : suggested_blocksize ; //Note: block_size must be less than or equal to 512
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
// REQUIRED ( 1 , N , 1 )
const dim3 block( 1 , block_size , 1 );
// Required grid.x <= block.y
const dim3 grid( std::min( int(block.y) , int( nwork ) ) , 1 , 1 );
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
}
}
}
else {
if (m_result_ptr) {
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
}
}
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
{}
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ReducerType & reducer)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().ptr_on_device() )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
{}
};
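// Illustrative sketch, not part of this diff: a rank-2 reduction through the
// MDRangePolicy ParallelReduce specialization above.  The tile extents, the
// view "A", and the helper name are hypothetical.
inline double example_mdrange_sum( Kokkos::View< const double ** , Kokkos::CudaSpace > A ,
                                   const int M , const int N )
{
  typedef Kokkos::Experimental::MDRangePolicy<
            Kokkos::Cuda , Kokkos::Experimental::Rank<2> > policy_type ;
  double sum = 0.0 ;
  Kokkos::parallel_reduce( policy_type( { 0 , 0 } , { M , N } , { 8 , 8 } ) ,
    KOKKOS_LAMBDA ( const int i , const int j , double & lsum ) { lsum += A( i , j ) ; } ,
    sum );
  return sum ;
}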
//----------------------------------------------------------------------------
#if 1
@ -753,6 +1096,7 @@ private:
typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
@ -819,15 +1163,15 @@ public:
if ( m_scratch_size[1]>0 ) {
__shared__ int base_thread_id;
if (threadIdx.x==0 && threadIdx.y==0 ) {
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
int done = 0;
while (!done) {
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
if(!done) {
threadid += blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
}
}
base_thread_id = threadid;
@ -840,7 +1184,7 @@ public:
if ( m_scratch_size[1]>0 ) {
__syncthreads();
if (threadIdx.x==0 && threadIdx.y==0 )
kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
}
}
@ -854,7 +1198,8 @@ public:
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
// Iterate this block through the league
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
const int int_league_size = (int)m_league_size;
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
@ -894,7 +1239,8 @@ public:
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
// Iterate this block through the league
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
const int int_league_size = (int)m_league_size;
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
@ -936,7 +1282,7 @@ public:
const dim3 grid( block_count , 1 , 1 );
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
Cuda::fence();
@ -975,12 +1321,6 @@ public:
, m_shmem_begin( 0 )
, m_shmem_size( 0 )
, m_scratch_ptr{NULL,NULL}
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
, m_scratch_size{
arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
@ -991,6 +1331,12 @@ public:
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
)}
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
{
// Return Init value if the number of worksets is zero
if( arg_policy.league_size() == 0) {
@ -1150,6 +1496,7 @@ private:
typedef typename reducer_type<>::pointer_type pointer_type ;
typedef typename reducer_type<>::reference_type reference_type ;
typedef typename reducer_type<>::value_type value_type ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::FunctorAnalysis
< Kokkos::Impl::FunctorPatternInterface::REDUCE
@ -1273,7 +1620,7 @@ public:
const int shmem = m_shmem_team_begin + m_shmem_team_size ;
// copy to device and execute
CudaParallelLaunch<ParallelReduce>( *this, grid, block, shmem );
CudaParallelLaunch<ParallelReduce,LaunchBounds>( *this, grid, block, shmem );
Cuda::fence();
@ -1373,7 +1720,7 @@ public:
if ( CudaTraits::WarpSize < team_threads ) {
// Need inter-warp team reduction (collectives) shared memory
// Speculate an upper bound for the value size
m_shmem_team_begin =
align_scratch( CudaTraits::warp_count(team_threads) * sizeof(double) );
@ -1426,7 +1773,7 @@ public:
// Reduce space has claim flag followed by value buffer
const int global_reduce_value_size =
max_concurrent_block *
( aligned_flag_size + align_scratch( value_size ) );
// Scratch space has claim flag followed by scratch buffer
@ -1469,6 +1816,7 @@ private:
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
@ -1655,10 +2003,10 @@ public:
const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 );
m_final = false ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
m_final = true ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
}
}

View File

@ -151,7 +151,7 @@ template< class ValueType , class JoinOp>
__device__
inline void cuda_intra_warp_reduction( ValueType& result,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
const uint32_t max_active_thread = blockDim.y) {
unsigned int shift = 1;
@ -268,29 +268,33 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
if( id + 1 < int(gridDim.x) )
join(value, tmp);
}
int active = __ballot(1);
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < int(gridDim.x) )
join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < int(gridDim.x) )
join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < int(gridDim.x) )
join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < int(gridDim.x) )
join(value, tmp);
}
active += __ballot(1);
}
}
//The last block has in its thread=0 the global reduction value through "value"
return last_block;
#else
@ -302,7 +306,7 @@ template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_warp_reduction( const ReducerType& reducer,
const int max_active_thread = blockDim.y) {
const uint32_t max_active_thread = blockDim.y) {
typedef typename ReducerType::value_type ValueType;
@ -428,26 +432,31 @@ cuda_inter_block_reduction( const ReducerType& reducer,
if( id + 1 < int(gridDim.x) )
reducer.join(value, tmp);
}
int active = __ballot(1);
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += __ballot(1);
}
}
@ -594,7 +603,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
//typedef typename ValueTraits::reference_type reference_type ;
// '__ffs' = position of the least significant bit set to 1.
// 'blockDim.y' is guaranteed to be a power of two so this
@ -637,7 +646,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
{
void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
reference_type shared_value = ValueInit::init( functor , shared_ptr );
/* reference_type shared_value = */ ValueInit::init( functor , shared_ptr );
for ( size_type i = b ; i < e ; ++i ) {
ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );

View File

@ -58,25 +58,56 @@ template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
#if defined( KOKKOS_DEBUG )
__device__
void verify_warp_convergence( const char * const where )
{
const unsigned b = __ballot(1);
if ( b != ~0u ) {
printf(" verify_warp_convergence( %s ) (%d,%d,%d) (%d,%d,%d) failed %x\n"
, where
, blockIdx.x
, blockIdx.y
, blockIdx.z
, threadIdx.x
, threadIdx.y
, threadIdx.z
, b );
}
}
#endif // #if defined( KOKKOS_DEBUG )
//----------------------------------------------------------------------------
__device__
void TaskQueueSpecialization< Kokkos::Cuda >::driver
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue
, int32_t shmem_per_warp )
{
using Member = TaskExec< Kokkos::Cuda > ;
using Queue = TaskQueue< Kokkos::Cuda > ;
using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
extern __shared__ int32_t shmem_all[];
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec( 1 );
Member team_exec( blockDim.y );
int32_t * const warp_shmem =
shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);
task_root_type * const task_shmem = (task_root_type *) warp_shmem ;
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
union {
task_root_type * ptr ;
int raw[2] ;
} task ;
Member single_exec( warp_shmem , 1 );
Member team_exec( warp_shmem , blockDim.y );
task_root_type * task_ptr ;
// Loop until all queues are empty and no tasks in flight
@ -87,41 +118,86 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver
if ( 0 == warp_lane ) {
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
task_ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
for ( int i = 0 ; i < Queue::NumQueue && end == task_ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
task_ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
}
}
#if 0
printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
, uintptr_t(task.ptr));
, uintptr_t(task_ptr));
#endif
}
// shuffle broadcast
task.raw[0] = __shfl( task.raw[0] , 0 );
task.raw[1] = __shfl( task.raw[1] , 0 );
((int*) & task_ptr )[0] = __shfl( ((int*) & task_ptr )[0] , 0 );
((int*) & task_ptr )[1] = __shfl( ((int*) & task_ptr )[1] , 0 );
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
#if defined( KOKKOS_DEBUG )
verify_warp_convergence("task_ptr");
#endif
if ( end != task.ptr ) {
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count
if ( end != task_ptr ) {
// Whole warp copy task's closure to/from shared memory.
// Use all threads of warp for coalesced read/write.
int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);
int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;
// copy global to shared memory:
for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
warp_shmem[i] = task_mem[i] ;
}
Kokkos::memory_fence();
// Copy done - use memory fence so that memory writes are visible.
// For reliable warp convergence on Pascal and Volta an explicit
// warp level synchronization will also be required.
if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
// Thread Team Task
(*task.ptr->m_apply)( task.ptr , & team_exec );
(*task_shmem->m_apply)( task_shmem , & team_exec );
}
else if ( 0 == threadIdx.y ) {
// Single Thread Task
(*task.ptr->m_apply)( task.ptr , & single_exec );
(*task_shmem->m_apply)( task_shmem , & single_exec );
}
// copy shared to global memory:
for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
task_mem[i] = warp_shmem[i] ;
}
Kokkos::memory_fence();
#if defined( KOKKOS_DEBUG )
verify_warp_convergence("apply");
#endif
// If respawn requested, copy respawn data back to main memory
if ( 0 == warp_lane ) {
queue->complete( task.ptr );
if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
}
queue->complete( task_ptr );
}
}
} while(1);
@ -130,18 +206,20 @@ printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
namespace {
__global__
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue
, int32_t shmem_size )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue , shmem_size ); }
}
void TaskQueueSpecialization< Kokkos::Cuda >::execute
( TaskQueue< Kokkos::Cuda > * const queue )
{
const int shared_per_warp = 2048 ;
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared = 0 ;
const int shared_total = shared_per_warp * warps_per_block ;
const cudaStream_t stream = 0 ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
@ -159,7 +237,7 @@ printf("cuda_task_queue_execute before\n");
//
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( queue , shared_per_warp );
CUDA_SAFE_CALL( cudaGetLastError() );

View File

@ -57,7 +57,7 @@ namespace {
template< typename TaskType >
__global__
void set_cuda_task_base_apply_function_pointer
( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr )
( TaskBase<void,void,void>::function_type * ptr )
{ *ptr = TaskType::apply ; }
}
@ -78,7 +78,7 @@ public:
void iff_single_thread_recursive_execute( queue_type * const ) {}
__device__
static void driver( queue_type * const );
static void driver( queue_type * const , int32_t );
static
void execute( queue_type * const );
@ -106,7 +106,14 @@ public:
extern template class TaskQueue< Kokkos::Cuda > ;
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/**\brief Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type
* passed to tasks running in a Cuda space.
*
@ -134,11 +141,13 @@ private:
friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
int32_t * m_team_shmem ;
const int m_team_size ;
__device__
TaskExec( int arg_team_size = blockDim.y )
: m_team_size( arg_team_size ) {}
TaskExec( int32_t * arg_team_shmem , int arg_team_size = blockDim.y )
: m_team_shmem( arg_team_shmem )
, m_team_size( arg_team_size ) {}
public:
@ -154,7 +163,13 @@ public:
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >

View File

@ -106,7 +106,7 @@ private:
typedef Kokkos::Cuda execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
void * m_team_reduce ;
mutable void * m_team_reduce ;
scratch_memory_space m_team_shared ;
int m_team_reduce_size ;
int m_league_rank ;
@ -166,7 +166,7 @@ public:
if ( 1 == blockDim.z ) { // team == block
__syncthreads();
// Wait for shared data write until all threads arrive here
if ( threadIdx.x == 0 && threadIdx.y == thread_id ) {
if ( threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id ) {
*((ValueType*) m_team_reduce) = val ;
}
__syncthreads(); // Wait for shared data read until root thread writes
@ -210,7 +210,7 @@ public:
const int wx =
( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
@ -354,7 +354,7 @@ public:
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x );
if ( threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
if ( (int)threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
}
// Broadcast from root lane to all other lanes.
@ -410,7 +410,7 @@ public:
value_type tmp( reducer.reference() );
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize );
@ -479,7 +479,7 @@ public:
__threadfence(); // Wait until global write is visible.
last_block = gridDim.x ==
last_block = (int)gridDim.x ==
1 + Kokkos::atomic_fetch_add(global_scratch_flags,1);
// If last block then reset count
@ -509,7 +509,7 @@ public:
reducer.copy( ((pointer_type)shmem) + offset
, ((pointer_type)global_scratch_space) + offset );
for ( int i = nentry + tid ; i < gridDim.x ; i += nentry ) {
for ( int i = nentry + tid ; i < (int)gridDim.x ; i += nentry ) {
reducer.join( ((pointer_type)shmem) + offset
, ((pointer_type)global_scratch_space)
+ i * reducer.length() );
@ -576,6 +576,14 @@ public:
, m_league_size( arg_league_size )
{}
public:
// Declare to avoid unused private member warnings which are triggered
// when SFINAE excludes the member function which uses these variables
// Making another class a friend also suppresses these warnings
bool impl_avoid_sfinae_warning() const noexcept
{
return m_team_reduce_size > 0 && m_team_reduce != nullptr;
}
};
} // namespace Impl
@ -913,10 +921,10 @@ void parallel_scan
// [t] += [t-4] if t >= 4
// ...
for ( int j = 1 ; j < blockDim.x ; j <<= 1 ) {
for ( int j = 1 ; j < (int)blockDim.x ; j <<= 1 ) {
value_type tmp = 0 ;
Impl::cuda_shfl_up( tmp , sval , j , blockDim.x );
if ( j <= threadIdx.x ) { sval += tmp ; }
if ( j <= (int)threadIdx.x ) { sval += tmp ; }
}
// Include accumulation and remove value for exclusive scan:

View File

@ -0,0 +1,133 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_UNIQUE_TOKEN_HPP
#define KOKKOS_CUDA_UNIQUE_TOKEN_HPP
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_UniqueToken.hpp>
#include <impl/Kokkos_SharedAlloc.hpp>
#include <impl/Kokkos_ConcurrentBitset.hpp>
namespace Kokkos { namespace Experimental {
// both global and instance Unique Tokens are implemented in the same way
template<>
class UniqueToken< Cuda, UniqueTokenScope::Global >
{
private:
uint32_t volatile * m_buffer ;
uint32_t m_count ;
public:
using execution_space = Cuda;
explicit
UniqueToken( execution_space const& );
KOKKOS_INLINE_FUNCTION
UniqueToken() : m_buffer(0), m_count(0) {}
KOKKOS_INLINE_FUNCTION
UniqueToken( const UniqueToken & ) = default;
KOKKOS_INLINE_FUNCTION
UniqueToken( UniqueToken && ) = default;
KOKKOS_INLINE_FUNCTION
UniqueToken & operator=( const UniqueToken & ) = default ;
KOKKOS_INLINE_FUNCTION
UniqueToken & operator=( UniqueToken && ) = default ;
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int32_t size() const noexcept { return m_count ; }
/// \brief acquire value such that 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int32_t acquire() const
{
const Kokkos::pair<int,int> result =
Kokkos::Impl::concurrent_bitset::
acquire_bounded( m_buffer
, m_count
, Kokkos::Impl::clock_tic() % m_count
);
if ( result.first < 0 ) {
Kokkos::abort("UniqueToken<Cuda> failure to release tokens, no tokens available" );
}
return result.first;
}
/// \brief release an acquired value
KOKKOS_INLINE_FUNCTION
void release( int32_t i ) const noexcept
{
Kokkos::Impl::concurrent_bitset::release( m_buffer, i );
}
};
template<>
class UniqueToken< Cuda, UniqueTokenScope::Instance >
: public UniqueToken< Cuda, UniqueTokenScope::Global >
{
public:
explicit
UniqueToken( execution_space const& arg )
: UniqueToken< Cuda, UniqueTokenScope::Global >( arg ) {}
};
}} // namespace Kokkos::Experimental
#endif // KOKKOS_ENABLE_CUDA
#endif // KOKKOS_CUDA_UNIQUE_TOKEN_HPP
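For illustration only (not part of the patch), a minimal usage sketch of the interface above: each iteration acquires an id in [0, size()) to index per-worker scratch storage and releases it afterwards. The view name, the extent N, the explicitly constructed Cuda instance, and enabled CUDA lambda support are assumptions of the sketch.
Kokkos::Experimental::UniqueToken< Kokkos::Cuda > token( Kokkos::Cuda() );
Kokkos::View< double * , Kokkos::CudaSpace > scratch( "scratch" , token.size() );
Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda >( 0 , N ) ,
  KOKKOS_LAMBDA( const int ) {
    const int32_t id = token.acquire();   // 0 <= id < token.size()
    scratch( id ) += 1.0 ;                // exclusive use of slot 'id'
    token.release( id );                  // return the slot for reuse
  } );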

View File

@ -221,7 +221,6 @@ struct CudaLDGFetch {
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
@ -294,9 +293,8 @@ public:
}
};
}
}
}
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -0,0 +1,119 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
#define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::Cuda
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Cuda,
Traits ...
>
{
public:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Cuda, Traits ... > Base ;
typedef ParallelFor<FunctorType, Policy, Kokkos::Cuda> Self ;
private:
template< class TagType >
__device__
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
template< class TagType >
__device__
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
public:
__device__
inline
void operator()() const {
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
}
}
inline
void execute()
{
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared = 0 ;
const cudaStream_t stream = 0 ;
Kokkos::Impl::CudaParallelLaunch<Self>(*this, grid, block, shared, stream);
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP */

View File

@ -52,6 +52,7 @@
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
#endif
namespace Kokkos { namespace Experimental {
@ -120,28 +121,17 @@ struct MDRangePolicy
, typename traits::index_type
> ;
typedef MDRangePolicy execution_policy; // needed for is_execution_space interrogation
static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
, "Kokkos Error: MD iteration pattern not defined" );
using iteration_pattern = typename traits::iteration_pattern;
using work_tag = typename traits::work_tag;
using launch_bounds = typename traits::launch_bounds;
using member_type = typename range_policy::member_type;
static constexpr int rank = iteration_pattern::rank;
static constexpr int outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default)
? iteration_pattern::outer_direction
: default_outer_direction< typename traits::execution_space>::value );
static constexpr int inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename traits::execution_space>::value ) ;
// Ugly ugly workaround intel 14 not handling scoped enum correctly
static constexpr int Right = static_cast<int>( Iterate::Right );
static constexpr int Left = static_cast<int>( Iterate::Left );
enum { rank = static_cast<int>(iteration_pattern::rank) };
using index_type = typename traits::index_type;
using array_index_type = long;
@ -155,11 +145,50 @@ struct MDRangePolicy
// This would require the user to either pass a matching index_type parameter
// as template parameter to the MDRangePolicy or static_cast the individual values
point_type m_lower;
point_type m_upper;
tile_type m_tile;
point_type m_tile_end;
index_type m_num_tiles;
index_type m_prod_tile_dims;
/*
// NDE enum impl definition alternative - replace static constexpr int ?
enum { outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default)
? iteration_pattern::outer_direction
: default_outer_direction< typename traits::execution_space>::value ) };
enum { inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename traits::execution_space>::value ) };
enum { Right = static_cast<int>( Iterate::Right ) };
enum { Left = static_cast<int>( Iterate::Left ) };
*/
//static constexpr int rank = iteration_pattern::rank;
static constexpr int outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default)
? iteration_pattern::outer_direction
: default_outer_direction< typename traits::execution_space>::value );
static constexpr int inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename traits::execution_space>::value ) ;
// Ugly ugly workaround intel 14 not handling scoped enum correctly
static constexpr int Right = static_cast<int>( Iterate::Right );
static constexpr int Left = static_cast<int>( Iterate::Left );
MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
: m_lower(lower)
, m_upper(upper)
, m_tile(tile)
, m_num_tiles(1)
, m_prod_tile_dims(1)
{
// Host
if ( true
@ -172,8 +201,8 @@ struct MDRangePolicy
for (int i=0; i<rank; ++i) {
span = upper[i] - lower[i];
if ( m_tile[i] <= 0 ) {
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|| ((int)inner_direction == (int)Left && (i > 0)) )
{
m_tile[i] = 2;
}
@ -183,6 +212,7 @@ struct MDRangePolicy
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
m_prod_tile_dims *= m_tile[i];
}
}
#if defined(KOKKOS_ENABLE_CUDA)
@ -190,14 +220,18 @@ struct MDRangePolicy
{
index_type span;
for (int i=0; i<rank; ++i) {
span = upper[i] - lower[i];
span = m_upper[i] - m_lower[i];
if ( m_tile[i] <= 0 ) {
// TODO: determine what is a good default tile size for cuda
// may be rank dependent
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|| ((int)inner_direction == (int)Left && (i > 0)) )
{
m_tile[i] = 2;
if ( m_prod_tile_dims < 512 ) {
m_tile[i] = 2;
} else {
m_tile[i] = 1;
}
}
else {
m_tile[i] = 16;
@ -205,12 +239,9 @@ struct MDRangePolicy
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
m_prod_tile_dims *= m_tile[i];
}
index_type total_tile_size_check = 1;
for (int i=0; i<rank; ++i) {
total_tile_size_check *= m_tile[i];
}
if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
if ( m_prod_tile_dims > 512 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024
printf(" Tile dimensions exceed Cuda limits\n");
Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
//Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
@ -223,19 +254,7 @@ struct MDRangePolicy
template < typename LT , typename UT , typename TT = array_index_type >
MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
{
#if 0
// This should work, less duplicated code but not yet extensively tested
point_type lower_tmp, upper_tmp;
tile_type tile_tmp;
for ( auto i = 0; i < rank; ++i ) {
lower_tmp[i] = static_cast<array_index_type>(lower.begin()[i]);
upper_tmp[i] = static_cast<array_index_type>(upper.begin()[i]);
tile_tmp[i] = static_cast<array_index_type>(tile.begin()[i]);
}
MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
#else
if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
@ -249,7 +268,7 @@ struct MDRangePolicy
}
m_num_tiles = 1;
m_prod_tile_dims = 1;
// Host
if ( true
@ -262,8 +281,8 @@ struct MDRangePolicy
for (int i=0; i<rank; ++i) {
span = m_upper[i] - m_lower[i];
if ( m_tile[i] <= 0 ) {
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|| ((int)inner_direction == (int)Left && (i > 0)) )
{
m_tile[i] = 2;
}
@ -273,6 +292,7 @@ struct MDRangePolicy
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
m_prod_tile_dims *= m_tile[i];
}
}
#if defined(KOKKOS_ENABLE_CUDA)
@ -284,10 +304,14 @@ struct MDRangePolicy
if ( m_tile[i] <= 0 ) {
// TODO: determine what is a good default tile size for cuda
// may be rank dependent
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|| ((int)inner_direction == (int)Left && (i > 0)) )
{
m_tile[i] = 2;
if ( m_prod_tile_dims < 512 ) {
m_tile[i] = 2;
} else {
m_tile[i] = 1;
}
}
else {
m_tile[i] = 16;
@ -295,32 +319,22 @@ struct MDRangePolicy
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
m_prod_tile_dims *= m_tile[i];
}
index_type total_tile_size_check = 1;
for (int i=0; i<rank; ++i) {
total_tile_size_check *= m_tile[i];
}
if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
if ( m_prod_tile_dims > 512 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024
printf(" Tile dimensions exceed Cuda limits\n");
Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
//Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
}
}
#endif
#endif
}
point_type m_lower;
point_type m_upper;
tile_type m_tile;
point_type m_tile_end;
index_type m_num_tiles;
};
// ------------------------------------------------------------------ //
// ------------------------------------------------------------------ //
//md_parallel_for
//md_parallel_for - deprecated use parallel_for
// ------------------------------------------------------------------ //
template <typename MDRange, typename Functor, typename Enable = void>
void md_parallel_for( MDRange const& range
@ -335,7 +349,6 @@ void md_parallel_for( MDRange const& range
{
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
@ -354,7 +367,6 @@ void md_parallel_for( const std::string& str
{
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
@ -395,7 +407,7 @@ void md_parallel_for( MDRange const& range
// ------------------------------------------------------------------ //
// ------------------------------------------------------------------ //
//md_parallel_reduce
//md_parallel_reduce - deprecated use parallel_reduce
// ------------------------------------------------------------------ //
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( MDRange const& range
@ -409,9 +421,8 @@ void md_parallel_reduce( MDRange const& range
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
}
@ -428,48 +439,14 @@ void md_parallel_reduce( const std::string& str
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
}
// Cuda - parallel_reduce not implemented yet
/*
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( MDRange const& range
, Functor const& f
, ValueType & v
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
closure.execute();
}
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( const std::string& str
, MDRange const& range
, Functor const& f
, ValueType & v
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
closure.execute();
}
*/
// Cuda - md_parallel_reduce not implemented - use parallel_reduce
}} // namespace Kokkos::Experimental
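A hedged sketch of the non-deprecated path referred to above: construct an MDRangePolicy from lower/upper bounds and optional tile sizes, then pass it directly to parallel_for. The rank-2 views A, B, C and the extents N, M are assumptions, and the Rank alias is taken to live in Kokkos::Experimental as in this release.
using policy2_t = Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> >;
policy2_t policy( {0,0} , {N,M} , {16,16} );   // lower , upper , tile
Kokkos::parallel_for( policy , KOKKOS_LAMBDA( const int i , const int j ) {
  A(i,j) = B(i,j) + C(i,j);   // tile sizes bound the thread block on Cuda
} );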

View File

@ -114,40 +114,9 @@
#endif /* Not pre-selected atomic implementation */
#endif
//----------------------------------------------------------------------------
// Forward declaration of functions supporting arbitrary sized atomics
// This is necessary since Kokkos_Atomic.hpp is internally included very early
// through Kokkos_HostSpace.hpp as well as the allocation tracker.
#ifdef KOKKOS_ENABLE_CUDA
namespace Kokkos {
namespace Impl {
/// \brief Acquire a lock for the address
///
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
extern
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#endif
__device__ inline
bool lock_address_cuda_space(void* ptr);
/// \brief Release lock for the address
///
/// This function releases the lock for the hash value derived
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address.
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
extern
#endif
__device__ inline
void unlock_address_cuda_space(void* ptr);
}
}
#endif
namespace Kokkos {
template <typename T>

View File

@ -79,6 +79,21 @@ struct IndexType
using type = T;
};
/**\brief Specify Launch Bounds for CUDA execution.
*
* The "best" defaults may be architecture specific.
*/
template< unsigned int maxT = 1024 /* Max threads per block */
, unsigned int minB = 1 /* Min blocks per SM */
>
struct LaunchBounds
{
using launch_bounds = LaunchBounds;
using type = LaunchBounds<maxT,minB>;
static unsigned int constexpr maxTperB {maxT};
static unsigned int constexpr minBperSM {minB};
};
} // namespace Kokkos
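A brief, illustrative sketch of attaching the new property to an execution policy (the extent N and the kernel body are assumptions): the two parameters are forwarded to the CUDA launch as __launch_bounds__(maxTperB, minBperSM).
// Request at most 256 threads per block and at least 4 resident blocks per SM.
using bounded_policy = Kokkos::RangePolicy< Kokkos::Cuda , Kokkos::LaunchBounds<256,4> >;
Kokkos::parallel_for( bounded_policy( 0 , N ) , KOKKOS_LAMBDA( const int i ) {
  /* kernel body */
} );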
//----------------------------------------------------------------------------
@ -119,6 +134,7 @@ using Kokkos::is_array_layout ;
KOKKOS_IMPL_IS_CONCEPT( iteration_pattern )
KOKKOS_IMPL_IS_CONCEPT( schedule_type )
KOKKOS_IMPL_IS_CONCEPT( index_type )
KOKKOS_IMPL_IS_CONCEPT( launch_bounds )
}

View File

@ -96,11 +96,13 @@ struct InitArguments {
int num_numa;
int device_id;
InitArguments() {
num_threads = -1;
num_numa = -1;
device_id = -1;
}
InitArguments( int nt = -1
, int nn = -1
, int dv = -1)
: num_threads( nt )
, num_numa( nn )
, device_id( dv )
{}
};
void initialize(int& narg, char* arg[]);
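For context, a small sketch of how the defaulted fields above are typically consumed, assuming the InitArguments overload of Kokkos::initialize that accompanies this struct; the thread count and device id below are arbitrary.
Kokkos::InitArguments args( 8 /*num_threads*/ , -1 /*num_numa*/ , 0 /*device_id*/ );
Kokkos::initialize( args );
/* ... parallel kernels ... */
Kokkos::finalize();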
@ -168,6 +170,9 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
} // namespace Kokkos
#include <Kokkos_Crs.hpp>
#include <Kokkos_WorkGraphPolicy.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -51,6 +51,9 @@
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_Utilities.hpp>
#include <Kokkos_UniqueToken.hpp>
#include <Kokkos_MasterLock.hpp>
//----------------------------------------------------------------------------
// Have assumed a 64bit build (8byte pointers) throughout the code base.

View File

@ -0,0 +1,333 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CRS_HPP
#define KOKKOS_CRS_HPP
namespace Kokkos {
namespace Experimental {
/// \class Crs
/// \brief Compressed row storage array.
///
/// \tparam DataType The type of stored entries. If a Crs is
/// used as the graph of a sparse matrix, then this is usually an
/// integer type, the type of the column indices in the sparse
/// matrix.
///
/// \tparam Arg1Type The second template parameter, corresponding
/// either to the Device type (if there are no more template
/// parameters) or to the Layout type (if there is at least one more
/// template parameter).
///
/// \tparam Arg2Type The third template parameter, which if provided
/// corresponds to the Device type.
///
/// \tparam SizeType The type of row offsets. Usually the default
/// parameter suffices. However, setting a nondefault value is
/// necessary in some cases, for example, if you want to have a
/// sparse matrix with dimensions (and therefore column indices)
/// that fit in \c int, but want to store more than <tt>INT_MAX</tt>
/// entries in the sparse matrix.
///
/// A row has a range of entries:
/// <ul>
/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt> </li>
/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
/// </ul>
template< class DataType,
class Arg1Type,
class Arg2Type = void,
typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type>
class Crs {
protected:
typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;
public:
typedef DataType data_type;
typedef typename traits::array_layout array_layout;
typedef typename traits::execution_space execution_space;
typedef typename traits::memory_space memory_space;
typedef typename traits::device_type device_type;
typedef SizeType size_type;
typedef Crs< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type;
typedef Crs< DataType , array_layout , typename traits::host_mirror_space , SizeType > HostMirror;
typedef View<size_type* , array_layout, device_type> row_map_type;
typedef View<DataType* , array_layout, device_type> entries_type;
entries_type entries;
row_map_type row_map;
//! Construct an empty view.
Crs () : entries(), row_map() {}
//! Copy constructor (shallow copy).
Crs (const Crs& rhs) : entries (rhs.entries), row_map (rhs.row_map)
{}
template<class EntriesType, class RowMapType>
Crs (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_)
{}
/** \brief Assign to a view of the rhs array.
* If the old view is the last view
* then allocated memory is deallocated.
*/
Crs& operator= (const Crs& rhs) {
entries = rhs.entries;
row_map = rhs.row_map;
return *this;
}
/** \brief Destroy this view of the array.
* If the last view then allocated memory is deallocated.
*/
~Crs() {}
/** \brief Return number of rows in the graph
*/
KOKKOS_INLINE_FUNCTION
size_type numRows() const {
return (row_map.dimension_0 () != 0) ?
row_map.dimension_0 () - static_cast<size_type> (1) :
static_cast<size_type> (0);
}
};
/*--------------------------------------------------------------------------*/
template< class OutCounts,
class DataType,
class Arg1Type,
class Arg2Type,
class SizeType>
void get_crs_transpose_counts(
OutCounts& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
std::string const& name = "transpose_counts");
template< class OutCounts,
class InCrs>
void get_crs_row_map_from_counts(
OutCounts& out,
InCrs const& in,
std::string const& name = "row_map");
template< class DataType,
class Arg1Type,
class Arg2Type,
class SizeType>
void transpose_crs(
Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in);
}} // namespace Kokkos::Experimental
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace Experimental {
template <class InCrs, class OutCounts>
class GetCrsTransposeCounts {
public:
using execution_space = typename InCrs::execution_space;
using self_type = GetCrsTransposeCounts<InCrs, OutCounts>;
using index_type = typename InCrs::size_type;
private:
InCrs in;
OutCounts out;
public:
KOKKOS_INLINE_FUNCTION
void operator()(index_type i) const {
atomic_increment( &out[in.entries(i)] );
}
GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out):
in(arg_in),out(arg_out) {
using policy_type = RangePolicy<index_type, execution_space>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, index_type(in.entries.size())));
closure.execute();
execution_space::fence();
}
};
template <class InCounts, class OutRowMap>
class CrsRowMapFromCounts {
public:
using execution_space = typename InCounts::execution_space;
using value_type = typename OutRowMap::value_type;
using index_type = typename InCounts::size_type;
private:
InCounts in;
OutRowMap out;
public:
KOKKOS_INLINE_FUNCTION
void operator()(index_type i, value_type& update, bool final_pass) const {
update += in(i);
if (final_pass) {
out(i + 1) = update;
if (i == 0) {
out(0) = 0;
}
}
}
KOKKOS_INLINE_FUNCTION
void init(value_type& update) const { update = 0; }
KOKKOS_INLINE_FUNCTION
void join(volatile value_type& update, const volatile value_type& input) const {
update += input;
}
using self_type = CrsRowMapFromCounts<InCounts, OutRowMap>;
CrsRowMapFromCounts(InCounts const& arg_in, OutRowMap const& arg_out):
in(arg_in),out(arg_out) {
using policy_type = RangePolicy<index_type, execution_space>;
using closure_type = Kokkos::Impl::ParallelScan<self_type, policy_type>;
closure_type closure(*this, policy_type(0, in.size()));
closure.execute();
execution_space::fence();
}
};
template <class InCrs, class OutCrs>
class FillCrsTransposeEntries {
public:
using execution_space = typename InCrs::execution_space;
using memory_space = typename InCrs::memory_space;
using value_type = typename OutCrs::entries_type::value_type;
using index_type = typename InCrs::size_type;
private:
using counters_type = View<index_type*, memory_space>;
InCrs in;
OutCrs out;
counters_type counters;
public:
KOKKOS_INLINE_FUNCTION
void operator()(index_type i) const {
auto begin = in.row_map(i);
auto end = in.row_map(i + 1);
for (auto j = begin; j < end; ++j) {
auto ti = in.entries(j);
auto tbegin = out.row_map(ti);
auto tj = atomic_fetch_add( &counters(ti), 1 );
out.entries( tbegin + tj ) = i;
}
}
using self_type = FillCrsTransposeEntries<InCrs, OutCrs>;
FillCrsTransposeEntries(InCrs const& arg_in, OutCrs const& arg_out):
in(arg_in),out(arg_out),
counters("counters", arg_out.numRows()) {
using policy_type = RangePolicy<index_type, execution_space>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, index_type(in.numRows())));
closure.execute();
execution_space::fence();
}
};
}}} // namespace Kokkos::Impl::Experimental
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
template< class OutCounts,
class DataType,
class Arg1Type,
class Arg2Type,
class SizeType>
void get_crs_transpose_counts(
OutCounts& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
std::string const& name) {
using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>;
out = OutCounts(name, in.numRows());
Kokkos::Impl::Experimental::
GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
}
template< class OutRowMap,
class InCounts>
void get_crs_row_map_from_counts(
OutRowMap& out,
InCounts const& in,
std::string const& name) {
out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
Kokkos::Impl::Experimental::
CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
}
template< class DataType,
class Arg1Type,
class Arg2Type,
class SizeType>
void transpose_crs(
Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in)
{
typedef Crs<DataType, Arg1Type, Arg2Type, SizeType> crs_type ;
typedef typename crs_type::memory_space memory_space ;
typedef View<SizeType*, memory_space> counts_type ;
{
counts_type counts;
Kokkos::Experimental::get_crs_transpose_counts(counts, in);
Kokkos::Experimental::get_crs_row_map_from_counts(out.row_map, counts,
"tranpose_row_map");
}
out.entries = decltype(out.entries)("transpose_entries", in.entries.size());
Kokkos::Impl::Experimental::
FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
}
}} // namespace Kokkos::Experimental
#endif /* #define KOKKOS_CRS_HPP */
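A minimal sketch of the public interface declared above (graph construction is elided; the entry type and use of the default execution space are assumptions):
using graph_t = Kokkos::Experimental::Crs< int , Kokkos::DefaultExecutionSpace >;
graph_t graph ;       /* ... fill graph.row_map and graph.entries ... */
graph_t transposed ;
Kokkos::Experimental::transpose_crs( transposed , graph );  // fills row_map and entries of 'transposed'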

View File

@ -217,8 +217,8 @@ public:
private:
cudaStream_t m_stream ;
int m_device ;
cudaStream_t m_stream ;
};
} // namespace Kokkos
@ -295,6 +295,7 @@ struct VerifyExecutionCanAccessMemorySpace
#include <Cuda/Kokkos_Cuda_Team.hpp>
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
#include <Cuda/Kokkos_Cuda_Task.hpp>
#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------

View File

@ -90,7 +90,7 @@ public:
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
static constexpr const char* name() { return m_name; }
/*--------------------------------*/
/** \brief Error reporting for HostSpace attempt to access CudaSpace */
@ -186,7 +186,7 @@ public:
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
static constexpr const char* name() { return m_name; }
/*--------------------------------*/
@ -234,7 +234,7 @@ public:
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
static constexpr const char* name() { return m_name; }
private:

View File

@ -384,6 +384,7 @@ Impl::PerThreadValue PerThread(const int& arg);
* WorkTag (none): Tag which is used as the first argument for the functor operator.
* Schedule<Type> (Schedule<Static>): Scheduling Policy (Dynamic, or Static).
* IndexType<Type> (IndexType<ExecutionSpace::size_type>): Integer Index type used to iterate over the Index space.
* LaunchBounds<int,int> (LaunchBounds<1024,1>): Launch Bounds for CUDA compilation.
*/
template< class ... Properties>
class TeamPolicy: public
@ -561,6 +562,45 @@ KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType>
ThreadVectorRange( const TeamMemberType&, const iType& count );
#if defined(KOKKOS_ENABLE_PROFILING)
namespace Impl {
template<typename FunctorType, typename TagType,
bool HasTag = !std::is_same<TagType, void>::value >
struct ParallelConstructName;
template<typename FunctorType, typename TagType>
struct ParallelConstructName<FunctorType, TagType, true> {
ParallelConstructName(std::string const& label):label_ref(label) {
if (label.empty()) {
default_name = std::string(typeid(FunctorType).name()) + "/" +
typeid(TagType).name();
}
}
std::string const& get() {
return (label_ref.empty()) ? default_name : label_ref;
}
std::string const& label_ref;
std::string default_name;
};
template<typename FunctorType, typename TagType>
struct ParallelConstructName<FunctorType, TagType, false> {
ParallelConstructName(std::string const& label):label_ref(label) {
if (label.empty()) {
default_name = std::string(typeid(FunctorType).name());
}
}
std::string const& get() {
return (label_ref.empty()) ? default_name : label_ref;
}
std::string const& label_ref;
std::string default_name;
};
} // namespace Impl
#endif /* defined KOKKOS_ENABLE_PROFILING */
} // namespace Kokkos
#endif /* #define KOKKOS_EXECPOLICY_HPP */
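To illustrate the property list documented above (league_size and the team body are assumptions), a TeamPolicy can combine Schedule, IndexType, and the new LaunchBounds property:
using team_policy = Kokkos::TeamPolicy< Kokkos::Cuda
                                      , Kokkos::Schedule< Kokkos::Static >
                                      , Kokkos::IndexType< int >
                                      , Kokkos::LaunchBounds< 256 , 4 > >;
Kokkos::parallel_for( team_policy( league_size , Kokkos::AUTO ) ,
  KOKKOS_LAMBDA( const team_policy::member_type & team ) {
    /* team-collective work */
  } );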

View File

@ -126,14 +126,6 @@ public:
//! This memory space preferred device_type
typedef Kokkos::Device< execution_space, memory_space > device_type;
/*--------------------------------*/
/* Functions unique to the HBWSpace */
static int in_parallel();
static void register_in_parallel( int (*)() );
/*--------------------------------*/
/**\brief Default memory space instance */
HBWSpace();
HBWSpace( const HBWSpace & rhs ) = default;

View File

@ -130,14 +130,6 @@ public:
//! This memory space preferred device_type
typedef Kokkos::Device< execution_space, memory_space > device_type;
/*--------------------------------*/
/* Functions unique to the HostSpace */
static int in_parallel();
static void register_in_parallel( int (*)() );
/*--------------------------------*/
/**\brief Default memory space instance */
HostSpace();
HostSpace( HostSpace && rhs ) = default;
@ -161,7 +153,7 @@ public:
, const size_t arg_alloc_size ) const;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
static constexpr const char* name() { return m_name; }
private:
AllocationMechanism m_alloc_mech;

View File

@ -156,6 +156,8 @@ struct LayoutStride {
for ( int r = 0 ; r < ARRAY_LAYOUT_MAX_RANK ; ++r ) {
tmp.dimension[r] = 0 ;
tmp.stride[r] = 0 ;
}
for ( int r = 0 ; r < rank ; ++r ) {
check_input &= ~int( 1 << order[r] );
}
if ( 0 == check_input ) {

View File

@ -297,6 +297,10 @@
#endif
#endif
#if defined( KOKKOS_ARCH_AVX512MIC )
#define KOKKOS_ENABLE_RFO_PREFETCH 1
#endif
#if defined( __MIC__ )
// Compiling for Xeon Phi
#endif
@ -344,13 +348,18 @@
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#if defined( KOKKOS_ARCH_AVX512MIC )
#define KOKKOS_ENABLE_RFO_PREFETCH 1
#endif
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#if !defined( KOKKOS_ENABLE_ASM ) && !defined( __PGIC__ ) && \
( defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ ) )
defined( __x86_64 ) || defined( __x86_64__ ) || \
defined(__PPC64__) )
#define KOKKOS_ENABLE_ASM 1
#endif
#endif

View File

@ -0,0 +1,73 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_MASTER_LOCK_HPP
#define KOKKOS_MASTER_LOCK_HPP
#include <Kokkos_Macros.hpp>
namespace Kokkos { namespace Experimental {
// may be used to coordinate work between master instances
// SHOULD NOT be used within a parallel algorithm
//
// This lock should be used with a scoped lock guard
// i.e. std::unique_lock<Lock>, std::lock_guard
//
// cannot be copied or moved
// has the following functions available
//
// Lock()
// ~Lock()
//
// void lock()
// void unlock()
// bool try_lock()
//
template <typename ExecutionSpace>
class MasterLock;
}} // namespace Kokkos::Experimental
#endif //KOKKOS_MASTER_LOCK_HPP
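A hedged usage sketch consistent with the comments above, assuming a backend provides the MasterLock specialization (e.g. OpenMP) and that <mutex> is included for std::lock_guard:
Kokkos::Experimental::MasterLock< Kokkos::OpenMP > lock ;
{
  std::lock_guard< Kokkos::Experimental::MasterLock< Kokkos::OpenMP > > guard( lock );
  /* work coordinated between master instances; never call this inside a parallel algorithm */
}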

View File

@ -66,11 +66,6 @@ private:
enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
enum : uint32_t { max_bit_count = CB::max_bit_count };
/* Defaults for min block, max block, and superblock sizes */
enum : uint32_t { MIN_BLOCK_SIZE_LG2 = 6 /* 64 bytes */ };
enum : uint32_t { MAX_BLOCK_SIZE_LG2 = 12 /* 4k bytes */ };
enum : uint32_t { SUPERBLOCK_SIZE_LG2 = 16 /* 64k bytes */ };
enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
/* Each superblock has a concurrent bitset state
@ -85,6 +80,14 @@ private:
* is concurrently updated.
*/
/* Mapping between block_size <-> block_state
*
* block_state = ( m_sb_size_lg2 - block_size_lg2 ) << state_shift
* block_size = m_sb_size_lg2 - ( block_state >> state_shift )
*
* Thus A_block_size < B_block_size <=> A_block_state > B_block_state
*/
typedef typename DeviceType::memory_space base_memory_space ;
enum { accessible =
@ -251,10 +254,10 @@ public:
* significant runtime performance improvements.
*/
MemoryPool( const base_memory_space & memspace
, const size_t min_total_alloc_size
, const uint32_t min_block_alloc_size // = 1 << MIN_BLOCK_SIZE_LG2
, const uint32_t max_block_alloc_size // = 1 << MAX_BLOCK_SIZE_LG2
, const uint32_t min_superblock_size // = 1 << SUPERBLOCK_SIZE_LG2
, const size_t min_total_alloc_size
, size_t min_block_alloc_size = 0
, size_t max_block_alloc_size = 0
, size_t min_superblock_size = 0
)
: m_tracker()
, m_sb_state_array(0)
@ -267,8 +270,43 @@ public:
, m_data_offset(0)
, m_unused_padding(0)
{
const uint32_t int_align_lg2 = 3 ; /* align as int[8] */
const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
const uint32_t int_align_lg2 = 3 ; /* align as int[8] */
const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
// Constraints and defaults:
// min_block_alloc_size <= max_block_alloc_size
// max_block_alloc_size <= min_superblock_size
// min_superblock_size <= min_total_alloc_size
const uint32_t MIN_BLOCK_SIZE = 1u << 6 /* 64 bytes */ ;
const uint32_t MAX_BLOCK_SIZE = 1u << 12 /* 4k bytes */ ;
if ( 0 == min_block_alloc_size ) min_block_alloc_size = MIN_BLOCK_SIZE ;
if ( 0 == max_block_alloc_size ) {
max_block_alloc_size = MAX_BLOCK_SIZE ;
// Upper bound of total allocation size
max_block_alloc_size = std::min( size_t(max_block_alloc_size)
, min_total_alloc_size );
// Lower bound of minimum block size
max_block_alloc_size = std::max( max_block_alloc_size
, min_block_alloc_size );
}
if ( 0 == min_superblock_size ) {
min_superblock_size = max_block_alloc_size ;
// Upper bound of total allocation size
min_superblock_size = std::min( size_t(min_superblock_size)
, min_total_alloc_size );
// Lower bound of maximum block size
min_superblock_size = std::max( min_superblock_size
, max_block_alloc_size );
}
// Block and superblock size is power of two:
@ -435,6 +473,8 @@ public:
void * allocate( size_t alloc_size
, int32_t attempt_limit = 1 ) const noexcept
{
if ( 0 == alloc_size ) return (void*) 0 ;
void * p = 0 ;
const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
@ -444,10 +484,9 @@ public:
// Allocation will fit within a superblock
// that has block sizes ( 1 << block_size_lg2 )
const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
const uint32_t block_state = block_count_lg2 << state_shift ;
const uint32_t block_count = 1u << block_count_lg2 ;
const uint32_t block_count_mask = block_count - 1 ;
const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
const uint32_t block_state = block_count_lg2 << state_shift ;
const uint32_t block_count = 1u << block_count_lg2 ;
// Superblock hints for this block size:
// hint_sb_id_ptr[0] is the dynamically changing hint
@ -465,7 +504,7 @@ public:
// the guess for which block within a superblock should
// be claimed. If not available then a search occurs.
const uint32_t block_id_hint = block_count_mask &
const uint32_t block_id_hint =
(uint32_t)( Kokkos::Impl::clock_tic()
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
// Spread out potentially concurrent access
@ -474,6 +513,9 @@ public:
#endif
);
// expected state of superblock for allocation
uint32_t sb_state = block_state ;
int32_t sb_id = -1 ;
volatile uint32_t * sb_state_array = 0 ;
@ -484,6 +526,8 @@ public:
if ( sb_id < 0 ) {
// No superblock specified, try the hint for this block size
sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
@ -493,16 +537,20 @@ public:
// 0 <= sb_id
// sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
if ( block_state == ( state_header_mask & *sb_state_array ) ) {
if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
// This superblock state is assigned to this block size.
// Try to claim a bit.
// This superblock state is as expected, for the moment.
// Attempt to claim a bit. The attempt updates the state
// so have already made sure the state header is as expected.
const uint32_t count_lg2 = sb_state >> state_shift ;
const uint32_t mask = ( 1u << count_lg2 ) - 1 ;
const Kokkos::pair<int,int> result =
CB::acquire_bounded_lg2( sb_state_array
, block_count_lg2
, block_id_hint
, block_state
, count_lg2
, block_id_hint & mask
, sb_state
);
// If result.first < 0 then failed to acquire
@ -512,16 +560,18 @@ public:
if ( 0 <= result.first ) { // acquired a bit
const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;
// Set the allocated block pointer
p = ((char*)( m_sb_state_array + m_data_offset ))
+ ( uint32_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+ ( result.first << block_size_lg2 ); // block memory
+ ( result.first << size_lg2 ); // block memory
break ; // Success
}
// printf(" acquire block_count_lg2(%d) block_state(0x%x) sb_id(%d) result(%d,%d)\n" , block_count_lg2 , block_state , sb_id , result.first , result.second );
// printf(" acquire count_lg2(%d) sb_state(0x%x) sb_id(%d) result(%d,%d)\n" , count_lg2 , sb_state , sb_id , result.first , result.second );
}
//------------------------------------------------------------------
@ -529,12 +579,18 @@ public:
// Must find a new superblock.
// Start searching at designated index for this block size.
// Look for a partially full superblock of this block size.
// Look for an empty superblock just in case cannot find partfull.
// Look for superblock that, in preferential order,
// 1) part-full superblock of this block size
// 2) empty superblock to claim for this block size
// 3) part-full superblock of the next larger block size
sb_state = block_state ; // Expect to find the desired state
sb_id = -1 ;
bool update_hint = false ;
int32_t sb_id_empty = -1 ;
int32_t sb_id_large = -1 ;
uint32_t sb_state_large = 0 ;
sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
@ -544,38 +600,54 @@ public:
// Note that the state may change at any moment
// as concurrent allocations and deallocations occur.
const uint32_t state = *sb_state_array ;
const uint32_t used = state & state_used_mask ;
const uint32_t full_state = *sb_state_array ;
const uint32_t used = full_state & state_used_mask ;
const uint32_t state = full_state & state_header_mask ;
if ( block_state == ( state & state_header_mask ) ) {
if ( state == block_state ) {
// Superblock is assigned to this block size
if ( used < block_count ) {
if ( used < block_count ) {
// There is room to allocate one block
sb_id = id ;
if ( used + 1 < block_count ) {
// Is there room to allocate more than one block?
// There is room to allocate more than one block
Kokkos::atomic_compare_exchange
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
update_hint = used + 1 < block_count ;
break ;
}
}
else if ( ( used == 0 ) && ( sb_id_empty == -1 ) ) {
else if ( 0 == used ) {
// Superblock is not assigned to this block size
// and is the first empty superblock encountered.
// Save this id to use if a partfull superblock is not found.
// Superblock is empty
sb_id_empty = id ;
if ( -1 == sb_id_empty ) {
// Superblock is not assigned to this block size
// and is the first empty superblock encountered.
// Save this id to use if a partfull superblock is not found.
sb_id_empty = id ;
}
}
else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
( -1 == sb_id_large /* have not found a larger */ ) &&
( state < block_state /* a larger block */ ) &&
// is not full:
( used < ( 1u << ( state >> state_shift ) ) ) ) {
// First superblock encountered that is
// larger than this block size and
// has room for an allocation.
// Save this id to use if a part-full or empty superblock is not found
sb_id_large = id ;
sb_state_large = state ;
}
// Iterate around the superblock array:
if ( ++id < m_sb_count ) {
sb_state_array += m_sb_state_size ;
@ -586,7 +658,7 @@ public:
}
}
// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d)\n" , m_sb_count , sb_id , sb_id_empty );
// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
if ( sb_id < 0 ) {
@ -609,21 +681,31 @@ public:
const uint32_t state_empty = state_header_mask & *sb_state_array ;
if ( state_empty ==
Kokkos::atomic_compare_exchange
(sb_state_array,state_empty,block_state) ) {
// If this thread claims the empty block then update the hint
update_hint =
state_empty ==
Kokkos::atomic_compare_exchange
(sb_state_array,state_empty,block_state);
}
else if ( 0 <= sb_id_large ) {
// If this thread claimed the block then update the hint
// Found a larger superblock with space available
Kokkos::atomic_compare_exchange
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
sb_id = sb_id_large ;
sb_state = sb_state_large ;
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
}
else {
// Did not find a potentially usable superblock
--attempt_limit ;
}
}
if ( update_hint ) {
Kokkos::atomic_compare_exchange
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
} // end allocation attempt loop
//--------------------------------------------------------------------
@ -646,6 +728,8 @@ public:
KOKKOS_INLINE_FUNCTION
void deallocate( void * p , size_t /* alloc_size */ ) const noexcept
{
if ( 0 == p ) return ;
// Determine which superblock and block
const ptrdiff_t d =
((char*)p) - ((char*)( m_sb_state_array + m_data_offset ));
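// Sketch of how the superblock and block indices can be recovered from the
// byte offset d (an assumption about the surrounding implementation, shown
// here only for clarity; the state-word names mirror the allocate path above):
//
//   const int      sb_id    = d >> m_sb_size_lg2 ;                               // which superblock
//   const uint32_t sb_state = m_sb_state_array[ sb_id * m_sb_state_size ] ;
//   const uint32_t size_lg2 = m_sb_size_lg2 - ( sb_state >> state_shift ) ;      // block size of that superblock
//   const int      bit      = ( d & ( ( ptrdiff_t(1) << m_sb_size_lg2 ) - 1 ) ) >> size_lg2 ;  // which block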

View File

@ -72,11 +72,11 @@ struct MemoryTraits {
//! Tag this class as a kokkos memory traits:
typedef MemoryTraits memory_traits ;
enum { Unmanaged = T & unsigned(Kokkos::Unmanaged) };
enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) };
enum { Atomic = T & unsigned(Kokkos::Atomic) };
enum { Restrict = T & unsigned(Kokkos::Restrict) };
enum { Aligned = T & unsigned(Kokkos::Aligned) };
enum : bool { Unmanaged = (unsigned(0) != (T & unsigned(Kokkos::Unmanaged))) };
enum : bool { RandomAccess = (unsigned(0) != (T & unsigned(Kokkos::RandomAccess))) };
enum : bool { Atomic = (unsigned(0) != (T & unsigned(Kokkos::Atomic))) };
enum : bool { Restrict = (unsigned(0) != (T & unsigned(Kokkos::Restrict))) };
enum : bool { Aligned = (unsigned(0) != (T & unsigned(Kokkos::Aligned))) };
};
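// Illustrative usage sketch (the view is hypothetical): these boolean enums are
// typically supplied to, and queried from, a View's memory traits, e.g.
//
//   Kokkos::View< const double * , Kokkos::MemoryTraits< Kokkos::RandomAccess > > x ;
//   static_assert( decltype(x)::traits::memory_traits::RandomAccess , "random-access trait set" );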
@ -109,7 +109,11 @@ enum { MEMORY_ALIGNMENT =
#else
( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
#endif
, MEMORY_ALIGNMENT_THRESHOLD = 4
#if defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
, MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
#else
, MEMORY_ALIGNMENT_THRESHOLD = 4
#endif
};
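// Illustrative configuration sketch (the value 8 is hypothetical): the threshold
// can now be overridden at configure/compile time, e.g.
//
//   #define KOKKOS_MEMORY_ALIGNMENT_THRESHOLD 8
//
// before the Kokkos headers are processed; otherwise the default of 4 is used.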

View File

@ -47,10 +47,6 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_OPENMP)
#if !defined(_OPENMP)
#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
#endif
#include <Kokkos_Core_fwd.hpp>
#include <cstddef>
@ -67,95 +63,144 @@
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <vector>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class OpenMPExec;
}
/// \class OpenMP
/// \brief Kokkos device for multicore processors in the host memory space.
class OpenMP {
public:
//------------------------------------
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as a kokkos execution space
using execution_space = OpenMP;
using memory_space =
#ifdef KOKKOS_ENABLE_HBWSPACE
using memory_space = Experimental::HBWSpace;
Experimental::HBWSpace;
#else
using memory_space = HostSpace;
HostSpace;
#endif
//! This execution space preferred device_type
using device_type = Kokkos::Device<execution_space,memory_space>;
using array_layout = LayoutRight;
using size_type = memory_space::size_type;
using device_type = Kokkos::Device< execution_space, memory_space >;
using array_layout = LayoutRight;
using size_type = memory_space::size_type;
using scratch_memory_space = ScratchMemorySpace< OpenMP >;
//@}
//------------------------------------
//! \name Functions that all Kokkos execution spaces must implement.
//@{
/// \brief Get a handle to the default execution space instance
inline
OpenMP() noexcept;
inline static bool in_parallel();
// Using omp_get_max_threads() is problematic.
// On Intel, an initial call to the OpenMP runtime without a preceding parallel
// region sets a process mask for a single core. On entering the first parallel
// region the runtime then binds threads to other cores and makes the process
// mask the aggregate of the thread masks. The intent seems to be to make serial
// code run fast if you compile with OpenMP enabled but never actually use
// parallel regions.
// static int omp_max_threads = omp_get_max_threads();
static int get_current_max_threads() noexcept;
/** \brief Set the device in a "sleep" state. A noop for OpenMP. */
static bool sleep();
/// \brief Initialize the default execution space
///
/// if ( thread_count == -1 )
///   then use the number of threads that OpenMP defaults to
/// if ( thread_count == 0 && Kokkos::hwloc::available() )
///   then use hwloc to choose the number of threads and change
///   the default number of threads
/// if ( thread_count > 0 )
///   then force OpenMP to use the given number of threads and change
///   the default number of threads
static void initialize( int thread_count = -1 );
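/// Illustrative call sequence (sketch only):
///   Kokkos::OpenMP::initialize();   // thread_count == -1: use the OpenMP default
///   // ... launch parallel kernels ...
///   Kokkos::OpenMP::finalize();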
/** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */
static bool wake();
/** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
static void fence() {}
/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
/// \brief Free any resources being consumed by the device.
/// \brief Free any resources being consumed by the default execution space
static void finalize();
/** \brief Initialize the device.
*
* 1) If the hardware locality library is enabled and OpenMP has not
* already bound threads then bind OpenMP threads to maximize
* core utilization and group for memory hierarchy locality.
*
* 2) Allocate a HostThread for each OpenMP thread to hold its
* topology and fan in/out data.
*/
static void initialize( unsigned thread_count = 0 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 );
/// \brief is the default execution space initialized for the current 'master' thread
static bool is_initialized() noexcept;
static int is_initialized();
/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool verbose = false );
/** \brief Return the maximum amount of concurrency. */
static int concurrency();
/// \brief is the instance running a parallel algorithm
inline
static bool in_parallel( OpenMP const& = OpenMP() ) noexcept;
//@}
//------------------------------------
/** \brief This execution space has a topological thread pool which can be queried.
*
* All threads within a pool have a common memory space for which they are cache coherent.
* depth = 0 gives the number of threads in the whole pool.
* depth = 1 gives the number of threads in a NUMA region, typically sharing L3 cache.
* depth = 2 gives the number of threads at the finest granularity, typically sharing L1 cache.
*/
inline static int thread_pool_size( int depth = 0 );
/// \brief Wait until all dispatched functors complete on the given instance
///
/// This is a no-op on OpenMP
inline
static void fence( OpenMP const& = OpenMP() ) noexcept;
/// \brief Does the given instance return immediately after launching
/// a parallel algorithm
///
/// This always returns false on OpenMP
inline
static bool is_asynchronous( OpenMP const& = OpenMP() ) noexcept;
/// \brief Partition the default instance into new instances without creating
/// new masters
///
/// This is a no-op on OpenMP since the default instance cannot be partitioned
/// without promoting other threads to 'master'
static std::vector<OpenMP> partition(...);
/// Non-default instances should be ref-counted so that when the last
/// is destroyed, the instance's resources are released
///
/// This is a no-op on OpenMP since a non-default instance cannot be created
static OpenMP create_instance(...);
/// \brief Partition the default instance and call 'f' on each new 'master' thread
///
/// Func is a functor with the following signature
/// void( int partition_id, int num_partitions )
template <typename F>
static void partition_master( F const& f
, int requested_num_partitions = 0
, int requested_partition_size = 0
);
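/// Illustrative sketch (the partition counts here are arbitrary):
///   Kokkos::OpenMP::partition_master(
///     []( int partition_id , int num_partitions ) {
///       // each partition's 'master' thread executes this body
///     }
///     , 2 /* requested_num_partitions */
///     , 0 /* requested_partition_size */ );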
inline
static int thread_pool_size() noexcept;
/** \brief The rank of the executing thread in this thread pool */
KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
KOKKOS_INLINE_FUNCTION
static int thread_pool_rank() noexcept;
//------------------------------------
#if !defined( KOKKOS_DISABLE_DEPRECATED )
/// \brief Initialize the default execution space
static void initialize( int thread_count,
int use_numa_count,
int use_cores_per_numa = 0);
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
inline
static int thread_pool_size( int depth );
KOKKOS_INLINE_FUNCTION static
unsigned hardware_thread_id() { return thread_pool_rank(); }
static void sleep() {};
static void wake() {};
static const char* name();
// use UniqueToken
static int concurrency();
// use UniqueToken
inline
static int max_hardware_threads() noexcept;
// use UniqueToken
KOKKOS_INLINE_FUNCTION
static int hardware_thread_id() noexcept;
#endif
static constexpr const char* name() noexcept { return "OpenMP"; }
};
} // namespace Kokkos
@ -195,6 +240,7 @@ struct VerifyExecutionCanAccessMemorySpace
/*--------------------------------------------------------------------------*/
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
#include <OpenMP/Kokkos_OpenMP_Team.hpp>
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMP_Task.hpp>

View File

@ -177,22 +177,23 @@ void parallel_for( const ExecPolicy & policy
)
{
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecPolicy::work_tag> name(str);
Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Kokkos::Impl::shared_allocation_tracking_disable();
Impl::ParallelFor< FunctorType , ExecPolicy > closure( functor , policy );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
Kokkos::Impl::shared_allocation_tracking_enable();
closure.execute();
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelFor(kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelFor(kpID);
}
#endif
}
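// Illustrative sketch of how a user-supplied label reaches the profiling hooks
// (kernel name and bounds are hypothetical):
//
//   Kokkos::parallel_for( "MyKernel" , Kokkos::RangePolicy<>( 0 , N ) ,
//                         KOKKOS_LAMBDA( const int i ) { /* ... */ } );
//
// With a profiling library loaded, "MyKernel" is forwarded through
// ParallelConstructName::get() to beginParallelFor(); an empty label is
// presumed to fall back to a functor-derived name, as in the previous code.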
@ -210,14 +211,15 @@ void parallel_for( const size_t work_count
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str);
Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Kokkos::Impl::shared_allocation_tracking_disable();
Impl::ParallelFor< FunctorType , policy > closure( functor , policy(0,work_count) );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
Kokkos::Impl::shared_allocation_tracking_enable();
closure.execute();
@ -420,21 +422,22 @@ void parallel_scan( const ExecutionPolicy & policy
{
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecutionPolicy::work_tag> name(str);
Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Kokkos::Impl::shared_allocation_tracking_disable();
Impl::ParallelScan< FunctorType , ExecutionPolicy > closure( functor , policy );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
Kokkos::Impl::shared_allocation_tracking_enable();
closure.execute();
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
#endif
}
@ -453,21 +456,22 @@ void parallel_scan( const size_t work_count
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str);
Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Kokkos::Impl::shared_allocation_tracking_disable();
Impl::ParallelScan< FunctorType , policy > closure( functor , policy(0,work_count) );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
Kokkos::Impl::shared_allocation_tracking_enable();
closure.execute();
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
#endif
}

View File

@ -872,13 +872,14 @@ namespace Impl {
const FunctorType& functor,
ReturnType& return_value) {
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID);
}
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Impl::ParallelConstructName<FunctorType, typename PolicyType::work_tag> name(label);
Kokkos::Profiling::beginParallelReduce(name.get(), 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Kokkos::Impl::shared_allocation_tracking_disable();
#ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
Impl::ParallelReduce<typename functor_adaptor::functor_type, PolicyType, typename return_value_adapter::reducer_type >
closure(functor_adaptor::functor(functor),
@ -890,13 +891,13 @@ namespace Impl {
policy,
return_value_adapter::return_value(return_value,functor));
#endif
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
Kokkos::Impl::shared_allocation_tracking_enable();
closure.execute();
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelReduce(kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelReduce(kpID);
}
#endif
}

View File

@ -66,6 +66,7 @@
#include <KokkosExp_MDRangePolicy.hpp>
#include <Kokkos_UniqueToken.hpp>
namespace Kokkos {
@ -526,6 +527,7 @@ public:
}
};
/*--------------------------------------------------------------------------*/
template< class FunctorType , class ... Traits >
@ -604,6 +606,178 @@ public:
{}
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Parallel patterns for Kokkos::Serial with MDRangePolicy */
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::MDRangePolicy< Traits ... > ,
Kokkos::Serial
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
const Policy m_policy ;
void
exec() const
{
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
iterate_type( m_mdr_policy, m_functor )( i );
}
}
public:
inline
void execute() const
{ this->exec(); }
inline
ParallelFor( const FunctorType & arg_functor
, const MDRangePolicy & arg_policy )
: m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
{}
};
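// Illustrative usage sketch (extents and lambda body are hypothetical): this
// specialization is selected when an MDRangePolicy is dispatched on the Serial
// backend through the usual parallel_for path, e.g.
//
//   using policy2d = Kokkos::Experimental::MDRangePolicy< Kokkos::Serial
//                                                       , Kokkos::Experimental::Rank<2> >;
//   Kokkos::parallel_for( policy2d( {{0,0}} , {{N,M}} ) ,
//                         KOKKOS_LAMBDA( const int i , const int j ) { /* ... */ } );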
template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Serial
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef typename ReducerTypeFwd::value_type ValueType;
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType
>;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
inline
void
exec( reference_type update ) const
{
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
iterate_type( m_mdr_policy, m_functor, update )( i );
}
}
public:
inline
void execute() const
{
const size_t pool_reduce_size =
Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) );
const size_t team_reduce_size = 0 ; // Never shrinks
const size_t team_shared_size = 0 ; // Never shrinks
const size_t thread_local_size = 0 ; // Never shrinks
serial_resize_thread_team_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
HostThreadTeamData & data = *serial_get_thread_team_data();
pointer_type ptr =
m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
reference_type update =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
this-> exec( update );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor ,
const MDRangePolicy & arg_policy ,
const HostViewType & arg_result_view ,
typename std::enable_if<
Kokkos::is_view< HostViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.data() )
{
static_assert( Kokkos::is_view< HostViewType >::value
, "Kokkos::Serial reduce result must be a View" );
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Serial reduce result must be a View in HostSpace" );
}
inline
ParallelReduce( const FunctorType & arg_functor
, MDRangePolicy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
} // namespace Impl
} // namespace Kokkos
@ -819,6 +993,60 @@ public:
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos { namespace Experimental {
template<>
class UniqueToken< Serial, UniqueTokenScope::Instance>
{
public:
using execution_space = Serial;
using size_type = int;
/// \brief create an object sized for the available concurrency of the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() ) noexcept {}
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
inline
int size() const noexcept { return 1; }
/// \brief acquire value such that 0 <= value < size()
inline
int acquire() const noexcept { return 0; }
/// \brief release a value acquired by acquire()
inline
void release( int ) const noexcept {}
};
template<>
class UniqueToken< Serial, UniqueTokenScope::Global>
{
public:
using execution_space = Serial;
using size_type = int;
/// \brief create an object sized for the available concurrency of the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() ) noexcept {}
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
inline
int size() const noexcept { return 1; }
/// \brief acquire value such that 0 <= value < size()
inline
int acquire() const noexcept { return 0; }
/// \brief release a value acquired by acquire()
inline
void release( int ) const noexcept {}
};
}} // namespace Kokkos::Experimental
#include <impl/Kokkos_Serial_Task.hpp>
#endif // defined( KOKKOS_ENABLE_SERIAL )

View File

@ -148,7 +148,7 @@ private:
typename std::conditional< Arg2_is_space , Arg2 , void
>::type >::type ;
using task_base = Impl::TaskBase< Space , ValueType , void > ;
using task_base = Impl::TaskBase< void , void , void > ;
using queue_type = Impl::TaskQueue< Space > ;
task_base * m_task ;
@ -293,13 +293,17 @@ public:
//----------------------------------------
KOKKOS_INLINE_FUNCTION
typename task_base::get_return_type
int is_ready() const noexcept
{ return ( 0 == m_task ) || ( ((task_base*) task_base::LockTag) == m_task->m_wait ); }
KOKKOS_INLINE_FUNCTION
const typename Impl::TaskResult< ValueType >::reference_type
get() const
{
if ( 0 == m_task ) {
Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()");
}
return m_task->get();
return Impl::TaskResult< ValueType >::get( m_task );
}
};
@ -396,7 +400,7 @@ private:
using track_type = Kokkos::Impl::SharedAllocationTracker ;
using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ;
using task_base = Impl::TaskBase< ExecSpace , void , void > ;
using task_base = Impl::TaskBase< void , void , void > ;
track_type m_track ;
queue_type * m_queue ;
@ -464,29 +468,19 @@ public:
KOKKOS_INLINE_FUNCTION
memory_pool * memory() const noexcept
{ return m_queue ? m_queue->m_memory : (memory_pool*) 0 ; }
{ return m_queue ? &( m_queue->m_memory ) : (memory_pool*) 0 ; }
//----------------------------------------
/**\brief Allocation size for a spawned task */
template< typename FunctorType >
KOKKOS_FUNCTION
size_t spawn_allocation_size() const
{
using task_type = Impl::TaskBase< execution_space
, typename FunctorType::value_type
, FunctorType > ;
return m_queue->allocate_block_size( sizeof(task_type) );
}
{ return m_queue->template spawn_allocation_size< FunctorType >(); }
/**\brief Allocation size for a when_all aggregate */
KOKKOS_FUNCTION
size_t when_all_allocation_size( int narg ) const
{
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) );
}
{ return m_queue->when_all_allocation_size( narg ); }
//----------------------------------------
@ -507,7 +501,7 @@ public:
queue_type * const queue =
arg_policy.m_scheduler ? arg_policy.m_scheduler->m_queue : (
arg_policy.m_dependence.m_task
? arg_policy.m_dependence.m_task->m_queue
? static_cast<queue_type*>(arg_policy.m_dependence.m_task->m_queue)
: (queue_type*) 0 );
if ( 0 == queue ) {
@ -530,8 +524,12 @@ public:
future_type f ;
// Allocate task from memory pool
const size_t alloc_size =
queue->template spawn_allocation_size< FunctorType >();
f.m_task =
reinterpret_cast< task_type * >(queue->allocate(sizeof(task_type)));
reinterpret_cast< task_type * >(queue->allocate(alloc_size) );
if ( f.m_task ) {
@ -539,15 +537,17 @@ public:
// Reference count starts at two:
// +1 for the matching decrement when task is complete
// +1 for the future
new ( f.m_task )
task_type( arg_function
, queue
, arg_policy.m_dependence.m_task /* dependence */
, 2 /* reference count */
, int(sizeof(task_type)) /* allocation size */
, int(arg_policy.m_task_type)
, int(arg_policy.m_priority)
, std::move(arg_functor) );
new ( f.m_task ) task_type( std::move(arg_functor) );
f.m_task->m_apply = arg_function ;
f.m_task->m_queue = queue ;
f.m_task->m_next = arg_policy.m_dependence.m_task ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = alloc_size ;
f.m_task->m_task_type = arg_policy.m_task_type ;
f.m_task->m_priority = arg_policy.m_priority ;
Kokkos::memory_fence();
// The dependence (if any) is processed immediately
// within the schedule function, as such the dependence's
@ -586,6 +586,30 @@ public:
// Postcondition: task is in Executing-Respawn state
}
template< typename FunctorType >
KOKKOS_FUNCTION static
void
respawn( FunctorType * arg_self
, TaskScheduler const &
, TaskPriority const & arg_priority
)
{
// Precondition: task is in Executing state
using value_type = typename FunctorType::value_type ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
task_type * const task = static_cast< task_type * >( arg_self );
task->m_priority = static_cast<int>(arg_priority);
task->add_dependence( (task_base*) 0 );
// Postcondition: task is in Executing-Respawn state
}
//----------------------------------------
/**\brief Return a future that is complete
* when all input futures are complete.
@ -596,7 +620,7 @@ public:
when_all( Future< A1 , A2 > const arg[] , int narg )
{
using future_type = Future< execution_space > ;
using task_base = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using task_base = Kokkos::Impl::TaskBase< void , void , void > ;
future_type f ;
@ -610,9 +634,9 @@ public:
// Increment reference count to track subsequent assignment.
Kokkos::atomic_increment( &(t->m_ref_count) );
if ( queue == 0 ) {
queue = t->m_queue ;
queue = static_cast< queue_type * >( t->m_queue );
}
else if ( queue != t->m_queue ) {
else if ( queue != static_cast< queue_type * >( t->m_queue ) ) {
Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
}
}
@ -620,28 +644,34 @@ public:
if ( queue != 0 ) {
size_t const size = sizeof(task_base) + narg * sizeof(task_base*);
size_t const alloc_size = queue->when_all_allocation_size( narg );
f.m_task =
reinterpret_cast< task_base * >( queue->allocate( size ) );
reinterpret_cast< task_base * >( queue->allocate( alloc_size ) );
if ( f.m_task ) {
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
new( f.m_task ) task_base( queue
, 2 /* reference count */
, size /* allocation size */
, narg /* dependence count */
);
new( f.m_task ) task_base();
f.m_task->m_queue = queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = alloc_size ;
f.m_task->m_dep_count = narg ;
f.m_task->m_task_type = task_base::Aggregate ;
// Assign dependences, reference counts were already incremented
task_base ** const dep = f.m_task->aggregate_dependences();
task_base * volatile * const dep =
f.m_task->aggregate_dependences();
for ( int i = 0 ; i < narg ; ++i ) { dep[i] = arg[i].m_task ; }
Kokkos::memory_fence();
queue->schedule_aggregate( f.m_task );
// this when_all may be processed at any moment
}

View File

@ -0,0 +1,88 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_UNIQUE_TOKEN_HPP
#define KOKKOS_UNIQUE_TOKEN_HPP
#include <Kokkos_Macros.hpp>
namespace Kokkos { namespace Experimental {
enum class UniqueTokenScope : int
{
Instance,
Global
};
/// \brief class to generate unique ids based on the required amount of concurrency
///
/// This object should behave like a ref-counted object, so that when the last
/// instance is destroyed, resources are freed if needed
template <typename ExecutionSpace, UniqueTokenScope = UniqueTokenScope::Instance >
class UniqueToken
{
public:
using execution_space = ExecutionSpace;
using size_type = typename execution_space::size_type;
/// \brief create an object sized for the available concurrency of the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() );
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
KOKKOS_INLINE_FUNCTION
size_type size() const ;
/// \brief acquire value such that 0 <= value < size()
KOKKOS_INLINE_FUNCTION
size_type acquire() const ;
/// \brief release a value acquired by acquire()
KOKKOS_INLINE_FUNCTION
void release( size_type ) const ;
};
}} // namespace Kokkos::Experimental
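// Illustrative usage sketch (the execution space, extent, and lambda body are
// hypothetical): acquire a per-thread id inside a kernel and release it before
// returning, so at most size() ids are in flight at once.
//
//   Kokkos::Experimental::UniqueToken< ExecSpace > token ;
//   Kokkos::parallel_for( N , KOKKOS_LAMBDA( const int i ) {
//     const int id = token.acquire();   // 0 <= id < token.size()
//     /* ... use id to index per-thread scratch ... */
//     token.release( id );
//   } );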
#endif //KOKKOS_UNIQUE_TOKEN_HPP

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -54,11 +54,14 @@
#include <Kokkos_MemoryTraits.hpp>
#include <Kokkos_ExecPolicy.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template< class DataType >
@ -73,16 +76,6 @@ struct ViewDataAnalysis ;
template< class , class ... >
class ViewMapping { public: enum { is_assignable = false }; };
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
namespace Kokkos {
namespace Impl {
using Kokkos::Experimental::Impl::ViewMapping ;
using Kokkos::Experimental::Impl::ViewDataAnalysis ;
} /* namespace Impl */
} /* namespace Kokkos */
@ -1563,12 +1556,12 @@ namespace Kokkos {
namespace Impl {
inline
void shared_allocation_tracking_claim_and_disable()
{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_claim_and_disable(); }
void shared_allocation_tracking_disable()
{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_disable(); }
inline
void shared_allocation_tracking_release_and_enable()
{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_release_and_enable(); }
void shared_allocation_tracking_enable()
{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_enable(); }
} /* namespace Impl */
} /* namespace Kokkos */
@ -1795,6 +1788,20 @@ void deep_copy
if ( (void *) dst.data() != (void*) src.data() ) {
#if defined(KOKKOS_ENABLE_PROFILING)
if (Kokkos::Profiling::profileLibraryLoaded()) {
const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
Kokkos::Profiling::beginDeepCopy(
Kokkos::Profiling::SpaceHandle(dst_memory_space::name()),
dst.label(),
dst.data(),
Kokkos::Profiling::SpaceHandle(src_memory_space::name()),
src.label(),
src.data(),
nbytes);
}
#endif
// Concern: If overlapping views then a parallel copy will be erroneous.
// ...
@ -1882,7 +1889,14 @@ void deep_copy
else {
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
}
}
#if defined(KOKKOS_ENABLE_PROFILING)
if (Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endDeepCopy();
}
#endif
} // ( (void *) dst.data() != (void*) src.data() )
}
} /* namespace Kokkos */
@ -2249,6 +2263,82 @@ resize( Kokkos::View<T,P...> & v ,
static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
// Fix #904 by checking dimensions before actually resizing.
//
// Rank is known at compile time, so hopefully the compiler will
// remove branches that are compile-time false. The upcoming "if
// constexpr" language feature would make this certain.
if (view_type::Rank == 1 &&
n0 == static_cast<size_t> (v.extent(0))) {
return;
}
if (view_type::Rank == 2 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1))) {
return;
}
if (view_type::Rank == 3 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1)) &&
n2 == static_cast<size_t> (v.extent(2))) {
return;
}
if (view_type::Rank == 4 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1)) &&
n2 == static_cast<size_t> (v.extent(2)) &&
n3 == static_cast<size_t> (v.extent(3))) {
return;
}
if (view_type::Rank == 5 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1)) &&
n2 == static_cast<size_t> (v.extent(2)) &&
n3 == static_cast<size_t> (v.extent(3)) &&
n4 == static_cast<size_t> (v.extent(4))) {
return;
}
if (view_type::Rank == 6 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1)) &&
n2 == static_cast<size_t> (v.extent(2)) &&
n3 == static_cast<size_t> (v.extent(3)) &&
n4 == static_cast<size_t> (v.extent(4)) &&
n5 == static_cast<size_t> (v.extent(5))) {
return;
}
if (view_type::Rank == 7 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1)) &&
n2 == static_cast<size_t> (v.extent(2)) &&
n3 == static_cast<size_t> (v.extent(3)) &&
n4 == static_cast<size_t> (v.extent(4)) &&
n5 == static_cast<size_t> (v.extent(5)) &&
n6 == static_cast<size_t> (v.extent(6))) {
return;
}
if (view_type::Rank == 8 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1)) &&
n2 == static_cast<size_t> (v.extent(2)) &&
n3 == static_cast<size_t> (v.extent(3)) &&
n4 == static_cast<size_t> (v.extent(4)) &&
n5 == static_cast<size_t> (v.extent(5)) &&
n6 == static_cast<size_t> (v.extent(6)) &&
n7 == static_cast<size_t> (v.extent(7))) {
return;
}
// If Kokkos ever supports Views of rank > 8, the above code won't
// be incorrect, because avoiding reallocation in resize() is just
// an optimization.
// TODO (mfh 27 Jun 2017) If the old View has enough space but just
// different dimensions (e.g., if the product of the dimensions,
// including extra space for alignment, will not change), then
// consider just reusing storage. For now, Kokkos always
// reallocates if any of the dimensions change, even if the old View
// has enough space.
view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 );
Kokkos::Impl::ViewRemap< view_type , view_type >( v_resized , v );
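// Illustrative sketch of the new early-return behavior (extents are hypothetical):
//
//   Kokkos::View<double**> a( "A" , 100 , 50 );
//   Kokkos::resize( a , 100 , 50 );   // extents unchanged: returns without reallocating (Fix #904)
//   Kokkos::resize( a , 200 , 50 );   // extents differ: allocates and remaps as before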
@ -2317,6 +2407,106 @@ void realloc( Kokkos::View<T,P...> & v ,
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos { namespace Impl {
template < class Specialize, typename A, typename B >
struct CommonViewValueType;
template < typename A, typename B >
struct CommonViewValueType< void, A, B >
{
using value_type = typename std::common_type< A , B >::type;
};
template < class Specialize, class ValueType >
struct CommonViewAllocProp;
template < class ValueType >
struct CommonViewAllocProp< void, ValueType >
{
using value_type = ValueType;
template < class ... Views >
CommonViewAllocProp( const Views & ... ) {}
};
template < class ... Views >
struct DeduceCommonViewAllocProp;
// Base case must provide types for:
// 1. specialize 2. value_type 3. is_view 4. prop_type
template < class FirstView >
struct DeduceCommonViewAllocProp< FirstView >
{
using specialize = typename FirstView::traits::specialize;
using value_type = typename FirstView::traits::value_type;
enum : bool { is_view = is_view< FirstView >::value };
using prop_type = CommonViewAllocProp< specialize, value_type >;
};
template < class FirstView, class ... NextViews >
struct DeduceCommonViewAllocProp< FirstView, NextViews... >
{
using NextTraits = DeduceCommonViewAllocProp< NextViews... >;
using first_specialize = typename FirstView::traits::specialize;
using first_value_type = typename FirstView::traits::value_type;
enum : bool { first_is_view = is_view< FirstView >::value };
using next_specialize = typename NextTraits::specialize;
using next_value_type = typename NextTraits::value_type;
enum : bool { next_is_view = NextTraits::is_view };
// common types
// determine specialize type
// if first and next specialize differ, but are not the same specialize, error out
static_assert( !(!std::is_same< first_specialize, next_specialize >::value && !std::is_same< first_specialize, void>::value && !std::is_same< void, next_specialize >::value) , "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void specialize trait allowed" );
// otherwise choose non-void specialize if either/both are non-void
using specialize = typename std::conditional< std::is_same< first_specialize, next_specialize >::value
, first_specialize
, typename std::conditional< ( std::is_same< first_specialize, void >::value
&& !std::is_same< next_specialize, void >::value)
, next_specialize
, first_specialize
>::type
>::type;
using value_type = typename CommonViewValueType< specialize, first_value_type, next_value_type >::value_type;
enum : bool { is_view = (first_is_view && next_is_view) };
using prop_type = CommonViewAllocProp< specialize, value_type >;
};
} // end namespace Impl
template < class ... Views >
using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp<Views...>::prop_type ;
// User function
template < class ... Views >
DeducedCommonPropsType<Views...>
common_view_alloc_prop( Views const & ... views )
{
return DeducedCommonPropsType<Views...>( views... );
}
} // namespace Kokkos
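// Illustrative usage sketch (the views are hypothetical): deduce a common
// allocation property, and thereby a common value_type, from several views.
//
//   Kokkos::View<float*>  x( "x" , n );
//   Kokkos::View<double*> y( "y" , n );
//   auto prop = Kokkos::common_view_alloc_prop( x , y );
//   using common_value_type = typename decltype(prop)::value_type ;  // double, via std::common_type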
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// For backward compatibility:
@ -2350,6 +2540,9 @@ using Kokkos::Impl::WithoutInitializing_t ;
using Kokkos::Impl::AllowPadding_t ;
using Kokkos::Impl::SharedAllocationRecord ;
using Kokkos::Impl::SharedAllocationTracker ;
using Kokkos::Impl::ViewMapping ;
using Kokkos::Impl::ViewDataAnalysis ;
} /* namespace Impl */
} /* namespace Experimental */

View File

@ -0,0 +1,265 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_WORKGRAPHPOLICY_HPP
#define KOKKOS_WORKGRAPHPOLICY_HPP
namespace Kokkos {
namespace Impl {
namespace Experimental {
template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec;
}}} // namespace Kokkos::Impl::Experimental
namespace Kokkos {
namespace Experimental {
template< class ... Properties >
class WorkGraphPolicy
{
public:
using self_type = WorkGraphPolicy<Properties ... >;
using traits = Kokkos::Impl::PolicyTraits<Properties ... >;
using index_type = typename traits::index_type;
using execution_space = typename traits::execution_space;
using work_tag = typename traits::work_tag;
using memory_space = typename execution_space::memory_space;
using graph_type = Kokkos::Experimental::Crs<index_type, execution_space, void, index_type>;
using member_type = index_type;
private:
graph_type m_graph;
using ints_type = Kokkos::View<std::int32_t*, memory_space>;
using range_type = Kokkos::pair<std::int32_t, std::int32_t>;
using ranges_type = Kokkos::View<range_type*, memory_space>;
const std::int32_t m_total_work;
ints_type m_counts;
ints_type m_queue;
ranges_type m_ranges;
public:
struct TagZeroRanges {};
KOKKOS_INLINE_FUNCTION
void operator()(TagZeroRanges, std::int32_t i) const {
m_ranges[i] = range_type(0, 0);
}
void zero_ranges() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagZeroRanges>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, 1));
closure.execute();
execution_space::fence();
}
struct TagFillQueue {};
KOKKOS_INLINE_FUNCTION
void operator()(TagFillQueue, std::int32_t i) const {
if (*((volatile std::int32_t*)(&m_counts(i))) == 0) push_work(i);
}
void fill_queue() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagFillQueue>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, m_total_work));
closure.execute();
execution_space::fence();
}
private:
inline
void setup() {
if (m_graph.numRows() > std::numeric_limits<std::int32_t>::max()) {
Kokkos::abort("WorkGraphPolicy work must be indexable using int32_t");
}
get_crs_transpose_counts(m_counts, m_graph);
m_queue = ints_type(ViewAllocateWithoutInitializing("queue"), m_total_work);
deep_copy(m_queue, std::int32_t(-1));
m_ranges = ranges_type("ranges", 1);
fill_queue();
}
KOKKOS_INLINE_FUNCTION
std::int32_t pop_work() const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first + 1 , w.second );
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
if ( w.first < w.second ) { // there was work in the queue
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
// we got a work item
std::int32_t i;
// the push_work function may have incremented the end counter
// but not yet written the work index into the queue.
// wait until the entry is valid.
while ( -1 == ( i = *((volatile std::int32_t*)(&m_queue( w.first ))) ) );
return i;
} // we got a work item
} else { // there was no work in the queue
#ifdef KOKKOS_DEBUG
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
Kokkos::abort("bug in pop_work");
}
#endif
if (w.first == m_total_work) { // all work is done
return -1;
} else { // need to wait for more work to be pushed
// take a guess that one work item will be pushed
// the key thing is we can't leave (w) alone, because
// otherwise the next compare_exchange may succeed in
// popping work from an empty queue
w.second++;
}
} // there was no work in the queue
} // while (true)
}
KOKKOS_INLINE_FUNCTION
void push_work(std::int32_t i) const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first , w.second + 1 );
// try to increment the end counter
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
// stop trying if the increment was successful
if ( w.first == w_new.first && w.second + 1 == w_new.second ) break;
}
// write the work index into the claimed spot in the queue
*((volatile std::int32_t*)(&m_queue( w.second ))) = i;
// push this write out into the memory system
memory_fence();
}
template< class functor_type , class execution_space, class ... policy_args >
friend class Kokkos::Impl::Experimental::WorkGraphExec;
public:
WorkGraphPolicy(graph_type arg_graph)
: m_graph(arg_graph)
, m_total_work( arg_graph.numRows() )
{
setup();
}
};
}} // namespace Kokkos::Experimental
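// Illustrative usage sketch (graph construction omitted, names hypothetical):
// dispatch work items so that each runs only after all of its predecessors
// have completed. 'graph' is assumed to be a Kokkos::Experimental::Crs in which
// row i lists the work items that depend on item i.
//
//   Kokkos::Experimental::WorkGraphPolicy< ExecSpace > policy( graph );
//   Kokkos::parallel_for( policy , KOKKOS_LAMBDA( const std::int32_t w ) {
//     /* ... process work item w ... */
//   } );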
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace Experimental {
template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec
{
public:
using self_type = WorkGraphExec< functor_type, execution_space, policy_args ... >;
using policy_type = Kokkos::Experimental::WorkGraphPolicy< policy_args ... >;
using member_type = typename policy_type::member_type;
using memory_space = typename execution_space::memory_space;
protected:
const functor_type m_functor;
const policy_type m_policy;
protected:
KOKKOS_INLINE_FUNCTION
std::int32_t before_work() const {
return m_policy.pop_work();
}
KOKKOS_INLINE_FUNCTION
void after_work(std::int32_t i) const {
/* fence any writes that were done by the work item itself
(usually writing its result to global memory) */
memory_fence();
const std::int32_t begin = m_policy.m_graph.row_map( i );
const std::int32_t end = m_policy.m_graph.row_map( i + 1 );
for (std::int32_t j = begin; j < end; ++j) {
const std::int32_t next = m_policy.m_graph.entries( j );
const std::int32_t old_count = atomic_fetch_add( &(m_policy.m_counts(next)), -1 );
if ( old_count == 1 ) m_policy.push_work( next );
}
}
inline
WorkGraphExec( const functor_type & arg_functor
, const policy_type & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{
}
};
}}} // namespace Kokkos::Impl::Experimental
#ifdef KOKKOS_ENABLE_SERIAL
#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp"
#endif
#ifdef KOKKOS_ENABLE_OPENMP
#include "OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp"
#endif
#ifdef KOKKOS_ENABLE_CUDA
#include "Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp"
#endif
#ifdef KOKKOS_ENABLE_THREADS
#include "Threads/Kokkos_Threads_WorkGraphPolicy.hpp"
#endif
#endif /* #define KOKKOS_WORKGRAPHPOLICY_HPP */

View File

@ -45,75 +45,100 @@
#if defined( KOKKOS_ENABLE_OPENMP )
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <iostream>
#include <vector>
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <iostream>
#include <impl/Kokkos_CPUDiscovery.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
namespace Kokkos {
namespace Impl {
namespace {
KOKKOS_INLINE_FUNCTION
int kokkos_omp_in_parallel();
int g_openmp_hardware_max_threads = 1;
int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
__thread int t_openmp_hardware_id = 0;
__thread Impl::OpenMPExec * t_openmp_instance = nullptr;
KOKKOS_INLINE_FUNCTION
int kokkos_omp_in_parallel()
void OpenMPExec::validate_partition( const int nthreads
, int & num_partitions
, int & partition_size
)
{
#ifndef __CUDA_ARCH__
return omp_in_parallel() && ! kokkos_omp_in_critical_region ;
#else
return 0;
#endif
if (nthreads == 1) {
num_partitions = 1;
partition_size = 1;
}
else if( num_partitions < 1 && partition_size < 1) {
int idle = nthreads;
for (int np = 2; np <= nthreads ; ++np) {
for (int ps = 1; ps <= nthreads/np; ++ps) {
if (nthreads - np*ps < idle) {
idle = nthreads - np*ps;
num_partitions = np;
partition_size = ps;
}
if (idle == 0) {
break;
}
}
}
}
else if( num_partitions < 1 && partition_size > 0 ) {
if ( partition_size <= nthreads ) {
num_partitions = nthreads / partition_size;
}
else {
num_partitions = 1;
partition_size = nthreads;
}
}
else if( num_partitions > 0 && partition_size < 1 ) {
if ( num_partitions <= nthreads ) {
partition_size = nthreads / num_partitions;
}
else {
num_partitions = nthreads;
partition_size = 1;
}
}
else if ( num_partitions * partition_size > nthreads ) {
int idle = nthreads;
const int NP = num_partitions;
const int PS = partition_size;
for (int np = NP; np > 0; --np) {
for (int ps = PS; ps > 0; --ps) {
if ( (np*ps <= nthreads)
&& (nthreads - np*ps < idle) ) {
idle = nthreads - np*ps;
num_partitions = np;
partition_size = ps;
}
if (idle == 0) {
break;
}
}
}
}
}
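// Worked example (illustrative): with nthreads == 16 and both requests < 1, the
// nested search keeps the (np, ps) pair with the fewest idle threads and settles
// on num_partitions == 2, partition_size == 8 (idle == 0).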
bool s_using_hwloc = false;
} // namespace
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
int OpenMPExec::m_map_rank[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };
int OpenMPExec::m_pool_topo[ 4 ] = { 0 };
HostThreadTeamData * OpenMPExec::m_pool[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };
void OpenMPExec::verify_is_process( const char * const label )
void OpenMPExec::verify_is_master( const char * const label )
{
if ( omp_in_parallel() ) {
if ( !t_openmp_instance )
{
std::string msg( label );
msg.append( " ERROR: in parallel" );
msg.append( " ERROR: in parallel or not initialized" );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
void OpenMPExec::verify_initialized( const char * const label )
{
if ( 0 == m_pool[0] ) {
std::string msg( label );
msg.append( " ERROR: not initialized" );
Kokkos::Impl::throw_runtime_exception( msg );
}
if ( omp_get_max_threads() != Kokkos::OpenMP::thread_pool_size(0) ) {
std::string msg( label );
msg.append( " ERROR: Initialized but threads modified inappropriately" );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
} // namespace Impl
} // namespace Kokkos
@ -133,11 +158,11 @@ void OpenMPExec::clear_thread_data()
const int old_alloc_bytes =
m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ;
Kokkos::HostSpace space ;
OpenMP::memory_space space ;
#pragma omp parallel
#pragma omp parallel num_threads( m_pool_size )
{
const int rank = m_map_rank[ omp_get_thread_num() ];
const int rank = omp_get_thread_num();
if ( 0 != m_pool[rank] ) {
@ -189,13 +214,13 @@ void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
const int pool_size = omp_get_max_threads();
OpenMP::memory_space space ;
Kokkos::HostSpace space ;
memory_fence();
#pragma omp parallel
#pragma omp parallel num_threads(m_pool_size)
{
const int rank = m_map_rank[ omp_get_thread_num() ];
const int rank = omp_get_thread_num();
if ( 0 != m_pool[rank] ) {
@ -214,11 +239,14 @@ void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
, pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
, thread_local_bytes
);
memory_fence();
}
/* END #pragma omp parallel */
HostThreadTeamData::organize_pool( m_pool , pool_size );
HostThreadTeamData::organize_pool( m_pool , m_pool_size );
}
}
@ -232,16 +260,8 @@ namespace Kokkos {
//----------------------------------------------------------------------------
int OpenMP::is_initialized()
{ return 0 != Impl::OpenMPExec::m_pool[0]; }
void OpenMP::initialize( unsigned thread_count ,
unsigned use_numa_count ,
unsigned use_cores_per_numa )
int OpenMP::get_current_max_threads() noexcept
{
// Before any other call to OMP query the maximum number of threads
// and save the value for re-initialization unit testing.
// Using omp_get_max_threads() is problematic in conjunction with hwloc on Intel:
// an initial call to the OpenMP runtime without a preceding parallel region sets
// a process mask for a single core. On entering the first parallel region the
// runtime then binds threads to other cores and makes the process mask the
// aggregate of the thread masks. The intent seems to be to make serial code run
// fast if you compile with OpenMP enabled but never actually use parallel regions.
int nthreads = 0;
int count = 0;
#pragma omp parallel
{
#pragma omp atomic
nthreads++;
++count;
}
return count;
}
static int omp_max_threads = nthreads;
const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;
bool thread_spawn_failed = false ;
if ( ! is_initialized ) {
// Use hwloc thread pinning if concerned with locality.
// If spreading threads across multiple NUMA regions.
// If hyperthreading is enabled.
Impl::s_using_hwloc = hwloc::available() && (
( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );
std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPExec::MAX_THREAD_COUNT ];
// If hwloc is available then use its maximum value.
if ( thread_count == 0 ) {
thread_count = Impl::s_using_hwloc
? Kokkos::hwloc::get_available_numa_count() *
Kokkos::hwloc::get_available_cores_per_numa() *
Kokkos::hwloc::get_available_threads_per_core()
: omp_max_threads ;
}
if(Impl::s_using_hwloc)
hwloc::thread_mapping( "Kokkos::OpenMP::initialize" ,
false /* do not allow asynchronous */ ,
thread_count ,
use_numa_count ,
use_cores_per_numa ,
threads_coord );
// Spawn threads:
omp_set_num_threads( thread_count );
// Verify OMP interaction:
if ( int(thread_count) != omp_get_max_threads() ) {
thread_spawn_failed = true ;
}
// Verify spawning and bind threads:
#pragma omp parallel
{
#pragma omp critical
{
if ( int(thread_count) != omp_get_num_threads() ) {
thread_spawn_failed = true ;
}
// Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
// Call to 'new' may not be thread safe as well.
const unsigned omp_rank = omp_get_thread_num();
const unsigned thread_r = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads()
? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
: omp_rank ;
Impl::OpenMPExec::m_map_rank[ omp_rank ] = thread_r ;
}
/* END #pragma omp critical */
}
/* END #pragma omp parallel */
if ( ! thread_spawn_failed ) {
Impl::OpenMPExec::m_pool_topo[0] = thread_count ;
Impl::OpenMPExec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
Impl::OpenMPExec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
// New, unified host thread team data:
{
size_t pool_reduce_bytes = 32 * thread_count ;
size_t team_reduce_bytes = 32 * thread_count ;
size_t team_shared_bytes = 1024 * thread_count ;
size_t thread_local_bytes = 1024 ;
Impl::OpenMPExec::resize_thread_data( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes
);
}
}
}
if ( is_initialized || thread_spawn_failed ) {
std::string msg("Kokkos::OpenMP::initialize ERROR");
if ( is_initialized ) { msg.append(" : already initialized"); }
if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); }
void OpenMP::initialize( int thread_count )
{
if ( omp_in_parallel() ) {
std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel");
Kokkos::Impl::throw_runtime_exception(msg);
}
if ( Impl::t_openmp_instance )
{
finalize();
}
{
if (nullptr == std::getenv("OMP_PROC_BIND") ) {
printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n");
printf(" In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n");
printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n");
printf(" For unit testing set OMP_PROC_BIND=false\n");
}
OpenMP::memory_space space ;
// Before any other call to OMP query the maximum number of threads
// and save the value for re-initialization unit testing.
Impl::g_openmp_hardware_max_threads = get_current_max_threads();
int process_num_threads = Impl::g_openmp_hardware_max_threads;
if ( Kokkos::hwloc::available() ) {
process_num_threads = Kokkos::hwloc::get_available_numa_count()
* Kokkos::hwloc::get_available_cores_per_numa()
* Kokkos::hwloc::get_available_threads_per_core();
}
// if thread_count < 0, use g_openmp_hardware_max_threads;
// if thread_count == 0, set g_openmp_hardware_max_threads to process_num_threads
// if thread_count > 0, set g_openmp_hardware_max_threads to thread_count
if (thread_count < 0 ) {
thread_count = Impl::g_openmp_hardware_max_threads;
}
else if( thread_count == 0 && Impl::g_openmp_hardware_max_threads != process_num_threads ) {
Impl::g_openmp_hardware_max_threads = process_num_threads;
omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
}
else {
if( thread_count > process_num_threads ) {
printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n");
printf( " process threads available : %3d, requested thread : %3d\n", process_num_threads, thread_count );
}
Impl::g_openmp_hardware_max_threads = thread_count;
omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
}
// setup thread local
#pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads)
{
Impl::t_openmp_instance = nullptr;
Impl::t_openmp_hardware_id = omp_get_thread_num();
Impl::SharedAllocationRecord< void, void >::tracking_enable();
}
void * const ptr = space.allocate( sizeof(Impl::OpenMPExec) );
Impl::t_openmp_instance = new (ptr) Impl::OpenMPExec( Impl::g_openmp_hardware_max_threads );
// New, unified host thread team data:
{
size_t pool_reduce_bytes = 32 * thread_count ;
size_t team_reduce_bytes = 32 * thread_count ;
size_t team_shared_bytes = 1024 * thread_count ;
size_t thread_local_bytes = 1024 ;
Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes
);
}
}
// Check for over-subscription
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
// std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
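// Editorial usage sketch (not part of the diff): how the new
// OpenMP::initialize(int thread_count) / finalize() pair above is driven from
// application code. The value 8 and the helper name are illustrative
// assumptions; per the logic above, a negative count keeps the current
// maximum and 0 selects all hardware threads reported by hwloc.
inline void example_initialize_openmp()   // hypothetical helper, not part of Kokkos
{
  Kokkos::OpenMP::initialize( 8 );  // thread_count > 0: request a pool of exactly 8 threads
  // ... launch kernels on the Kokkos::OpenMP execution space ...
  Kokkos::OpenMP::finalize();
}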
@ -373,20 +382,38 @@ void OpenMP::initialize( unsigned thread_count ,
void OpenMP::finalize()
{
Impl::OpenMPExec::verify_initialized( "OpenMP::finalize" );
Impl::OpenMPExec::verify_is_process( "OpenMP::finalize" );
if ( omp_in_parallel() )
{
std::string msg("Kokkos::OpenMP::finalize ERROR ");
if( !Impl::t_openmp_instance ) msg.append(": not initialized");
if( omp_in_parallel() ) msg.append(": in parallel");
Kokkos::Impl::throw_runtime_exception(msg);
}
// New, unified host thread team data:
Impl::OpenMPExec::clear_thread_data();
if ( Impl::t_openmp_instance ) {
Impl::OpenMPExec::m_pool_topo[0] = 0 ;
Impl::OpenMPExec::m_pool_topo[1] = 0 ;
Impl::OpenMPExec::m_pool_topo[2] = 0 ;
const int nthreads = Impl::t_openmp_instance->m_pool_size <= Impl::g_openmp_hardware_max_threads
? Impl::g_openmp_hardware_max_threads
: Impl::t_openmp_instance->m_pool_size;
omp_set_num_threads(1);
using Exec = Impl::OpenMPExec;
Exec * instance = Impl::t_openmp_instance;
instance->~Exec();
if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
hwloc::unbind_this_thread();
OpenMP::memory_space space;
space.deallocate( instance, sizeof(Exec) );
#pragma omp parallel num_threads(nthreads)
{
Impl::t_openmp_hardware_id = 0;
Impl::t_openmp_instance = nullptr;
Impl::SharedAllocationRecord< void, void >::tracking_disable();
}
// allow main thread to track
Impl::SharedAllocationRecord< void, void >::tracking_enable();
Impl::g_openmp_hardware_max_threads = 1;
}
#if defined(KOKKOS_ENABLE_PROFILING)
@ -396,70 +423,48 @@ void OpenMP::finalize()
//----------------------------------------------------------------------------
void OpenMP::print_configuration( std::ostream & s , const bool detail )
void OpenMP::print_configuration( std::ostream & s , const bool verbose )
{
Impl::OpenMPExec::verify_is_process( "OpenMP::print_configuration" );
s << "Kokkos::OpenMP" ;
#if defined( KOKKOS_ENABLE_OPENMP )
s << " KOKKOS_ENABLE_OPENMP" ;
#endif
#if defined( KOKKOS_ENABLE_HWLOC )
const unsigned numa_count_ = Kokkos::hwloc::get_available_numa_count();
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]"
<< " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
;
#endif
const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;
const bool is_initialized = Impl::t_openmp_instance != nullptr;
if ( is_initialized ) {
const int numa_count = Kokkos::Impl::OpenMPExec::m_pool_topo[0] / Kokkos::Impl::OpenMPExec::m_pool_topo[1] ;
const int core_per_numa = Kokkos::Impl::OpenMPExec::m_pool_topo[1] / Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
const int thread_per_core = Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
Impl::OpenMPExec::verify_is_master( "OpenMP::print_configuration" );
const int numa_count = 1;
const int core_per_numa = Impl::g_openmp_hardware_max_threads;
const int thread_per_core = 1;
s << " thread_pool_topology[ " << numa_count
<< " x " << core_per_numa
<< " x " << thread_per_core
<< " ]"
<< std::endl ;
if ( detail ) {
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPExec::m_pool_topo[0] );
#pragma omp parallel
{
#pragma omp critical
{
coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
}
/* END #pragma omp critical */
}
/* END #pragma omp parallel */
for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
s << " thread omp_rank[" << i << "]"
<< " kokkos_rank[" << Impl::OpenMPExec::m_map_rank[ i ] << "]"
<< " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
<< std::endl ;
}
}
}
else {
s << " not initialized" << std::endl ;
}
}
std::vector<OpenMP> OpenMP::partition(...)
{ return std::vector<OpenMP>(1); }
OpenMP OpenMP::create_instance(...) { return OpenMP(); }
#if !defined( KOKKOS_DISABLE_DEPRECATED )
int OpenMP::concurrency() {
return thread_pool_size(0);
return Impl::g_openmp_hardware_max_threads;
}
const char* OpenMP::name() { return "OpenMP"; }
void OpenMP::initialize( int thread_count , int, int )
{
initialize(thread_count);
}
#endif
} // namespace Kokkos

View File

@ -47,6 +47,10 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_OPENMP )
#if !defined(_OPENMP)
#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
#endif
#include <Kokkos_OpenMP.hpp>
#include <impl/Kokkos_Traits.hpp>
@ -54,6 +58,8 @@
#include <Kokkos_Atomic.hpp>
#include <Kokkos_UniqueToken.hpp>
#include <iostream>
#include <sstream>
#include <fstream>
@ -63,8 +69,14 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace Kokkos { namespace Impl {
class OpenMPExec;
extern int g_openmp_hardware_max_threads;
extern __thread int t_openmp_hardware_id;
extern __thread OpenMPExec * t_openmp_instance;
//----------------------------------------------------------------------------
/** \brief Data for OpenMP thread execution */
@ -74,279 +86,279 @@ public:
friend class Kokkos::OpenMP ;
enum { MAX_THREAD_COUNT = 4096 };
enum { MAX_THREAD_COUNT = 512 };
void clear_thread_data();
static void validate_partition( const int nthreads
, int & num_partitions
, int & partition_size
);
private:
OpenMPExec( int arg_pool_size )
: m_pool_size{ arg_pool_size }
, m_level{ omp_get_level() }
, m_pool()
{}
static int m_pool_topo[ 4 ];
static int m_map_rank[ MAX_THREAD_COUNT ];
~OpenMPExec()
{
clear_thread_data();
}
static HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ];
int m_pool_size;
int m_level;
static
void clear_thread_data();
HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ];
public:
// Topology of a cache coherent thread pool:
// TOTAL = NUMA x GRAIN
// pool_size( depth = 0 )
// pool_size(0) = total number of threads
// pool_size(1) = number of threads per NUMA
// pool_size(2) = number of threads sharing finest grain memory hierarchy
static void verify_is_master( const char * const );
inline static
int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
static void finalize();
static void initialize( const unsigned team_count ,
const unsigned threads_per_team ,
const unsigned numa_count ,
const unsigned cores_per_numa );
static void verify_is_process( const char * const );
static void verify_initialized( const char * const );
static
void resize_thread_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes );
inline static
HostThreadTeamData * get_thread_data() noexcept
{ return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
inline
HostThreadTeamData * get_thread_data() const noexcept
{ return m_pool[ m_level == omp_get_level() ? 0 : omp_get_thread_num() ]; }
inline static
HostThreadTeamData * get_thread_data( int i ) noexcept
{ return m_pool[i]; }
inline
HostThreadTeamData * get_thread_data( int i ) const noexcept
{ return m_pool[i]; }
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class ... Properties >
class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
{
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicyInternal execution_policy ;
typedef PolicyTraits<Properties ... > traits;
TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
m_league_size = p.m_league_size;
m_team_size = p.m_team_size;
m_team_alloc = p.m_team_alloc;
m_team_iter = p.m_team_iter;
m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size;
return *this;
}
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & ) {
int pool_size = traits::execution_space::thread_pool_size(1);
int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
return pool_size<max_host_team_size?pool_size:max_host_team_size;
}
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType & )
{ return traits::execution_space::thread_pool_size(2); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType &, const int& )
{ return traits::execution_space::thread_pool_size(2); }
//----------------------------------------
private:
int m_league_size ;
int m_team_size ;
int m_team_alloc ;
int m_team_iter ;
size_t m_team_scratch_size[2];
size_t m_thread_scratch_size[2];
int m_chunk_size;
inline void init( const int league_size_request
, const int team_size_request )
{
const int pool_size = traits::execution_space::thread_pool_size(0);
const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size;
const int team_grain = traits::execution_space::thread_pool_size(2);
m_league_size = league_size_request ;
m_team_size = team_size_request < team_max ?
team_size_request : team_max ;
// Round team size up to a multiple of 'team_grain'
const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
const int team_count = pool_size / team_size_grain ;
// Constraint : pool_size = m_team_alloc * team_count
m_team_alloc = pool_size / team_count ;
// Maximum number of iterations each team will take:
m_team_iter = ( m_league_size + team_count - 1 ) / team_count ;
set_auto_chunk_size();
}
public:
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size(const int& level, int team_size_ = -1) const {
if(team_size_ < 0) team_size_ = m_team_size;
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
}
/** \brief Specify league size, request team size */
TeamPolicyInternal( typename traits::execution_space &
, int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , team_size_request ); }
TeamPolicyInternal( typename traits::execution_space &
, int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1)
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
TeamPolicyInternal( int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , team_size_request ); }
TeamPolicyInternal( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
inline int team_alloc() const { return m_team_alloc ; }
inline int team_iter() const { return m_team_iter ; }
inline int chunk_size() const { return m_chunk_size ; }
/** \brief set chunk_size to a discrete value*/
inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
TeamPolicyInternal p = *this;
p.m_chunk_size = chunk_size_;
return p;
}
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
TeamPolicyInternal p = *this;
p.m_team_scratch_size[level] = per_team.value;
return p;
};
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
TeamPolicyInternal p = *this;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
TeamPolicyInternal p = *this;
p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
private:
/** \brief finalize chunk_size if it was set to AUTO*/
inline void set_auto_chunk_size() {
int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc;
if( concurrency==0 ) concurrency=1;
if(m_chunk_size > 0) {
if(!Impl::is_integral_power_of_two( m_chunk_size ))
Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
}
int new_chunk_size = 1;
while(new_chunk_size*100*concurrency < m_league_size)
new_chunk_size *= 2;
if(new_chunk_size < 128) {
new_chunk_size = 1;
while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) )
new_chunk_size*=2;
}
m_chunk_size = new_chunk_size;
}
public:
typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ;
};
} // namespace Impl
} // namespace Kokkos
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
inline
bool OpenMP::in_parallel()
{ return omp_in_parallel(); }
inline OpenMP::OpenMP() noexcept
{}
inline
int OpenMP::thread_pool_size( int depth )
bool OpenMP::is_initialized() noexcept
{ return Impl::t_openmp_instance != nullptr; }
inline
bool OpenMP::in_parallel( OpenMP const& ) noexcept
{
return Impl::OpenMPExec::pool_size(depth);
//t_openmp_instance is only non-null on a master thread
return !Impl::t_openmp_instance
|| Impl::t_openmp_instance->m_level < omp_get_level()
;
}
inline
int OpenMP::thread_pool_size() noexcept
{
return OpenMP::in_parallel()
? omp_get_num_threads()
: Impl::t_openmp_instance->m_pool_size
;
}
KOKKOS_INLINE_FUNCTION
int OpenMP::thread_pool_rank()
int OpenMP::thread_pool_rank() noexcept
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Impl::OpenMPExec::m_map_rank[ omp_get_thread_num() ];
return Impl::t_openmp_instance ? 0 : omp_get_thread_num();
#else
return -1 ;
#endif
}
inline
void OpenMP::fence( OpenMP const& instance ) noexcept {}
inline
bool OpenMP::is_asynchronous( OpenMP const& instance ) noexcept
{ return false; }
template <typename F>
void OpenMP::partition_master( F const& f
, int num_partitions
, int partition_size
)
{
if (omp_get_nested()) {
using Exec = Impl::OpenMPExec;
Exec * prev_instance = Impl::t_openmp_instance;
Exec::validate_partition( prev_instance->m_pool_size, num_partitions, partition_size );
OpenMP::memory_space space;
#pragma omp parallel num_threads(num_partitions)
{
void * const ptr = space.allocate( sizeof(Exec) );
Impl::t_openmp_instance = new (ptr) Exec( partition_size );
size_t pool_reduce_bytes = 32 * partition_size ;
size_t team_reduce_bytes = 32 * partition_size ;
size_t team_shared_bytes = 1024 * partition_size ;
size_t thread_local_bytes = 1024 ;
Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes
);
f( omp_get_thread_num(), omp_get_num_threads() );
Impl::t_openmp_instance->~Exec();
space.deallocate( Impl::t_openmp_instance, sizeof(Exec) );
Impl::t_openmp_instance = nullptr;
}
Impl::t_openmp_instance = prev_instance;
}
else {
// nested openmp not enabled
f(0,1);
}
}
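// Editorial usage sketch (not part of the diff): driving partition_master()
// above from application code. Each partition's master thread receives its
// partition rank and the number of partitions; more than one partition runs
// concurrently only when nested OpenMP is enabled, as checked above. The
// partition counts and helper name are illustrative assumptions.
inline void example_partition_master()   // hypothetical helper, not part of Kokkos
{
  Kokkos::OpenMP::partition_master(
    []( int partition_rank , int num_partitions ) {
      // kernels launched here use this partition's private OpenMPExec instance
    },
    2 /* num_partitions */ , 4 /* partition_size */ );
}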
namespace Experimental {
template<>
class MasterLock<OpenMP>
{
public:
void lock() { omp_set_lock( &m_lock ); }
void unlock() { omp_unset_lock( &m_lock ); }
bool try_lock() { return static_cast<bool>(omp_test_lock( &m_lock )); }
MasterLock() { omp_init_lock( &m_lock ); }
~MasterLock() { omp_destroy_lock( &m_lock ); }
MasterLock( MasterLock const& ) = delete;
MasterLock( MasterLock && ) = delete;
MasterLock & operator=( MasterLock const& ) = delete;
MasterLock & operator=( MasterLock && ) = delete;
private:
omp_lock_t m_lock;
};
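// Editorial usage sketch (not part of the diff): MasterLock<OpenMP> above wraps
// an omp_lock_t for serializing partition master threads. The guarded update
// and helper name are illustrative assumptions.
inline void example_master_lock( int & shared_counter )   // hypothetical helper, not part of Kokkos
{
  static Kokkos::Experimental::MasterLock< Kokkos::OpenMP > lock;
  lock.lock();
  ++shared_counter;   // update state shared between partition masters
  lock.unlock();
}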
template<>
class UniqueToken< OpenMP, UniqueTokenScope::Instance>
{
public:
using execution_space = OpenMP;
using size_type = int;
/// \brief create object size for concurrency on the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() ) noexcept {}
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int size() const noexcept
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Kokkos::OpenMP::thread_pool_size();
#else
return 0 ;
#endif
}
/// \brief acquire value such that 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int acquire() const noexcept
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Kokkos::OpenMP::thread_pool_rank();
#else
return 0 ;
#endif
}
/// \brief release a value acquired by generate
KOKKOS_INLINE_FUNCTION
void release( int ) const noexcept {}
};
template<>
class UniqueToken< OpenMP, UniqueTokenScope::Global>
{
public:
using execution_space = OpenMP;
using size_type = int;
/// \brief create object size for concurrency on the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() ) noexcept {}
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int size() const noexcept
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Kokkos::Impl::g_openmp_hardware_max_threads ;
#else
return 0 ;
#endif
}
/// \brief acquire value such that 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int acquire() const noexcept
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Kokkos::Impl::t_openmp_hardware_id ;
#else
return 0 ;
#endif
}
/// \brief release a value acquired by generate
KOKKOS_INLINE_FUNCTION
void release( int ) const noexcept {}
};
} // namespace Experimental
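// Editorial usage sketch (not part of the diff): using the instance-scope
// UniqueToken above to index thread-private scratch storage from inside a
// kernel. The 'scratch' View, extent, and helper name are illustrative
// assumptions.
inline void example_unique_token()   // hypothetical helper, not part of Kokkos
{
  Kokkos::Experimental::UniqueToken< Kokkos::OpenMP > token;
  Kokkos::View< double * , Kokkos::HostSpace > scratch( "scratch" , token.size() );
  Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , 1000 ),
    [=]( int ) {
      const int id = token.acquire();   // 0 <= id < token.size()
      scratch( id ) += 1.0;             // thread-private slot
      token.release( id );
    });
}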
#if !defined( KOKKOS_DISABLE_DEPRECATED )
inline
int OpenMP::thread_pool_size( int depth )
{
return depth < 2
? thread_pool_size()
: 1;
}
KOKKOS_INLINE_FUNCTION
int OpenMP::hardware_thread_id() noexcept
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Impl::t_openmp_hardware_id;
#else
return -1 ;
#endif
}
inline
int OpenMP::max_hardware_threads() noexcept
{
return Impl::g_openmp_hardware_max_threads;
}
#endif // KOKKOS_DISABLE_DEPRECATED
} // namespace Kokkos
#endif

View File

@ -52,6 +52,8 @@
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -71,8 +73,9 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
const FunctorType m_functor ;
const Policy m_policy ;
OpenMPExec * m_instance ;
const FunctorType m_functor ;
const Policy m_policy ;
template< class TagType >
inline static
@ -110,16 +113,120 @@ private:
public:
inline void execute() const
{
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value
};
if ( OpenMP::in_parallel() ) {
exec_range< WorkTag >( m_functor
, m_policy.begin()
, m_policy.end() );
}
else {
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *(m_instance->get_thread_data());
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
if ( is_dynamic ) {
// Make sure work partition is set before stealing
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
}
std::pair<int64_t,int64_t> range(0,0);
do {
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
ParallelFor::template
exec_range< WorkTag >( m_functor
, range.first + m_policy.begin()
, range.second + m_policy.begin() );
} while ( is_dynamic && 0 <= range.first );
}
}
}
inline
ParallelFor( const FunctorType & arg_functor
, Policy arg_policy )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
{}
};
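// Editorial usage sketch (not part of the diff): a dynamically scheduled range
// that exercises the work-stealing branch of execute() above. The extent and
// helper name are illustrative assumptions.
inline void example_dynamic_range_for()   // hypothetical helper, not part of Kokkos
{
  Kokkos::parallel_for(
    Kokkos::RangePolicy< Kokkos::OpenMP , Kokkos::Schedule< Kokkos::Dynamic > >( 0 , 1000 ),
    [=]( int i ) { (void) i; /* chunks are obtained via get_work_stealing_chunk() */ });
}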
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::OpenMP
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
OpenMPExec * m_instance ;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
inline static
void
exec_range( const MDRangePolicy & mdr_policy
, const FunctorType & functor
, const Member ibeg , const Member iend )
{
#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
iterate_type( mdr_policy, functor )( iwork );
}
}
public:
inline void execute() const
{
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value };
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");
if ( OpenMP::in_parallel() ) {
ParallelFor::exec_range ( m_mdr_policy
, m_functor
, m_policy.begin()
, m_policy.end() );
}
else {
#pragma omp parallel
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
HostThreadTeamData & data = *(m_instance->get_thread_data());
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
@ -136,8 +243,8 @@ public:
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
ParallelFor::template
exec_range< WorkTag >( m_functor
ParallelFor::exec_range( m_mdr_policy
, m_functor
, range.first + m_policy.begin()
, range.second + m_policy.begin() );
@ -145,12 +252,15 @@ public:
}
// END #pragma omp parallel
}
}
inline
ParallelFor( const FunctorType & arg_functor
, Policy arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
, MDRangePolicy arg_policy )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
{}
};
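// Editorial usage sketch (not part of the diff): a rank-2 MDRangePolicy, which
// the specialization above flattens into RangePolicy(0, m_num_tiles) with chunk
// size 1. Extents, tile sizes, the helper name, and the Experimental namespace
// placement of Rank are illustrative assumptions for this Kokkos version.
inline void example_mdrange_for()   // hypothetical helper, not part of Kokkos
{
  using policy_t = Kokkos::Experimental::MDRangePolicy<
    Kokkos::OpenMP , Kokkos::Experimental::Rank<2> >;
  Kokkos::parallel_for( policy_t( {{ 0 , 0 }} , {{ 64 , 64 }} , {{ 4 , 4 }} ),
    [=]( int i , int j ) { (void) i; (void) j; /* one iteration of the tiled loop nest */ });
}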
@ -191,10 +301,11 @@ private:
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
OpenMPExec * m_instance;
const FunctorType m_functor;
const Policy m_policy;
const ReducerType m_reducer;
const pointer_type m_result_ptr;
template< class TagType >
inline static
@ -228,21 +339,21 @@ public:
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value };
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
const size_t pool_reduce_bytes =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
OpenMPExec::resize_thread_data( pool_reduce_bytes
m_instance->resize_thread_data( pool_reduce_bytes
, 0 // team_reduce_bytes
, 0 // team_shared_bytes
, 0 // thread_local_bytes
);
#pragma omp parallel
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
HostThreadTeamData & data = *(m_instance->get_thread_data());
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
@ -271,16 +382,15 @@ public:
} while ( is_dynamic && 0 <= range.first );
}
// END #pragma omp parallel
// Reduction:
const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );
const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() );
for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
for ( int i = 1 ; i < pool_size ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
, ptr
, OpenMPExec::get_thread_data(i)->pool_reduce_local() );
, m_instance->get_thread_data(i)->pool_reduce_local() );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
@ -303,7 +413,8 @@ public:
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_view.data() )
@ -317,7 +428,8 @@ public:
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
@ -329,6 +441,173 @@ public:
};
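// Editorial usage sketch (not part of the diff): a scalar reduction whose
// per-thread partials live in pool_reduce_local() and are joined serially by
// the master thread, as in execute() above. The extent and helper name are
// illustrative assumptions.
inline double example_range_reduce()   // hypothetical helper, not part of Kokkos
{
  double total = 0.0;
  Kokkos::parallel_reduce(
    Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , 1000 ),
    [=]( int i , double & partial ) { partial += double(i); },
    total );
  return total;
}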
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ...>
, ReducerType
, Kokkos::OpenMP
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef typename ReducerTypeFwd::value_type ValueType;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType
>;
OpenMPExec * m_instance ;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
inline static
void
exec_range( const MDRangePolicy & mdr_policy
, const FunctorType & functor
, const Member ibeg , const Member iend
, reference_type update )
{
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
iterate_type( mdr_policy, functor, update )( iwork );
}
}
public:
inline void execute() const
{
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value };
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
const size_t pool_reduce_bytes =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
m_instance->resize_thread_data( pool_reduce_bytes
, 0 // team_reduce_bytes
, 0 // team_shared_bytes
, 0 // thread_local_bytes
);
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *(m_instance->get_thread_data());
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
if ( is_dynamic ) {
// Make sure work partition is set before stealing
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
}
reference_type update =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
, data.pool_reduce_local() );
std::pair<int64_t,int64_t> range(0,0);
do {
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
ParallelReduce::exec_range ( m_mdr_policy, m_functor
, range.first + m_policy.begin()
, range.second + m_policy.begin()
, update );
} while ( is_dynamic && 0 <= range.first );
}
// END #pragma omp parallel
// Reduction:
const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() );
for ( int i = 1 ; i < pool_size ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
, ptr
, m_instance->get_thread_data(i)->pool_reduce_local() );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
if ( m_result_ptr ) {
const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
}
//----------------------------------------
template< class ViewType >
inline
ParallelReduce( const FunctorType & arg_functor
, MDRangePolicy arg_policy
, const ViewType & arg_view
, typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
, m_reducer( InvalidType() )
, m_result_ptr( arg_view.data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
inline
ParallelReduce( const FunctorType & arg_functor
, MDRangePolicy arg_policy
, const ReducerType& reducer )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
} // namespace Impl
} // namespace Kokkos
@ -361,8 +640,9 @@ private:
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
OpenMPExec * m_instance;
const FunctorType m_functor;
const Policy m_policy;
template< class TagType >
inline static
@ -394,23 +674,23 @@ public:
inline
void execute() const
{
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_scan");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_scan");
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_scan");
const int value_count = Analysis::value_count( m_functor );
const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );
OpenMPExec::resize_thread_data( pool_reduce_bytes
m_instance->resize_thread_data( pool_reduce_bytes
, 0 // team_reduce_bytes
, 0 // team_shared_bytes
, 0 // thread_local_bytes
);
#pragma omp parallel
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
HostThreadTeamData & data = *(m_instance->get_thread_data());
const WorkRange range( m_policy, data.pool_rank(), data.pool_size() );
const WorkRange range( m_policy, omp_get_thread_num(), omp_get_num_threads() );
reference_type update_sum =
ValueInit::init( m_functor , data.pool_reduce_local() );
@ -422,7 +702,7 @@ public:
pointer_type ptr_prev = 0 ;
const int n = data.pool_size();
const int n = omp_get_num_threads();
for ( int i = 0 ; i < n ; ++i ) {
@ -452,7 +732,6 @@ public:
ParallelScan::template exec_range< WorkTag >
( m_functor , range.begin() , range.end() , update_base , true );
}
/* END #pragma omp parallel */
}
@ -461,7 +740,8 @@ public:
inline
ParallelScan( const FunctorType & arg_functor
, const Policy & arg_policy )
: m_functor( arg_functor )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
{}
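// Editorial usage sketch (not part of the diff): an exclusive prefix sum using
// the two-pass scan implemented in execute() above. The Views, extent, and
// helper name are illustrative assumptions.
inline void example_range_scan()   // hypothetical helper, not part of Kokkos
{
  const int n = 1000;
  Kokkos::View< long * , Kokkos::HostSpace > counts ( "counts"  , n );
  Kokkos::View< long * , Kokkos::HostSpace > offsets( "offsets" , n );
  Kokkos::parallel_scan(
    Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , n ),
    [=]( int i , long & update , bool final_pass ) {
      if ( final_pass ) offsets( i ) = update;   // exclusive prefix sum
      update += counts( i );
    });
}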
@ -492,9 +772,10 @@ private:
typedef typename Policy::schedule_type::type SchedTag ;
typedef typename Policy::member_type Member ;
const FunctorType m_functor ;
const Policy m_policy ;
const int m_shmem_size ;
OpenMPExec * m_instance;
const FunctorType m_functor;
const Policy m_policy;
const int m_shmem_size;
template< class TagType >
inline static
@ -548,22 +829,22 @@ public:
{
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
const size_t pool_reduce_size = 0 ; // Never shrinks
const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
const size_t thread_local_size = 0 ; // Never shrinks
OpenMPExec::resize_thread_data( pool_reduce_size
m_instance->resize_thread_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
#pragma omp parallel
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
HostThreadTeamData & data = *(m_instance->get_thread_data());
const int active = data.organize_team( m_policy.team_size() );
@ -598,14 +879,14 @@ public:
data.disband_team();
}
// END #pragma omp parallel
}
inline
ParallelFor( const FunctorType & arg_functor ,
const Policy & arg_policy )
: m_functor( arg_functor )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
, m_shmem_size( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
@ -646,11 +927,12 @@ private:
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const int m_shmem_size ;
OpenMPExec * m_instance;
const FunctorType m_functor;
const Policy m_policy;
const ReducerType m_reducer;
const pointer_type m_result_ptr;
const int m_shmem_size;
template< class TagType >
inline static
@ -706,8 +988,7 @@ public:
{
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
const size_t pool_reduce_size =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
@ -716,14 +997,15 @@ public:
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
const size_t thread_local_size = 0 ; // Never shrinks
OpenMPExec::resize_thread_data( pool_reduce_size
m_instance->resize_thread_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
#pragma omp parallel
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
HostThreadTeamData & data = *(m_instance->get_thread_data());
const int active = data.organize_team( m_policy.team_size() );
@ -763,17 +1045,26 @@ public:
}
data.disband_team();
// This thread has updated 'pool_reduce_local()' with its
// contributions to the reduction. The parallel region is
// about to terminate and the master thread will load and
// reduce each 'pool_reduce_local()' contribution.
// Must 'memory_fence()' to guarantee that storing the update to
// 'pool_reduce_local()' will complete before this thread
// exits the parallel region.
memory_fence();
}
// END #pragma omp parallel
// Reduction:
const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );
const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() );
for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
for ( int i = 1 ; i < pool_size ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
, ptr
, OpenMPExec::get_thread_data(i)->pool_reduce_local() );
, m_instance->get_thread_data(i)->pool_reduce_local() );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
@ -796,7 +1087,8 @@ public:
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
@ -810,7 +1102,8 @@ public:
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )

View File

@ -105,7 +105,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
{
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
static task_root_type * const end =
@ -115,23 +115,19 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
HostThreadTeamData & team_data_single =
HostThreadTeamDataSingleton::singleton();
const int team_size = Impl::OpenMPExec::pool_size(2); // Threads per core
// const int team_size = Impl::OpenMPExec::pool_size(1); // Threads per NUMA
Impl::OpenMPExec * instance = t_openmp_instance;
const int pool_size = OpenMP::thread_pool_size();
#if 0
fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size );
fflush(stdout);
#endif
const int team_size = 1; // Threads per core
instance->resize_thread_data( 0 /* global reduce buffer */
, 512 * team_size /* team reduce buffer */
, 0 /* team shared buffer */
, 0 /* thread local buffer */
);
OpenMPExec::resize_thread_data( 0 /* global reduce buffer */
, 512 * team_size /* team reduce buffer */
, 0 /* team shared buffer */
, 0 /* thread local buffer */
);
#pragma omp parallel
#pragma omp parallel num_threads(pool_size)
{
Impl::HostThreadTeamData & self = *Impl::OpenMPExec::get_thread_data();
Impl::HostThreadTeamData & self = *(instance->get_thread_data());
// Organizing threads into a team performs a barrier across the
// entire pool to ensure proper initialization of the team
@ -142,18 +138,6 @@ fflush(stdout);
Member single_exec( team_data_single );
Member team_exec( self );
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) running\n"
, self.pool_rank()
, self.pool_size()
, team_exec.team_rank()
, team_exec.team_size()
, team_exec.league_rank()
, team_exec.league_size()
);
fflush(stdout);
#endif
// Loop until all queues are empty and no tasks in flight
task_root_type * task = 0 ;
@ -197,15 +181,6 @@ fflush(stdout);
// if a single thread task then execute now
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) executing single task 0x%lx\n"
, self.pool_rank()
, self.pool_size()
, int64_t(task)
);
fflush(stdout);
#endif
(*task->m_apply)( task , & single_exec );
leader_loop = true ;
@ -220,57 +195,14 @@ fflush(stdout);
if ( 0 != task ) { // Thread Team Task
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team((%d of %d) league(%d of %d) executing team task 0x%lx\n"
, self.pool_rank()
, self.pool_size()
, team_exec.team_rank()
, team_exec.team_size()
, team_exec.league_rank()
, team_exec.league_size()
, int64_t(task)
);
fflush(stdout);
#endif
(*task->m_apply)( task , & team_exec );
// The m_apply function performs a barrier
}
} while( 0 != task );
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) ending\n"
, self.pool_rank()
, self.pool_size()
, team_exec.team_rank()
, team_exec.team_size()
, team_exec.league_rank()
, team_exec.league_size()
);
fflush(stdout);
#endif
}
self.disband_team();
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) disbanded\n"
, self.pool_rank()
, self.pool_size()
);
fflush(stdout);
#endif
}
// END #pragma omp parallel
#if 0
fprintf(stdout,"TaskQueue<OpenMP> execute %d end\n", team_size );
fflush(stdout);
#endif
}
void TaskQueueSpecialization< Kokkos::OpenMP >::
@ -279,10 +211,10 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
{
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
if ( 1 == omp_get_num_threads() ) {
if ( 1 == OpenMP::thread_pool_size() ) {
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;

View File

@ -45,7 +45,7 @@
#define KOKKOS_IMPL_OPENMP_TASK_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )
#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -60,7 +60,7 @@ public:
using execution_space = Kokkos::OpenMP ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using task_base_type = Kokkos::Impl::TaskBase< void , void , void > ;
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
// Must specify memory space

View File

@ -0,0 +1,245 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMP_TEAM_HPP
#define KOKKOS_OPENMP_TEAM_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_OPENMP )
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
namespace Kokkos { namespace Impl {
template< class ... Properties >
class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
{
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicyInternal execution_policy ;
typedef PolicyTraits<Properties ... > traits;
TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
m_league_size = p.m_league_size;
m_team_size = p.m_team_size;
m_team_alloc = p.m_team_alloc;
m_team_iter = p.m_team_iter;
m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size;
return *this;
}
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & ) {
int pool_size = traits::execution_space::thread_pool_size(1);
int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
return pool_size<max_host_team_size?pool_size:max_host_team_size;
}
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType & )
{ return traits::execution_space::thread_pool_size(2); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType &, const int& )
{ return traits::execution_space::thread_pool_size(2); }
//----------------------------------------
private:
int m_league_size ;
int m_team_size ;
int m_team_alloc ;
int m_team_iter ;
size_t m_team_scratch_size[2];
size_t m_thread_scratch_size[2];
int m_chunk_size;
inline void init( const int league_size_request
, const int team_size_request )
{
const int pool_size = traits::execution_space::thread_pool_size(0);
const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size;
const int team_grain = traits::execution_space::thread_pool_size(2);
m_league_size = league_size_request ;
m_team_size = team_size_request < team_max ?
team_size_request : team_max ;
// Round team size up to a multiple of 'team_grain'
const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
const int team_count = pool_size / team_size_grain ;
// Constraint : pool_size = m_team_alloc * team_count
m_team_alloc = pool_size / team_count ;
// Maximum number of iterations each team will take:
m_team_iter = ( m_league_size + team_count - 1 ) / team_count ;
set_auto_chunk_size();
}
public:
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size(const int& level, int team_size_ = -1) const {
if(team_size_ < 0) team_size_ = m_team_size;
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
}
/** \brief Specify league size, request team size */
TeamPolicyInternal( typename traits::execution_space &
, int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , team_size_request ); }
TeamPolicyInternal( typename traits::execution_space &
, int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1)
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
TeamPolicyInternal( int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , team_size_request ); }
TeamPolicyInternal( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
inline int team_alloc() const { return m_team_alloc ; }
inline int team_iter() const { return m_team_iter ; }
inline int chunk_size() const { return m_chunk_size ; }
/** \brief set chunk_size to a discrete value*/
inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
TeamPolicyInternal p = *this;
p.m_chunk_size = chunk_size_;
return p;
}
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
TeamPolicyInternal p = *this;
p.m_team_scratch_size[level] = per_team.value;
return p;
};
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
TeamPolicyInternal p = *this;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
TeamPolicyInternal p = *this;
p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
private:
/** \brief finalize chunk_size if it was set to AUTO*/
inline void set_auto_chunk_size() {
int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc;
if( concurrency==0 ) concurrency=1;
if(m_chunk_size > 0) {
if(!Impl::is_integral_power_of_two( m_chunk_size ))
Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
}
int new_chunk_size = 1;
while(new_chunk_size*100*concurrency < m_league_size)
new_chunk_size *= 2;
if(new_chunk_size < 128) {
new_chunk_size = 1;
while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) )
new_chunk_size*=2;
}
m_chunk_size = new_chunk_size;
}
public:
typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ;
};
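// Editorial usage sketch (not part of the diff): constructing a team policy for
// the OpenMP backend described by the class above. League size, the team body,
// and the helper name are illustrative assumptions.
inline void example_team_policy()   // hypothetical helper, not part of Kokkos
{
  typedef Kokkos::TeamPolicy< Kokkos::OpenMP > policy_t ;
  Kokkos::parallel_for( policy_t( 128 /* league_size */ , Kokkos::AUTO ),
    [=]( const policy_t::member_type & team ) {
      (void) team.league_rank();   // team.team_rank() and team.team_size() are also available
    });
}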
}} // namespace Kokkos::Impl
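// Editorial sketch (not part of the diff): the auto chunk-size heuristic from
// set_auto_chunk_size() above, extracted as a free function and evaluated for
// illustrative inputs. For concurrency = 16 and league_size = 100000 the first
// loop stops at 64; since 64 < 128 the second loop runs and the result is 128.
// The function name is a hypothetical label, not part of Kokkos.
inline int example_auto_chunk_size( int concurrency , int league_size )
{
  if ( concurrency == 0 ) concurrency = 1;
  int chunk = 1;
  while ( chunk * 100 * concurrency < league_size ) chunk *= 2;
  if ( chunk < 128 ) {
    chunk = 1;
    while ( ( chunk * 40 * concurrency < league_size ) && ( chunk < 128 ) ) chunk *= 2;
  }
  return chunk;  // example_auto_chunk_size( 16 , 100000 ) == 128
}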
#endif
#endif /* KOKKOS_OPENMP_TEAM_HPP */

View File

@ -0,0 +1,107 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP
#define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::OpenMP
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::OpenMP,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::OpenMP, Traits ... > Base ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
public:
inline
void execute()
{
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
}
}
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -45,7 +45,7 @@
#define KOKKOS_OPENMPTARGETEXEC_HPP
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_Spinwait.hpp>
#include <Kokkos_Atomic.hpp>
#include <iostream>
@ -59,10 +59,10 @@ namespace Impl {
class OpenMPTargetExec {
public:
public:
enum { MAX_ACTIVE_THREADS = 256*8*56*4 };
enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS/32 };
private:
static void* scratch_ptr;
@ -70,7 +70,7 @@ public:
static void verify_is_process( const char * const );
static void verify_initialized( const char * const );
static void* get_scratch_ptr();
static void* get_scratch_ptr();
static void clear_scratch();
static void resize_scratch( int64_t reduce_bytes , int64_t team_reduce_bytes, int64_t team_shared_bytes, int64_t thread_local_bytes );
@ -159,7 +159,7 @@ public:
KOKKOS_INLINE_FUNCTION void team_barrier() const
{
#pragma omp barrier
#pragma omp barrier
}
template<class ValueType>
@ -191,13 +191,13 @@ public:
typedef ValueType value_type;
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
, value_type , void >::type type ;
const int n_values = TEAM_REDUCE_SIZE/sizeof(value_type);
type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num());
type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num());
for(int i = m_team_rank; i < n_values; i+= m_team_size) {
team_scratch[i] = value_type();
}
@ -209,7 +209,7 @@ public:
team_scratch[m_team_rank%n_values]+=value;
#pragma omp barrier
}
for(int d = 1; d<n_values;d*=2) {
if((m_team_rank+d<n_values) && (m_team_rank%(2*d)==0)) {
team_scratch[m_team_rank] += team_scratch[m_team_rank+d];
@ -374,12 +374,12 @@ private:
int m_chunk_size;
inline void init( const int league_size_request
, const int team_size_request
, const int team_size_request
, const int vector_length_request )
{
m_league_size = league_size_request ;
m_team_size = team_size_request;
m_team_size = team_size_request;
m_vector_length = vector_length_request;

View File

@ -47,7 +47,7 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_Spinwait.hpp>
//----------------------------------------------------------------------------

View File

@ -45,14 +45,14 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_THREADS )
#include <Kokkos_Core_fwd.hpp>
#include <cstdint>
#include <limits>
#include <utility>
#include <iostream>
#include <sstream>
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_CPUDiscovery.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
@ -80,9 +80,7 @@ const void * volatile s_current_function_arg = 0 ;
struct Sentinel {
Sentinel()
{
HostSpace::register_in_parallel( ThreadsExec::in_parallel );
}
{}
~Sentinel()
{
@ -122,6 +120,8 @@ void execute_function_noop( ThreadsExec & , const void * ) {}
void ThreadsExec::driver(void)
{
SharedAllocationRecord< void, void >::tracking_enable();
ThreadsExec this_thread ;
while ( ThreadsExec::Active == this_thread.m_pool_state ) {
@ -726,6 +726,8 @@ void ThreadsExec::initialize( unsigned thread_count ,
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
Impl::SharedAllocationRecord< void, void >::tracking_enable();
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif

View File

@ -50,11 +50,12 @@
#include <cstdio>
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <Kokkos_Atomic.hpp>
#include <Kokkos_UniqueToken.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
@ -275,6 +276,17 @@ public:
if ( ! rev_rank ) {
Final::final( f , reduce_memory() );
}
// This thread has updated 'reduce_memory()' and upon returning
// from this function will set 'm_pool_state' to inactive.
// If this is a non-root thread then setting 'm_pool_state'
// to inactive triggers another thread to exit a spinwait
// and read 'reduce_memory()'.
// Must 'memory_fence()' to guarantee that storing the update to
// 'reduce_memory()' completes before storing the update to
// 'm_pool_state'.
memory_fence();
}
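The ordering requirement described in the comment above is the usual release pattern: publish the data, fence, then flip the flag the waiter spins on. A minimal sketch of that pattern, assuming illustrative names (result, ready) that are not part of this diff:

#include <Kokkos_Core.hpp>

double       result = 0.0;   // stands in for 'reduce_memory()'
volatile int ready  = 0;     // stands in for 'm_pool_state'

void producer()
{
  result = 42.0;             // update the shared reduction value
  Kokkos::memory_fence();    // store to 'result' must be visible before the flag flips
  ready = 1;                 // the waiting thread spins on this flag
}

void consumer()
{
  while ( 0 == ready ) { /* spin-wait */ }
  const double r = result;   // safe: the fence ordered 'result' before 'ready'
  (void) r;
}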
inline
@ -627,6 +639,62 @@ inline void Threads::fence()
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos { namespace Experimental {
template<>
class UniqueToken< Threads, UniqueTokenScope::Instance>
{
public:
using execution_space = Threads;
using size_type = int;
/// \brief create an object sized for the concurrency of the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() ) noexcept {}
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
inline
int size() const noexcept { return Threads::thread_pool_size(); }
/// \brief acquire value such that 0 <= value < size()
inline
int acquire() const noexcept { return Threads::thread_pool_rank(); }
/// \brief release a value acquired by acquire()
inline
void release( int ) const noexcept {}
};
template<>
class UniqueToken< Threads, UniqueTokenScope::Global>
{
public:
using execution_space = Threads;
using size_type = int;
/// \brief create an object sized for the concurrency of the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() ) noexcept {}
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
inline
int size() const noexcept { return Threads::thread_pool_size(); }
/// \brief acquire value such that 0 <= value < size()
inline
int acquire() const noexcept { return Threads::thread_pool_rank(); }
/// \brief release a value acquired by acquire()
inline
void release( int ) const noexcept {}
};
}} // namespace Kokkos::Experimental
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif
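A hedged usage sketch for the UniqueToken specializations added above: acquire a per-thread slot inside a kernel, index scratch storage with it, and release it afterwards. The View, the loop bound N, and the kernel body are illustrative, not part of the diff.

#include <Kokkos_Core.hpp>

void count_with_unique_token()
{
  const int N = 1000;
  Kokkos::Experimental::UniqueToken< Kokkos::Threads > token;
  Kokkos::View< double*, Kokkos::HostSpace > scratch( "scratch", token.size() );

  Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Threads >( 0, N ),
    KOKKOS_LAMBDA( const int i ) {
      const int id = token.acquire();   // 0 <= id < token.size()
      scratch( id ) += double(i);       // per-thread slot, so no data race
      token.release( id );
    } );
}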

View File

@ -50,7 +50,7 @@
#include <cstdio>
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
@ -482,6 +482,8 @@ public:
void next_static()
{
if ( m_league_rank < m_league_end ) {
// Make sure all stores are complete before entering the barrier
memory_fence();
team_barrier();
set_team_shared();
}
@ -518,6 +520,8 @@ public:
return;
if ( m_league_rank < m_league_chunk_end ) {
// Make sure all stores are complete before entering the barrier
memory_fence();
team_barrier();
set_team_shared();
}

View File

@ -55,6 +55,8 @@
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
@ -174,6 +176,108 @@ public:
{}
};
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::Threads
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
inline static
void
exec_range( const MDRangePolicy & mdr_policy
, const FunctorType & functor
, const Member ibeg , const Member iend )
{
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
#pragma ivdep
#endif
for ( Member i = ibeg ; i < iend ; ++i ) {
iterate_type( mdr_policy, functor )( i );
}
}
static void exec( ThreadsExec & exec , const void * arg )
{
exec_schedule<typename Policy::schedule_type::type>(exec,arg);
}
template<class Schedule>
static
typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
exec_schedule( ThreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
ParallelFor::exec_range
( self.m_mdr_policy, self.m_functor , range.begin() , range.end() );
exec.fan_in();
}
template<class Schedule>
static
typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
exec_schedule( ThreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
exec.reset_steal_target();
exec.barrier();
long work_index = exec.get_work_index();
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
ParallelFor::exec_range
( self.m_mdr_policy, self.m_functor , begin , end );
work_index = exec.get_work_index();
}
exec.fan_in();
}
public:
inline
void execute() const
{
ThreadsExec::start( & ParallelFor::exec , this );
ThreadsExec::fence();
}
ParallelFor( const FunctorType & arg_functor
, const MDRangePolicy & arg_policy )
: m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
{}
};
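With the ParallelFor specialization above in place, an MDRangePolicy can be dispatched directly through Kokkos::parallel_for on the Threads backend. A hedged sketch of that use; the Rank<2> spelling in Kokkos::Experimental, the bounds, and the kernel body are assumptions matching this release, not taken from the diff itself.

#include <Kokkos_Core.hpp>

void fill_2d( Kokkos::View< double**, Kokkos::HostSpace > a )
{
  typedef Kokkos::Experimental::MDRangePolicy<
            Kokkos::Threads, Kokkos::Experimental::Rank<2> > mdrange_2d;

  const long N = a.dimension_0();
  const long M = a.dimension_1();

  // The policy is flattened into RangePolicy( 0, num_tiles ) internally,
  // one tile of the 2-D index space per flattened work item.
  Kokkos::parallel_for( mdrange_2d( {{0,0}}, {{N,M}} ),
    KOKKOS_LAMBDA( const int i, const int j ) {
      a(i,j) = double(i) + 0.1 * double(j);
    } );
}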
//----------------------------------------------------------------------------
/* ParallelFor Kokkos::Threads with TeamPolicy */
@ -440,6 +544,169 @@ public:
};
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Threads
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef typename ReducerTypeFwd::value_type ValueType;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType
>;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
inline static
void
exec_range( const MDRangePolicy & mdr_policy
, const FunctorType & functor
, const Member & ibeg , const Member & iend
, reference_type update )
{
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
#pragma ivdep
#endif
for ( Member i = ibeg ; i < iend ; ++i ) {
iterate_type( mdr_policy, functor, update )( i );
}
}
static void
exec( ThreadsExec & exec , const void * arg ) {
exec_schedule<typename Policy::schedule_type::type>(exec, arg);
}
template<class Schedule>
static
typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
exec_schedule( ThreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
ParallelReduce::exec_range
( self.m_mdr_policy, self.m_functor , range.begin() , range.end()
, ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
}
template<class Schedule>
static
typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
exec_schedule( ThreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
exec.reset_steal_target();
exec.barrier();
long work_index = exec.get_work_index();
reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
ParallelReduce::exec_range
( self.m_mdr_policy, self.m_functor , begin , end
, update );
work_index = exec.get_work_index();
}
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
}
public:
inline
void execute() const
{
ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
ThreadsExec::start( & ParallelReduce::exec , this );
ThreadsExec::fence();
if ( m_result_ptr ) {
const pointer_type data =
(pointer_type) ThreadsExec::root_reduce_scratch();
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor ,
const MDRangePolicy & arg_policy ,
const HostViewType & arg_result_view ,
typename std::enable_if<
Kokkos::is_view< HostViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.ptr_on_device() )
{
static_assert( Kokkos::is_view< HostViewType >::value
, "Kokkos::Threads reduce result must be a View" );
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Threads reduce result must be a View in HostSpace" );
}
inline
ParallelReduce( const FunctorType & arg_functor
, MDRangePolicy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
//----------------------------------------------------------------------------
/* ParallelReduce with Kokkos::Threads and TeamPolicy */

View File

@ -0,0 +1,115 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_THREADS_WORKGRAPHPOLICY_HPP
#define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::Threads
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Threads,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Threads, Traits ... > Base ;
typedef ParallelFor<FunctorType,
Kokkos::Experimental::WorkGraphPolicy<Traits ...>,
Kokkos::Threads> Self ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
inline void exec_one_thread() const {
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
}
}
static inline void thread_main( ThreadsExec&, const void* arg ) {
const Self& self = *(static_cast<const Self*>(arg));
self.exec_one_thread();
}
public:
inline
void execute()
{
ThreadsExec::start( & Self::thread_main, this );
ThreadsExec::fence();
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP */

View File

@ -141,7 +141,6 @@ namespace Kokkos { namespace Experimental { namespace Impl {
#define LOOP_ARGS_8 LOOP_ARGS_7, i7 + m_offset[7]
// New Loop Macros...
// parallel_for, non-tagged
#define APPLY( func, ... ) \
@ -1010,8 +1009,6 @@ namespace Kokkos { namespace Experimental { namespace Impl {
// end tagged macros
// Structs for calling loops
template < int Rank, bool IsLeft, typename IType, typename Tagged, typename Enable = void >
struct Tile_Loop_Type;
@ -1279,6 +1276,19 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i
template <typename T>
using is_void = std::is_same< T , void >;
template <typename T>
struct is_type_array : std::false_type
{
using value_type = T;
};
template <typename T>
struct is_type_array< T[] > : std::true_type
{
using value_type = T;
};
template < typename RP
, typename Functor
, typename Tag = void
@ -1761,18 +1771,17 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
RP const& m_rp;
Functor const& m_func;
typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
// value_type & m_v;
};
// ValueType: For reductions
// For ParallelReduce
// ValueType - scalar: For reductions
template < typename RP
, typename Functor
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && !is_type_array<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -2251,12 +2260,497 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
};
// For ParallelReduce
// Extra specialization for array reductions
// ValueType[]: For array reductions
template < typename RP
, typename Functor
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && is_type_array<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
using value_type = typename is_type_array<ValueType>::value_type; // strip away the 'array-ness' [], only underlying type remains
inline
HostIterateTile( RP const& rp, Functor const& func, value_type *v ) // v should be an array; treated as a pointer for compatibility since its size is neither known nor needed here
: m_rp(rp) //Cuda 7.0 does not like braces...
, m_func(func)
, m_v(v) // use with non-void ValueType struct
{}
inline
bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const {
bool is_full_tile = true;
for ( int i = 0; i < RP::rank; ++i ) {
if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) {
partial_tile[i] = m_rp.m_tile[i] ;
}
else {
is_full_tile = false ;
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
: (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
}
}
return is_full_tile ;
} // end check bounds
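check_iteration_bounds() clamps the last tile in each dimension to whatever remains of the range. A worked one-dimensional sketch of that clamping, with illustrative bounds (range [0,10), tile size 4):

#include <cassert>

// Mirrors the per-dimension logic of check_iteration_bounds() above.
int partial_extent( int lower, int upper, int tile, int offset )
{
  if ( offset + tile <= upper )  return tile;            // full tile fits
  if ( upper - 1 - offset == 0 ) return 1;               // exactly one element remains
  if ( upper - tile > 0 )        return upper - offset;  // clamp to the end of the range
  return upper - lower;                                  // a single tile encloses the range
}

int main()
{
  // Range [0,10) tiled by 4: offsets 0 and 4 are full tiles, offset 8 is partial.
  assert( partial_extent( 0, 10, 4, 0 ) == 4 );
  assert( partial_extent( 0, 10, 4, 8 ) == 2 );
  return 0;
}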
template <int Rank>
struct RankTag
{
typedef RankTag type;
enum { value = (int)Rank };
};
#if KOKKOS_ENABLE_NEW_LOOP_MACROS
template <typename IType>
inline
void
operator()(IType tile_idx) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
}
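The operator() above recovers per-dimension tile coordinates from the flat tile index by repeated modulo/division, fastest-varying dimension first for Left ordering; each offset is then coordinate * tile size + lower bound. A small worked sketch with illustrative tile counts:

#include <array>
#include <cassert>

int main()
{
  const std::array<int,3> tile_end = { 4, 3, 2 };   // number of tiles per dimension
  int tile_idx = 17;                                // flat index, 0 <= tile_idx < 4*3*2

  std::array<int,3> coord;
  for ( int i = 0; i < 3; ++i ) {                   // Left order: dimension 0 varies fastest
    coord[i] = tile_idx % tile_end[i];
    tile_idx /= tile_end[i];
  }

  // 17 = 1 + 4*( 1 + 3*1 )  ->  tile coordinates (1, 1, 1)
  assert( coord[0] == 1 && coord[1] == 1 && coord[2] == 1 );
  return 0;
}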
#else
template <typename IType>
inline
void
operator()(IType tile_idx) const
{ operator_impl( tile_idx , RankTag<RP::rank>() ); }
// added due to a compiler error when using SFINAE to choose the operator based on rank
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<2> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_2L(index_type, m_tiledims) {
apply( LOOP_ARGS_2 );
}
} else {
// #pragma simd
LOOP_2L(index_type, m_tiledims) {
apply( LOOP_ARGS_2 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_2R(index_type, m_tiledims) {
apply( LOOP_ARGS_2 );
}
} else {
// #pragma simd
LOOP_2R(index_type, m_tiledims) {
apply( LOOP_ARGS_2 );
}
}
} // end RP::Right
} //end op() rank == 2
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<3> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_3L(index_type, m_tiledims) {
apply( LOOP_ARGS_3 );
}
} else {
// #pragma simd
LOOP_3L(index_type, m_tiledims) {
apply( LOOP_ARGS_3 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_3R(index_type, m_tiledims) {
apply( LOOP_ARGS_3 );
}
} else {
// #pragma simd
LOOP_3R(index_type, m_tiledims) {
apply( LOOP_ARGS_3 );
}
}
} // end RP::Right
} //end op() rank == 3
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<4> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_4L(index_type, m_tiledims) {
apply( LOOP_ARGS_4 );
}
} else {
// #pragma simd
LOOP_4L(index_type, m_tiledims) {
apply( LOOP_ARGS_4 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_4R(index_type, m_tiledims) {
apply( LOOP_ARGS_4 );
}
} else {
// #pragma simd
LOOP_4R(index_type, m_tiledims) {
apply( LOOP_ARGS_4 );
}
}
} // end RP::Right
} //end op() rank == 4
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<5> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_5L(index_type, m_tiledims) {
apply( LOOP_ARGS_5 );
}
} else {
// #pragma simd
LOOP_5L(index_type, m_tiledims) {
apply( LOOP_ARGS_5 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_5R(index_type, m_tiledims) {
apply( LOOP_ARGS_5 );
}
} else {
// #pragma simd
LOOP_5R(index_type, m_tiledims) {
apply( LOOP_ARGS_5 );
}
}
} // end RP::Right
} //end op() rank == 5
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<6> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_6L(index_type, m_tiledims) {
apply( LOOP_ARGS_6 );
}
} else {
// #pragma simd
LOOP_6L(index_type, m_tiledims) {
apply( LOOP_ARGS_6 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_6R(index_type, m_tiledims) {
apply( LOOP_ARGS_6 );
}
} else {
// #pragma simd
LOOP_6R(index_type, m_tiledims) {
apply( LOOP_ARGS_6 );
}
}
} // end RP::Right
} //end op() rank == 6
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<7> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_7L(index_type, m_tiledims) {
apply( LOOP_ARGS_7 );
}
} else {
// #pragma simd
LOOP_7L(index_type, m_tiledims) {
apply( LOOP_ARGS_7 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_7R(index_type, m_tiledims) {
apply( LOOP_ARGS_7 );
}
} else {
// #pragma simd
LOOP_7R(index_type, m_tiledims) {
apply( LOOP_ARGS_7 );
}
}
} // end RP::Right
} //end op() rank == 7
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<8> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_8L(index_type, m_tiledims) {
apply( LOOP_ARGS_8 );
}
} else {
// #pragma simd
LOOP_8L(index_type, m_tiledims) {
apply( LOOP_ARGS_8 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_8R(index_type, m_tiledims) {
apply( LOOP_ARGS_8 );
}
} else {
// #pragma simd
LOOP_8R(index_type, m_tiledims) {
apply( LOOP_ARGS_8 );
}
}
} // end RP::Right
} //end op() rank == 8
#endif
template <typename... Args>
typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type
apply(Args &&... args) const
{
m_func(args... , m_v);
}
template <typename... Args>
typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type
apply(Args &&... args) const
{
m_func( m_tag, args... , m_v);
}
RP const& m_rp;
Functor const& m_func;
value_type * m_v;
typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
};
// ------------------------------------------------------------------ //
// MDFunctor - wraps the range_policy and functor to pass to IterateTile
// Serial, Threads, OpenMP
// Used for md_parallel_{for,reduce} with Serial, Threads, OpenMP
// Cuda uses DeviceIterateTile directly within md_parallel_for
// ParallelReduce
// TODO Once md_parallel_{for,reduce} removed, this can be removed
// ParallelReduce - scalar reductions
template < typename MDRange, typename Functor, typename ValueType = void >
struct MDFunctor
{
@ -2273,7 +2767,7 @@ struct MDFunctor
inline
MDFunctor( MDRange const& range, Functor const& f, ValueType & v )
MDFunctor( MDRange const& range, Functor const& f )
: m_range( range )
, m_func( f )
{}
@ -2290,7 +2784,6 @@ struct MDFunctor
inline
MDFunctor& operator=( MDFunctor && ) = default;
// KOKKOS_FORCEINLINE_FUNCTION //Caused cuda warning - __host__ warning
inline
void operator()(index_type t, value_type & v) const
{
@ -2301,6 +2794,56 @@ struct MDFunctor
Functor m_func;
};
// ParallelReduce - array reductions
template < typename MDRange, typename Functor, typename ValueType >
struct MDFunctor< MDRange, Functor, ValueType[] >
{
using range_policy = MDRange;
using functor_type = Functor;
using value_type = ValueType[];
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
>;
inline
MDFunctor( MDRange const& range, Functor const& f )
: m_range( range )
, m_func( f )
, value_count( f.value_count )
{}
inline
MDFunctor( MDFunctor const& ) = default;
inline
MDFunctor& operator=( MDFunctor const& ) = default;
inline
MDFunctor( MDFunctor && ) = default;
inline
MDFunctor& operator=( MDFunctor && ) = default;
// FIXME Init and Join, as defined in m_func, are not working through the MDFunctor
// Best path forward is to eliminate the need for MDFunctor and use MDRangePolicy directly within Parallel{For,Reduce}
inline
void operator()(index_type t, value_type v) const
{
iterate_type(m_range, m_func, v)(t);
}
MDRange m_range;
Functor m_func;
size_t value_count;
};
// ParallelFor
template < typename MDRange, typename Functor >
struct MDFunctor< MDRange, Functor, void >
@ -2349,4 +2892,3 @@ struct MDFunctor< MDRange, Functor, void >
} } } //end namespace Kokkos::Experimental::Impl
#endif

View File

@ -55,16 +55,19 @@ template < typename ExecutionSpace = void
, typename WorkTag = void
, typename IndexType = void
, typename IterationPattern = void
, typename LaunchBounds = void
>
struct PolicyTraitsBase
{
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType,
IterationPattern, LaunchBounds>;
using execution_space = ExecutionSpace;
using schedule_type = Schedule;
using work_tag = WorkTag;
using index_type = IndexType;
using iteration_pattern = IterationPattern;
using launch_bounds = LaunchBounds;
};
@ -78,6 +81,7 @@ struct SetExecutionSpace
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
, typename PolicyBase::launch_bounds
>;
};
@ -91,6 +95,7 @@ struct SetSchedule
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
, typename PolicyBase::launch_bounds
>;
};
@ -104,6 +109,7 @@ struct SetWorkTag
, WorkTag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
, typename PolicyBase::launch_bounds
>;
};
@ -117,6 +123,7 @@ struct SetIndexType
, typename PolicyBase::work_tag
, IndexType
, typename PolicyBase::iteration_pattern
, typename PolicyBase::launch_bounds
>;
};
@ -131,6 +138,22 @@ struct SetIterationPattern
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, IterationPattern
, typename PolicyBase::launch_bounds
>;
};
template <typename PolicyBase, typename LaunchBounds>
struct SetLaunchBounds
{
static_assert( is_void<typename PolicyBase::launch_bounds>::value
, "Kokkos Error: More than one launch_bounds given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
, LaunchBounds
>;
};
@ -146,8 +169,9 @@ struct AnalyzePolicy<Base, T, Traits...> : public
, typename std::conditional< is_index_type<T>::value , SetIndexType<Base,T>
, typename std::conditional< std::is_integral<T>::value , SetIndexType<Base, IndexType<T> >
, typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
, typename std::conditional< is_launch_bounds<T>::value , SetLaunchBounds<Base,T>
, SetWorkTag<Base,T>
>::type >::type >::type >::type>::type::type
>::type >::type >::type >::type >::type>::type::type
, Traits...
>
{};
@ -178,11 +202,18 @@ struct AnalyzePolicy<Base>
, void // TODO set default iteration pattern
, typename Base::iteration_pattern
>::type;
using launch_bounds = typename std::conditional< is_void< typename Base::launch_bounds >::value
, LaunchBounds<>
, typename Base::launch_bounds
>::type;
using type = PolicyTraitsBase< execution_space
, schedule_type
, work_tag
, index_type
, iteration_pattern
, launch_bounds
>;
};
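The launch_bounds trait threaded through PolicyTraitsBase above is what lets an execution policy carry a Kokkos::LaunchBounds<maxThreadsPerBlock, minBlocksPerSM> hint to the CUDA kernel launch. A hedged usage sketch; the bound values 256/4, the Views, and the kernel body are illustrative:

#include <Kokkos_Core.hpp>

void saxpy_with_bounds( Kokkos::View<double*> y, Kokkos::View<const double*> x, double a )
{
  // LaunchBounds<256,4> requests __launch_bounds__(256,4) for the generated CUDA kernel;
  // host backends simply carry the trait along and ignore it.
  typedef Kokkos::RangePolicy< Kokkos::LaunchBounds<256,4> > policy_t;

  Kokkos::parallel_for( policy_t( 0, y.dimension_0() ),
    KOKKOS_LAMBDA( const int i ) { y(i) += a * x(i); } );
}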

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
@ -126,11 +130,21 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
inline
int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
{ return __sync_val_compare_and_swap(dest,compare,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_val_compare_and_swap(dest,compare,val);
}
inline
long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
{ return __sync_val_compare_and_swap(dest,compare,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_val_compare_and_swap(dest,compare,val);
}
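Each hunk above adds the same guarded idiom: an optional _mm_prefetch with _MM_HINT_ET0 (a prefetch-for-write, i.e. read-for-ownership hint) on the destination just before the atomic read-modify-write, so the cache line arrives already in a writable state. A minimal sketch of the idiom; the wrapper name and counter are illustrative:

#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif

inline int fetch_add_one( volatile int * const counter )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Hint the line into an exclusive (writable) state before the locked RMW.
  _mm_prefetch( (const char*) counter, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_add( counter, 1 );
}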
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
@ -159,6 +173,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
KOKKOS_INLINE_FUNCTION U() {};
} tmp ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) );
return tmp.t ;
}
@ -175,6 +193,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
KOKKOS_INLINE_FUNCTION U() {};
} tmp ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) );
return tmp.t ;
}
@ -193,6 +215,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
KOKKOS_INLINE_FUNCTION U() {};
} tmp ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
tmp.i = Impl::cas128( (Impl::cas128_t*) dest , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) );
return tmp.t ;
}
@ -209,6 +235,10 @@ T atomic_compare_exchange( volatile T * const dest , const T compare ,
#endif
, const T >::type& val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
if( return_val == compare ) {

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT_HPP )
#define KOKKOS_ATOMIC_DECREMENT_HPP
@ -54,6 +58,10 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<char>(volatile char* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock decb %0"
: /* no output registers */
@ -69,6 +77,10 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<short>(volatile short* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock decw %0"
: /* no output registers */
@ -84,6 +96,10 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<int>(volatile int* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock decl %0"
: /* no output registers */
@ -99,6 +115,9 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<long long int>(volatile long long int* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock decq %0"
: /* no output registers */

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
#define KOKKOS_ATOMIC_EXCHANGE_HPP
@ -81,6 +85,10 @@ T atomic_exchange(
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
{
// int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) );
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
int tmp = atomicExch( ((int*)dest) , *((int*)&val) );
return *((T*)&tmp);
}
@ -93,6 +101,11 @@ T atomic_exchange(
sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
{
typedef unsigned long long int type ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
// type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) );
type tmp = atomicExch( ((type*)dest) , *((type*)&val) );
return *((T*)&tmp);
@ -108,6 +121,10 @@ T atomic_exchange( volatile T * const dest ,
{
T return_val;
// This is a way to (hopefully) avoid dead lock in a warp
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
int done = 0;
unsigned int active = __ballot(1);
unsigned int done_active = 0;
@ -173,6 +190,9 @@ T atomic_exchange( volatile T * const dest ,
, const T & >::type val )
{
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
const type v = *((type*)&val); // Extract to be sure the value doesn't change
@ -201,6 +221,10 @@ T atomic_exchange( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
, const T & >::type val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
union U {
Impl::cas128_t i ;
T t ;
@ -260,6 +284,10 @@ void atomic_assign( volatile T * const dest ,
{
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
const type v = *((type*)&val); // Extract to be sure the value doesn't change
type assumed ;
@ -285,6 +313,10 @@ void atomic_assign( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
, const T & >::type val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
union U {
Impl::cas128_t i ;
T t ;

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
#define KOKKOS_ATOMIC_FETCH_ADD_HPP
@ -161,36 +165,60 @@ T atomic_fetch_add( volatile T * const dest ,
inline
int atomic_fetch_add( volatile int * dest , const int val )
{
int original = val;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock xadd %1, %0"
: "+m" (*dest), "+r" (original)
: "m" (*dest), "r" (original)
: "memory"
int original = val;
__asm__ __volatile__(
"lock xadd %1, %0"
: "+m" (*dest), "+r" (original)
: "m" (*dest), "r" (original)
: "memory"
);
return original;
return original;
}
#else
inline
int atomic_fetch_add( volatile int * const dest , const int val )
{ return __sync_fetch_and_add(dest, val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_add(dest, val);
}
#endif
inline
long int atomic_fetch_add( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_add(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_add(dest,val);
}
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
inline
unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_add(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_add(dest,val);
}
inline
unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_add(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_add(dest,val);
}
#endif
@ -205,6 +233,10 @@ T atomic_fetch_add( volatile T * const dest ,
inline U() {};
} assume , oldval , newval ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
oldval.t = *dest ;
do {
@ -228,6 +260,10 @@ T atomic_fetch_add( volatile T * const dest ,
inline U() {};
} assume , oldval , newval ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
oldval.t = *dest ;
do {
@ -253,6 +289,10 @@ T atomic_fetch_add( volatile T * const dest ,
inline U() {};
} assume , oldval , newval ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
oldval.t = *dest ;
do {

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
#define KOKKOS_ATOMIC_FETCH_AND_HPP
@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_and( volatile unsigned long long int * const
inline
int atomic_fetch_and( volatile int * const dest , const int val )
{ return __sync_fetch_and_and(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_and(dest,val);
}
inline
long int atomic_fetch_and( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_and(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_and(dest,val);
}
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
inline
unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_and(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_and(dest,val);
}
inline
unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_and(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_and(dest,val);
}
#endif

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
#define KOKKOS_ATOMIC_FETCH_OR_HPP
@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_or( volatile unsigned long long int * const
inline
int atomic_fetch_or( volatile int * const dest , const int val )
{ return __sync_fetch_and_or(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_or(dest,val);
}
inline
long int atomic_fetch_or( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_or(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_or(dest,val);
}
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
inline
unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_or(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_or(dest,val);
}
inline
unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_or(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_or(dest,val);
}
#endif

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
#define KOKKOS_ATOMIC_FETCH_SUB_HPP
@ -136,21 +140,41 @@ T atomic_fetch_sub( volatile T * const dest ,
inline
int atomic_fetch_sub( volatile int * const dest , const int val )
{ return __sync_fetch_and_sub(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_sub(dest,val);
}
inline
long int atomic_fetch_sub( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_sub(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_sub(dest,val);
}
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
inline
unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_sub(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_sub(dest,val);
}
inline
unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_sub(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_sub(dest,val);
}
#endif
@ -161,6 +185,10 @@ T atomic_fetch_sub( volatile T * const dest ,
{
union { int i ; T t ; } assume , oldval , newval ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
oldval.t = *dest ;
do {
@ -178,6 +206,10 @@ T atomic_fetch_sub( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(long) , const T >::type val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
union { long i ; T t ; } assume , oldval , newval ;
oldval.t = *dest ;
@ -202,6 +234,10 @@ T atomic_fetch_sub( volatile T * const dest ,
&& ( sizeof(T) != 8 )
, const T >::type& val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
*dest = return_val - val;

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT_HPP )
#define KOKKOS_ATOMIC_INCREMENT_HPP
@ -52,6 +56,9 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<char>(volatile char* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock incb %0"
: /* no output registers */
@ -67,6 +74,9 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<short>(volatile short* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock incw %0"
: /* no output registers */
@ -82,6 +92,9 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<int>(volatile int* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock incl %0"
: /* no output registers */
@ -97,6 +110,9 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<long long int>(volatile long long int* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock incq %0"
: /* no output registers */

View File

@ -87,17 +87,12 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
#if defined( KOKKOS_ENABLE_OPENMP )
if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
if(num_threads>0) {
if(use_numa>0) {
Kokkos::OpenMP::initialize(num_threads,use_numa);
}
else {
Kokkos::OpenMP::initialize(num_threads);
}
} else {
Kokkos::OpenMP::initialize();
if(use_numa>0) {
Kokkos::OpenMP::initialize(num_threads,use_numa);
}
else {
Kokkos::OpenMP::initialize(num_threads);
}
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
}
else {
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
@ -437,10 +432,7 @@ void initialize(int& narg, char* arg[])
iarg++;
}
InitArguments arguments;
arguments.num_threads = num_threads;
arguments.num_numa = numa;
arguments.device_id = device;
InitArguments arguments{num_threads, numa, device};
Impl::initialize_internal(arguments);
}
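The hunks above only change how the parsed arguments reach initialize_internal; the user-facing entry point is unchanged. A minimal sketch of driving that path from a program's main(), assuming the usual --kokkos-threads / --kokkos-numa / --kokkos-device command-line flags:

#include <Kokkos_Core.hpp>

int main( int argc, char * argv[] )
{
  Kokkos::initialize( argc, argv );   // consumes --kokkos-threads=, --kokkos-numa=, --kokkos-device=
  {
    // ... parallel work goes here, while Kokkos is live ...
  }
  Kokkos::finalize();
  return 0;
}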

View File

@ -170,28 +170,31 @@ struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType:
static_assert( 0 == ( sizeof(value_type) % sizeof(int) ) ,
"Reduction functor's declared value_type requires: 0 == sizeof(value_type) % sizeof(int)" );
/* this cast to bool is needed for correctness by NVCC */
enum : bool { IsArray = static_cast<bool>(Impl::is_array< typename FunctorType::value_type >::value) };
// If not an array, the static size is sizeof(value_type); for an array it is 0
enum { StaticValueSize = Impl::is_array< typename FunctorType::value_type >::value ? 0 : sizeof(value_type) };
enum { StaticValueSize = IsArray ? 0 : sizeof(value_type) };
typedef value_type * pointer_type ;
// The reference_type for an array is 'value_type *'
// The reference_type for a single value is 'value_type &'
typedef typename Impl::if_c< ! StaticValueSize , value_type *
, value_type & >::type reference_type ;
typedef typename Impl::if_c< IsArray , value_type *
, value_type & >::type reference_type ;
// Number of values if single value
template< class F >
KOKKOS_FORCEINLINE_FUNCTION static
typename Impl::enable_if< std::is_same<F,FunctorType>::value && StaticValueSize , unsigned >::type
typename Impl::enable_if< std::is_same<F,FunctorType>::value && ! IsArray , unsigned >::type
value_count( const F & ) { return 1 ; }
// Number of values if an array, protect via templating because 'f.value_count'
// will only exist when the functor declares the value_type to be an array.
template< class F >
KOKKOS_FORCEINLINE_FUNCTION static
typename Impl::enable_if< std::is_same<F,FunctorType>::value && ! StaticValueSize , unsigned >::type
typename Impl::enable_if< std::is_same<F,FunctorType>::value && IsArray , unsigned >::type
value_count( const F & f ) { return f.value_count ; }
// Total size of the value
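The IsArray / value_count split above is what lets a reduction functor declare its value_type as an array and report the element count at run time. A hedged sketch of such a functor; the names and the parallel_reduce call are illustrative, not from this patch:

#include <Kokkos_Core.hpp>

struct ColumnSums {
  typedef double value_type[] ;        // array value_type => IsArray == true
  const Kokkos::View<const double**> A ;
  const unsigned value_count ;         // queried through FunctorValueTraits

  ColumnSums( const Kokkos::View<const double**> & arg_A )
    : A( arg_A ), value_count( (unsigned) arg_A.dimension_1() ) {}

  KOKKOS_INLINE_FUNCTION
  void operator()( const int i , double sums[] ) const
    { for ( unsigned j = 0 ; j < value_count ; ++j ) sums[j] += A(i,j); }
};

// double result[ncol]; Kokkos::parallel_reduce( nrow , ColumnSums(A) , result );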

View File

@ -70,62 +70,6 @@
#ifdef KOKKOS_ENABLE_HBWSPACE
#define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB)
namespace Kokkos {
namespace Experimental {
namespace {
static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ;
typedef int (* QuerySpaceInParallelPtr )();
QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ;
int s_in_parallel_query_count = 0 ;
} // namespace <empty>
void HBWSpace::register_in_parallel( int (*device_in_parallel)() )
{
if ( 0 == device_in_parallel ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel ERROR : given NULL" ) );
}
int i = -1 ;
if ( ! (device_in_parallel)() ) {
for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
}
if ( i < s_in_parallel_query_count ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : called in_parallel" ) );
}
if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
}
for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
if ( i == s_in_parallel_query_count ) {
s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
}
}
int HBWSpace::in_parallel()
{
const int n = s_in_parallel_query_count ;
int i = 0 ;
while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; }
return i < n ;
}
} // namespace Experimental

} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {

View File

@ -106,62 +106,6 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace {
static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ;
typedef int (* QuerySpaceInParallelPtr )();
QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ;
int s_in_parallel_query_count = 0 ;
} // namespace <empty>
void HostSpace::register_in_parallel( int (*device_in_parallel)() )
{
if ( 0 == device_in_parallel ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) );
}
int i = -1 ;
if ( ! (device_in_parallel)() ) {
for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
}
if ( i < s_in_parallel_query_count ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) );
}
if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
}
for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
if ( i == s_in_parallel_query_count ) {
s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
}
}
int HostSpace::in_parallel()
{
const int n = s_in_parallel_query_count ;
int i = 0 ;
while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; }
return i < n ;
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/* Default allocation mechanism */
@ -340,9 +284,6 @@ void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_
}
}
constexpr const char* HostSpace::name() {
return m_name;
}
} // namespace Kokkos
//----------------------------------------------------------------------------

View File

@ -45,7 +45,7 @@
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_Spinwait.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -58,9 +58,11 @@ void HostThreadTeamData::organize_pool
{
bool ok = true ;
memory_fence();
// Verify not already a member of a pool:
for ( int rank = 0 ; rank < size && ok ; ++rank ) {
ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
ok = ( nullptr != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
}
if ( ok ) {
@ -89,7 +91,6 @@ void HostThreadTeamData::organize_pool
mem->m_team_alloc = 1 ;
mem->m_league_rank = rank ;
mem->m_league_size = size ;
mem->m_pool_rendezvous_step = 0 ;
mem->m_team_rendezvous_step = 0 ;
pool[ rank ] = mem ;
}
@ -116,7 +117,6 @@ void HostThreadTeamData::disband_pool()
m_team_alloc = 1 ;
m_league_rank = 0 ;
m_league_size = 1 ;
m_pool_rendezvous_step = 0 ;
m_team_rendezvous_step = 0 ;
}
@ -256,11 +256,6 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer
const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ;
union {
int64_t full ;
int8_t byte[8] ;
} value ;
if ( rank ) {
const int group_begin = rank << shift_byte ; // == rank * size_byte
@ -275,13 +270,14 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer
const int end = group_begin + size_byte < size
? size_byte : size - group_begin ;
value.full = 0 ;
for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step );
int64_t value = 0 ;
store_fence(); // This should not be needed but fixes #742
for ( int i = 0 ; i < end ; ++i ) {
((int8_t*) & value )[i] = int8_t( step );
}
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value.full );
, value );
}
{
@ -316,10 +312,12 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer
const int end = size_byte < size ? 8 : size ;
value.full = 0 ;
for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step );
int64_t value = 0 ;
for ( int i = 1 ; i < end ; ++i ) {
((int8_t *) & value)[i] = int8_t( step );
}
spinwait_until_equal( buffer[ sync_offset ], value.full );
spinwait_until_equal( buffer[ sync_offset ], value );
}
return rank ? 0 : 1 ;

View File

@ -50,6 +50,7 @@
#include <Kokkos_ExecPolicy.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_FunctorAnalysis.hpp>
#include <impl/Kokkos_Rendezvous.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -67,14 +68,12 @@ public:
// Assume upper bounds on number of threads:
// pool size <= 1024 threads
// pool rendezvous <= ( 1024 / 8 ) * 4 + 4 = 2052
// team size <= 64 threads
// team rendezvous <= ( 64 / 8 ) * 4 + 4 = 36
enum : int { max_pool_members = 1024 };
enum : int { max_team_members = 64 };
enum : int { max_pool_rendezvous = ( max_pool_members / 8 ) * 4 + 4 };
enum : int { max_team_rendezvous = ( max_team_members / 8 ) * 4 + 4 };
enum : int { max_pool_rendezvous = rendezvous_buffer_size( max_pool_members ) };
enum : int { max_team_rendezvous = rendezvous_buffer_size( max_team_members ) };
private:
@ -114,7 +113,6 @@ private:
int m_league_size ;
int m_work_chunk ;
int m_steal_rank ; // work stealing rank
int mutable m_pool_rendezvous_step ;
int mutable m_team_rendezvous_step ;
HostThreadTeamData * team_member( int r ) const noexcept
@ -147,6 +145,7 @@ public:
int team_rendezvous( int const root ) const noexcept
{
return 1 == m_team_size ? 1 :
HostThreadTeamData::
rendezvous( m_team_scratch + m_team_rendezvous
, m_team_rendezvous_step
, m_team_size
@ -157,6 +156,7 @@ public:
int team_rendezvous() const noexcept
{
return 1 == m_team_size ? 1 :
HostThreadTeamData::
rendezvous( m_team_scratch + m_team_rendezvous
, m_team_rendezvous_step
, m_team_size
@ -167,6 +167,7 @@ public:
void team_rendezvous_release() const noexcept
{
if ( 1 < m_team_size ) {
HostThreadTeamData::
rendezvous_release( m_team_scratch + m_team_rendezvous
, m_team_rendezvous_step );
}
@ -175,19 +176,30 @@ public:
inline
int pool_rendezvous() const noexcept
{
static constexpr int yield_wait =
#if defined( KOKKOS_COMPILER_IBM )
// If running on IBM POWER architecture the global
// level rendezvous should immediately yield when
// waiting for other threads in the pool to arrive.
1
#else
0
#endif
;
return 1 == m_pool_size ? 1 :
Kokkos::Impl::
rendezvous( m_pool_scratch + m_pool_rendezvous
, m_pool_rendezvous_step
, m_pool_size
, m_pool_rank );
, m_pool_rank
, yield_wait );
}
inline
void pool_rendezvous_release() const noexcept
{
if ( 1 < m_pool_size ) {
rendezvous_release( m_pool_scratch + m_pool_rendezvous
, m_pool_rendezvous_step );
Kokkos::Impl::
rendezvous_release( m_pool_scratch + m_pool_rendezvous );
}
}
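With the pool rendezvous now routed through the free function (and optionally yielding while waiting, hardwired on for IBM POWER), the calling convention stays the familiar one-thread-wins bracket. A hedged sketch using this internal API; do_serial_work() is a hypothetical placeholder:

void do_serial_work();   // hypothetical serial section

// Executed by every thread of the pool with its own HostThreadTeamData 'data':
inline void pool_barrier_with_work( Kokkos::Impl::HostThreadTeamData & data )
{
  if ( data.pool_rendezvous() ) {
    // Only one thread reaches this branch; the rest wait inside pool_rendezvous().
    do_serial_work();
    data.pool_rendezvous_release();    // releases the waiting threads
  }
}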
@ -213,7 +225,6 @@ public:
, m_league_size(1)
, m_work_chunk(0)
, m_steal_rank(0)
, m_pool_rendezvous_step(0)
, m_team_rendezvous_step(0)
{}
@ -406,7 +417,7 @@ fflush(stdout);
// Steal from next team, round robin
// The next team is offset by m_team_alloc if it fits in the pool.
m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ?
m_team_base + m_team_alloc : 0 ;
}

View File

@ -50,51 +50,70 @@
namespace Kokkos {
namespace Profiling {
static initFunction initProfileLibrary = nullptr;
static finalizeFunction finalizeProfileLibrary = nullptr;
static beginFunction beginForCallee = nullptr;
static beginFunction beginScanCallee = nullptr;
static beginFunction beginReduceCallee = nullptr;
static endFunction endForCallee = nullptr;
static endFunction endScanCallee = nullptr;
static endFunction endReduceCallee = nullptr;
static pushFunction pushRegionCallee = nullptr;
static popFunction popRegionCallee = nullptr;
static allocateDataFunction allocateDataCallee = nullptr;
static deallocateDataFunction deallocateDataCallee = nullptr;
static beginDeepCopyFunction beginDeepCopyCallee = nullptr;
static endDeepCopyFunction endDeepCopyCallee = nullptr;
SpaceHandle::SpaceHandle(const char* space_name) {
strncpy(name,space_name,64);
}
bool profileLibraryLoaded() {
return (NULL != initProfileLibrary);
return (nullptr != initProfileLibrary);
}
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginForCallee) {
if(nullptr != beginForCallee) {
Kokkos::fence();
(*beginForCallee)(kernelPrefix.c_str(), devID, kernelID);
}
}
void endParallelFor(const uint64_t kernelID) {
if(NULL != endForCallee) {
if(nullptr != endForCallee) {
Kokkos::fence();
(*endForCallee)(kernelID);
}
}
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginScanCallee) {
if(nullptr != beginScanCallee) {
Kokkos::fence();
(*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID);
}
}
void endParallelScan(const uint64_t kernelID) {
if(NULL != endScanCallee) {
if(nullptr != endScanCallee) {
Kokkos::fence();
(*endScanCallee)(kernelID);
}
}
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginReduceCallee) {
if(nullptr != beginReduceCallee) {
Kokkos::fence();
(*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
}
}
void endParallelReduce(const uint64_t kernelID) {
if(NULL != endReduceCallee) {
if(nullptr != endReduceCallee) {
Kokkos::fence();
(*endReduceCallee)(kernelID);
}
@ -102,31 +121,47 @@ void endParallelReduce(const uint64_t kernelID) {
void pushRegion(const std::string& kName) {
if( NULL != pushRegionCallee ) {
if( nullptr != pushRegionCallee ) {
Kokkos::fence();
(*pushRegionCallee)(kName.c_str());
}
}
void popRegion() {
if( NULL != popRegionCallee ) {
if( nullptr != popRegionCallee ) {
Kokkos::fence();
(*popRegionCallee)();
}
}
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
if(NULL != allocateDataCallee) {
if(nullptr != allocateDataCallee) {
(*allocateDataCallee)(space,label.c_str(),ptr,size);
}
}
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
if(NULL != allocateDataCallee) {
if(nullptr != deallocateDataCallee) {
(*deallocateDataCallee)(space,label.c_str(),ptr,size);
}
}
void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr,
const SpaceHandle src_space, const std::string src_label, const void* src_ptr,
const uint64_t size) {
if(nullptr != beginDeepCopyCallee) {
(*beginDeepCopyCallee)(dst_space, dst_label.c_str(), dst_ptr,
src_space, src_label.c_str(), src_ptr,
size);
}
}
void endDeepCopy() {
if(nullptr != endDeepCopyCallee) {
(*endDeepCopyCallee)();
}
}
void initialize() {
// Make sure initialize calls happens only once
@ -140,7 +175,7 @@ void initialize() {
// If we do not find a profiling library in the environment then exit
// early.
if( NULL == envProfileLibrary ) {
if( nullptr == envProfileLibrary ) {
return ;
}
@ -149,10 +184,10 @@ void initialize() {
char* profileLibraryName = strtok(envProfileCopy, ";");
if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
if( (nullptr != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
if(NULL == firstProfileLibrary) {
if(nullptr == firstProfileLibrary) {
std::cerr << "Error: Unable to load KokkosP library: " <<
profileLibraryName << std::endl;
} else {
@ -191,14 +226,19 @@ void initialize() {
auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data");
deallocateDataCallee = *((deallocateDataFunction*) &p12);
auto p13 = dlsym(firstProfileLibrary, "kokkosp_begin_deep_copy");
beginDeepCopyCallee = *((beginDeepCopyFunction*) &p13);
auto p14 = dlsym(firstProfileLibrary, "kokkosp_end_deep_copy");
endDeepCopyCallee = *((endDeepCopyFunction*) &p14);
}
}
if(NULL != initProfileLibrary) {
if(nullptr != initProfileLibrary) {
(*initProfileLibrary)(0,
(uint64_t) KOKKOSP_INTERFACE_VERSION,
(uint32_t) 0,
NULL);
nullptr);
}
free(envProfileCopy);
@ -210,28 +250,30 @@ void finalize() {
if(is_finalized) return;
is_finalized = 1;
if(NULL != finalizeProfileLibrary) {
if(nullptr != finalizeProfileLibrary) {
(*finalizeProfileLibrary)();
// Set all profile hooks to NULL to prevent
// Set all profile hooks to nullptr to prevent
// any additional calls. Once we are told to
// finalize, we mean it
initProfileLibrary = NULL;
finalizeProfileLibrary = NULL;
initProfileLibrary = nullptr;
finalizeProfileLibrary = nullptr;
beginForCallee = NULL;
beginScanCallee = NULL;
beginReduceCallee = NULL;
endScanCallee = NULL;
endForCallee = NULL;
endReduceCallee = NULL;
beginForCallee = nullptr;
beginScanCallee = nullptr;
beginReduceCallee = nullptr;
endScanCallee = nullptr;
endForCallee = nullptr;
endReduceCallee = nullptr;
pushRegionCallee = NULL;
popRegionCallee = NULL;
pushRegionCallee = nullptr;
popRegionCallee = nullptr;
allocateDataCallee = NULL;
deallocateDataCallee = NULL;
allocateDataCallee = nullptr;
deallocateDataCallee = nullptr;
beginDeepCopyCallee = nullptr;
endDeepCopyCallee = nullptr;
}
}
}
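The two new dlsym lookups mean a KokkosP tool can now observe deep copies by exporting kokkosp_begin_deep_copy and kokkosp_end_deep_copy. A minimal, hedged tool sketch, built as a shared library and selected through the profiling-library environment variable; the SpaceHandle layout is assumed to mirror the char[64] name shown above:

// deep_copy_tool.cpp -- sketch of a KokkosP tool implementing only the new hooks.
#include <cstdint>
#include <cstdio>

struct SpaceHandle { char name[64] ; };   // assumed to match Kokkos::Profiling::SpaceHandle

extern "C" void kokkosp_begin_deep_copy( SpaceHandle dst_space , const char * dst_label , const void * dst_ptr
                                       , SpaceHandle src_space , const char * src_label , const void * src_ptr
                                       , uint64_t size )
{
  printf( "deep_copy %s:%s -> %s:%s : %llu bytes\n"
        , src_space.name , src_label , dst_space.name , dst_label
        , (unsigned long long) size );
  (void) dst_ptr ; (void) src_ptr ;
}

extern "C" void kokkosp_end_deep_copy() {}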

View File

@ -81,23 +81,11 @@ typedef void (*popFunction)();
typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
static initFunction initProfileLibrary = NULL;
static finalizeFunction finalizeProfileLibrary = NULL;
static beginFunction beginForCallee = NULL;
static beginFunction beginScanCallee = NULL;
static beginFunction beginReduceCallee = NULL;
static endFunction endForCallee = NULL;
static endFunction endScanCallee = NULL;
static endFunction endReduceCallee = NULL;
static pushFunction pushRegionCallee = NULL;
static popFunction popRegionCallee = NULL;
static allocateDataFunction allocateDataCallee = NULL;
static deallocateDataFunction deallocateDataCallee = NULL;
typedef void (*beginDeepCopyFunction)(
SpaceHandle, const char*, const void*,
SpaceHandle, const char*, const void*,
uint64_t);
typedef void (*endDeepCopyFunction)();
bool profileLibraryLoaded();
@ -114,35 +102,14 @@ void popRegion();
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr,
const SpaceHandle src_space, const std::string src_label, const void* src_ptr,
const uint64_t size);
void endDeepCopy();
void initialize();
void finalize();
//Define finalize_fake inline to get rid of warnings for unused static variables
inline void finalize_fake() {
if(NULL != finalizeProfileLibrary) {
(*finalizeProfileLibrary)();
// Set all profile hooks to NULL to prevent
// any additional calls. Once we are told to
// finalize, we mean it
beginForCallee = NULL;
beginScanCallee = NULL;
beginReduceCallee = NULL;
endScanCallee = NULL;
endForCallee = NULL;
endReduceCallee = NULL;
allocateDataCallee = NULL;
deallocateDataCallee = NULL;
initProfileLibrary = NULL;
finalizeProfileLibrary = NULL;
pushRegionCallee = NULL;
popRegionCallee = NULL;
}
}
}
}

View File

@ -0,0 +1,208 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Rendezvous.hpp>
#include <impl/Kokkos_Spinwait.hpp>
namespace Kokkos { namespace Impl {
//----------------------------------------------------------------------------
/* pattern for rendezvous
*
* if ( rendezvous() ) {
* ... all other threads are still in team_rendezvous() ...
* rendezvous_release();
* ... all other threads are released from team_rendezvous() ...
* }
*/
int rendezvous( volatile int64_t * const buffer
, int const size
, int const rank
, int const slow
) noexcept
{
enum : int { shift_byte = 3 };
enum : int { size_byte = ( 01 << shift_byte ) }; // == 8
enum : int { mask_byte = size_byte - 1 };
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
// Cycle step values: 1 <= step <= size_val_cycle
// An odd multiple of memory cycle so that when a memory location
// is reused it has a different value.
// Must be representable within a single byte: size_val_cycle < 16
enum : int { size_val_cycle = 3 * size_mem_cycle };
// Requires:
// Called by rank = [ 0 .. size )
// buffer aligned to int64_t[4]
// A sequence of rendezvous uses four cycled locations in memory
// and non-equal cycled synchronization values to
// 1) prevent rendezvous from overtaking one another and
// 2) give each spin wait location an int64_t[4] span
// so that it has its own cache line.
const int64_t step = (buffer[0] % size_val_cycle ) + 1 ;
// The leading int64_t[4] span is for thread 0 to write
// and all other threads to read spin-wait.
// sync_offset is the index into this array for this step.
const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle + size_mem_cycle ;
if ( rank ) {
const int group_begin = rank << shift_byte ; // == rank * size_byte
if ( group_begin < size ) {
// This thread waits for threads
// [ group_begin .. group_begin + 8 )
// [ rank*8 .. rank*8 + 8 )
// to write to their designated bytes.
const int end = group_begin + size_byte < size
? size_byte : size - group_begin ;
int64_t value = 0;
for ( int i = 0 ; i < end ; ++i ) {
value |= step << (i * size_byte );
}
store_fence(); // This should not be needed but fixes #742
if ( slow ) {
yield_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value );
}
else {
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value );
}
}
{
// This thread sets its designated byte.
// ( rank % size_byte ) +
// ( ( rank / size_byte ) * size_byte * size_mem_cycle ) +
// ( sync_offset * size_byte )
const int offset = ( rank & mask_byte )
+ ( ( rank & ~mask_byte ) << shift_mem_cycle )
+ ( sync_offset << shift_byte );
// All of this thread's previous memory stores must be complete before
// this thread stores the step value at this thread's designated byte
// in the shared synchronization array.
Kokkos::memory_fence();
((volatile int8_t*) buffer)[ offset ] = int8_t( step );
// Memory fence to push the previous store out
Kokkos::memory_fence();
}
// Wait for thread 0 to release all other threads
if ( slow ) {
yield_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) );
}
else {
spinwait_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) );
}
}
else {
// Thread 0 waits for threads [1..7]
// to write to their designated bytes.
const int end = size_byte < size ? 8 : size ;
int64_t value = 0;
for ( int i = 1 ; i < end ; ++i ) {
value |= step << (i * size_byte );
}
if ( slow ) {
yield_until_equal( buffer[ sync_offset ], value );
}
else {
spinwait_until_equal( buffer[ sync_offset ], value );
}
}
return rank ? 0 : 1 ;
}
void rendezvous_release( volatile int64_t * const buffer ) noexcept
{
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
enum : int { size_val_cycle = 3 * size_mem_cycle };
// Requires:
// Called after team_rendezvous
// Called only by true == team_rendezvous(root)
// update step
const int64_t step = (buffer[0] % size_val_cycle ) + 1;
buffer[0] = step;
// Memory fence to be sure all previous writes are complete:
Kokkos::memory_fence();
buffer[ (step & mask_mem_cycle) + size_mem_cycle ] = step;
// Memory fence to push the store out
Kokkos::memory_fence();
}
}} // namespace Kokkos::Impl
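The waiting logic above boils down to comparing one cache-line-aligned int64_t against a value whose bytes each hold the current step for one member of an eight-thread group. A standalone, hedged illustration of that packing; the numbers are chosen only for the example:

#include <cstdint>
#include <cassert>

int main()
{
  const int64_t step = 3 ;      // 1 <= step <= 12, so it always fits in a byte
  int64_t expected = 0 ;
  for ( int i = 0 ; i < 8 ; ++i ) {
    expected |= step << ( i * 8 ) ;   // byte i becomes 'step' once thread i arrives
  }
  assert( expected == int64_t(0x0303030303030303) ) ;
  return 0 ;
}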

View File

@ -0,0 +1,87 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_RENDEZVOUS_HPP
#define KOKKOS_IMPL_RENDEZVOUS_HPP
#include <cstdint>
namespace Kokkos { namespace Impl {
inline
constexpr int rendezvous_buffer_size( int max_members ) noexcept
{
return (((max_members + 7) / 8) * 4) + 4 + 4;
}
/** \brief Thread pool rendezvous
*
* Rendezvous pattern:
* if ( rendezvous(root) ) {
* ... only root thread here while all others wait ...
* rendezvous_release();
* }
* else {
* ... all other threads release here ...
* }
*
* Requires: buffer[ rendezvous_buffer_size( max_threads ) ];
*
* When slow != 0 the expectation is thread arrival will be
* slow so the threads that arrive early should quickly yield
* their core to the runtime thus possibly allowing the late
* arriving threads to have more resources
* (e.g., power and clock frequency).
*/
int rendezvous( volatile int64_t * const buffer
, int const size
, int const rank
, int const slow = 0 ) noexcept ;
void rendezvous_release( volatile int64_t * const buffer ) noexcept ;
}} // namespace Kokkos::Impl
#endif // KOKKOS_IMPL_RENDEZVOUS_HPP
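A hedged sketch of how a host thread pool might drive these functions; the buffer is shared, zero-initialized, and sized in int64_t entries by rendezvous_buffer_size, while thread creation itself is elided:

#include <cstdint>
#include <impl/Kokkos_Rendezvous.hpp>

// Every pool thread calls this with the same 'buffer' and its own 'rank'.
inline void pool_barrier( volatile int64_t * const buffer , int const size , int const rank )
{
  if ( Kokkos::Impl::rendezvous( buffer , size , rank ) ) {
    // Exactly one thread arrives here; all others wait inside rendezvous().
    Kokkos::Impl::rendezvous_release( buffer );
  }
  // All 'size' threads have passed the barrier once they return.
}

// Owner-side sizing, done once before the threads start:
//   int64_t buffer[ Kokkos::Impl::rendezvous_buffer_size( pool_size ) ] = { 0 };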

View File

@ -50,6 +50,7 @@
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_SharedAlloc.hpp>
/*--------------------------------------------------------------------------*/
@ -123,7 +124,6 @@ void serial_resize_thread_team_data( size_t pool_reduce_bytes
}
}
// Get thread team data structure for omp_get_thread_num()
HostThreadTeamData * serial_get_thread_team_data()
{
return & g_serial_thread_team_data ;
@ -151,6 +151,8 @@ void Serial::initialize( unsigned threads_count
(void) use_cores_per_numa;
(void) allow_asynchronous_threadpool;
Impl::SharedAllocationRecord< void, void >::tracking_enable();
// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if defined(KOKKOS_ENABLE_PROFILING)

View File

@ -62,7 +62,7 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
{
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
@ -122,7 +122,7 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
{
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;

View File

@ -65,7 +65,7 @@ public:
using execution_space = Kokkos::Serial ;
using memory_space = Kokkos::HostSpace ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using task_base_type = Kokkos::Impl::TaskBase< void , void , void > ;
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
static

View File

@ -0,0 +1,102 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP
#define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::Serial
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Serial,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Serial, Traits ... > Base ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
public:
inline
void execute()
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
}
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP */

View File

@ -46,23 +46,23 @@
namespace Kokkos {
namespace Impl {
int SharedAllocationRecord< void , void >::s_tracking_enabled = 1 ;
namespace {
void SharedAllocationRecord< void , void >::tracking_claim_and_disable()
{
// A host thread claim and disable tracking flag
__thread int t_tracking_enabled = 1;
while ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 1, 0 ) );
}
void SharedAllocationRecord< void , void >::tracking_release_and_enable()
{
// The host thread that claimed and disabled the tracking flag
// now release and enable tracking.
int SharedAllocationRecord< void , void >::tracking_enabled()
{ return t_tracking_enabled; }
if ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 0, 1 ) ){
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord<>::tracking_release_and_enable FAILED, this host process thread did not hold the lock" );
}
void SharedAllocationRecord< void , void >::tracking_disable()
{
t_tracking_enabled = 0;
}
void SharedAllocationRecord< void , void >::tracking_enable()
{
t_tracking_enabled = 1;
}
//----------------------------------------------------------------------------

View File

@ -71,6 +71,9 @@ public:
KOKKOS_INLINE_FUNCTION static
const SharedAllocationHeader * get_header( void * alloc_ptr )
{ return reinterpret_cast<SharedAllocationHeader*>( reinterpret_cast<char*>(alloc_ptr) - sizeof(SharedAllocationHeader) ); }
KOKKOS_INLINE_FUNCTION
const char* label() const { return m_label; }
};
template<>
@ -83,8 +86,6 @@ protected:
typedef void (* function_type )( SharedAllocationRecord<void,void> * );
static int s_tracking_enabled ;
SharedAllocationHeader * const m_alloc_ptr ;
size_t const m_alloc_size ;
function_type const m_dealloc ;
@ -110,17 +111,17 @@ protected:
public:
inline std::string get_label() const { return std::string("Unmanaged"); }
static int tracking_enabled() { return s_tracking_enabled ; }
static int tracking_enabled();
/**\brief A host process thread claims and disables the
* shared allocation tracking flag.
*/
static void tracking_claim_and_disable();
static void tracking_disable();
/**\brief A host process thread releases and enables the
* shared allocation tracking flag.
*/
static void tracking_release_and_enable();
static void tracking_enable();
~SharedAllocationRecord() = default ;
@ -317,6 +318,11 @@ public:
#endif
}
KOKKOS_INLINE_FUNCTION
bool has_record() const {
return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0;
}
KOKKOS_FORCEINLINE_FUNCTION
~SharedAllocationTracker()
{ KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT }
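tracking_disable() and tracking_enable() now flip a thread-local flag instead of spinning on a process-wide compare-and-swap, so the bracket is cheap and affects only the calling host thread. A hedged sketch of that bracket, shown only to illustrate the new internal calls:

#include <impl/Kokkos_SharedAlloc.hpp>

inline void with_tracking_disabled()
{
  using RecordBase = Kokkos::Impl::SharedAllocationRecord< void , void > ;

  RecordBase::tracking_disable();     // this thread only; others are unaffected
  // ... allocations made here are not reference counted ...
  RecordBase::tracking_enable();
}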

View File

@ -0,0 +1,210 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_BitOps.hpp>
#if defined( KOKKOS_ENABLE_STDTHREAD )
#include <thread>
#elif !defined( _WIN32 )
#include <sched.h>
#include <time.h>
#else
#include <process.h>
#include <winsock2.h>
#include <windows.h>
#endif
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace {
void host_thread_yield( const uint32_t i , const int force_yield )
{
static constexpr uint32_t sleep_limit = 1 << 13 ;
static constexpr uint32_t yield_limit = 1 << 12 ;
const int c = Kokkos::Impl::bit_scan_reverse(i);
if ( sleep_limit < i ) {
// Attempt to put the thread to sleep for 'c' milliseconds
#if defined( KOKKOS_ENABLE_STDTHREAD )
std::this_thread::sleep_for( std::chrono::nanoseconds( c * 1000 ) );
#elif !defined( _WIN32 )
timespec req ;
req.tv_sec = 0 ;
req.tv_nsec = 1000 * c ;
nanosleep( &req, nullptr );
#else /* defined( _WIN32 ) IS Microsoft Windows */
Sleep(c);
#endif
}
else if ( force_yield || yield_limit < i ) {
// Attempt to yield thread resources to runtime
#if defined( KOKKOS_ENABLE_STDTHREAD )
std::this_thread::yield();
#elif !defined( _WIN32 )
sched_yield();
#else /* defined( _WIN32 ) IS Microsoft Windows */
YieldProcessor();
#endif
}
#if defined( KOKKOS_ENABLE_ASM )
else if ( (1u<<4) < i ) {
// Insert a few no-ops to quiet the thread:
for ( int k = 0 ; k < c ; ++k ) {
#if defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ )
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
asm volatile( "nop\n" );
#else
__asm__ __volatile__( "nop\n" );
#endif
#elif defined(__PPC64__)
asm volatile( "nop\n" );
#endif
}
}
{
// Insert memory pause
#if defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ )
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
asm volatile( "pause\n":::"memory" );
#else
__asm__ __volatile__( "pause\n":::"memory" );
#endif
#elif defined(__PPC64__)
asm volatile( "or 27, 27, 27" ::: "memory" );
#endif
}
#endif /* defined( KOKKOS_ENABLE_ASM ) */
}
}}} // namespace Kokkos::Impl::{anonymous}
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void yield_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
} /* namespace Impl */
} /* namespace Kokkos */
#else
void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {}
#endif
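From a caller's perspective the new yield_* entry points are drop-in siblings of the spinwait_* ones, trading raw latency for politeness to the runtime. A hedged sketch with an illustrative flag:

#include <cstdint>
#include <impl/Kokkos_Spinwait.hpp>

// One thread eventually stores 1 into 'ready'; the waiters back off through
// pause instructions, then sched_yield, then short sleeps (thresholds above).
inline void wait_busy( volatile int32_t & ready )
{ Kokkos::Impl::spinwait_until_equal( ready , 1 ); }

// Variant that yields immediately, for when arrival is expected to be slow:
inline void wait_polite( volatile int32_t & ready )
{ Kokkos::Impl::yield_until_equal( ready , 1 ); }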

View File

@ -59,6 +59,13 @@ void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
void yield_while_equal( volatile int32_t & flag , const int32_t value );
void yield_until_equal( volatile int32_t & flag , const int32_t value );
void yield_while_equal( volatile int64_t & flag , const int64_t value );
void yield_until_equal( volatile int64_t & flag , const int64_t value );
#else
KOKKOS_INLINE_FUNCTION
@ -71,6 +78,16 @@ void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int64_t & , const int64_t ) {}
#endif
} /* namespace Impl */

View File

@ -59,24 +59,15 @@
namespace Kokkos {
namespace Impl {
/*\brief Implementation data for task data management, access, and execution.
*
* Curiously recurring template pattern (CRTP)
* to allow static_cast from the
* task root type and a task's FunctorType.
*
* TaskBase< Space , ResultType , FunctorType >
* : TaskBase< Space , ResultType , void >
* , FunctorType
* { ... };
*
* TaskBase< Space , ResultType , void >
* : TaskBase< Space , void , void >
* { ... };
*/
template< typename Space , typename ResultType , typename FunctorType >
template< class Space , typename ResultType , class FunctorType >
class TaskBase ;
template< typename Space >
class TaskQueue ;
template< typename Space >
class TaskQueueSpecialization ;
} /* namespace Impl */
} /* namespace Kokkos */
@ -86,8 +77,217 @@ class TaskBase ;
namespace Kokkos {
namespace Impl {
template< typename Space >
class TaskQueueSpecialization ;
/** \brief Base class for task management, access, and execution.
*
* Inheritance structure to allow static_cast from the task root type
* and a task's FunctorType.
*
* // Enable a functor to access the base class
* // and provide memory for result value.
* TaskBase< Space , ResultType , FunctorType >
* : TaskBase< void , void , void >
* , FunctorType
* { ... };
* Followed by memory allocated for result value.
*
*
* States of a task:
*
* Constructing State, NOT IN a linked list
* m_wait == 0
* m_next == 0
*
* Scheduling transition : Constructing -> Waiting
* before:
* m_wait == 0
* m_next == this task's initial dependence, 0 if none
* after:
* m_wait == EndTag
* m_next == EndTag
*
* Waiting State, IN a linked list
* m_apply != 0
* m_queue != 0
* m_ref_count > 0
* m_wait == head of linked list of tasks waiting on this task
* m_next == next of linked list of tasks
*
* transition : Waiting -> Executing
* before:
* m_next == EndTag
* after:
* m_next == LockTag
*
* Executing State, NOT IN a linked list
* m_apply != 0
* m_queue != 0
* m_ref_count > 0
* m_wait == head of linked list of tasks waiting on this task
* m_next == LockTag
*
* Respawn transition : Executing -> Executing-Respawn
* before:
* m_next == LockTag
* after:
* m_next == this task's updated dependence, 0 if none
*
* Executing-Respawn State, NOT IN a linked list
* m_apply != 0
* m_queue != 0
* m_ref_count > 0
* m_wait == head of linked list of tasks waiting on this task
* m_next == this task's updated dependence, 0 if none
*
* transition : Executing -> Complete
* before:
* m_wait == head of linked list
* after:
* m_wait == LockTag
*
* Complete State, NOT IN a linked list
* m_wait == LockTag: cannot add dependence (<=> complete)
* m_next == LockTag: not a member of a wait queue
*
*/
template<>
class TaskBase< void , void , void >
{
public:
enum : int16_t { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 };
enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) };
template< typename > friend class Kokkos::TaskScheduler ;
typedef TaskQueue< void > queue_type ;
typedef void (* function_type) ( TaskBase * , void * );
// sizeof(TaskBase) == 48
function_type m_apply ; ///< Apply function pointer
queue_type * m_queue ; ///< Pointer to queue
TaskBase * m_wait ; ///< Linked list of tasks waiting on this
TaskBase * m_next ; ///< Waiting linked-list next
int32_t m_ref_count ; ///< Reference count
int32_t m_alloc_size ; ///< Allocation size
int32_t m_dep_count ; ///< Aggregate's number of dependences
int16_t m_task_type ; ///< Type of task
int16_t m_priority ; ///< Priority of runnable task
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
TaskBase & operator = ( const TaskBase & ) = delete ;
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
KOKKOS_INLINE_FUNCTION constexpr
TaskBase()
: m_apply( 0 )
, m_queue( 0 )
, m_wait( 0 )
, m_next( 0 )
, m_ref_count( 0 )
, m_alloc_size( 0 )
, m_dep_count( 0 )
, m_task_type( 0 )
, m_priority( 0 )
{}
//----------------------------------------
KOKKOS_INLINE_FUNCTION
TaskBase * volatile * aggregate_dependences() volatile
{ return reinterpret_cast<TaskBase*volatile*>( this + 1 ); }
KOKKOS_INLINE_FUNCTION
bool requested_respawn()
{
// This should only be called when a task has finished executing and is
// in the transition to either the complete or executing-respawn state.
TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag );
return lock != m_next;
}
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskBase* dep )
{
// Precondition: lock == m_next
TaskBase * const lock = (TaskBase *) LockTag ;
// Assign dependence to m_next. It will be processed in the subsequent
// call to schedule. Error if the dependence is reset.
if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
}
if ( 0 != dep ) {
// The future may be destroyed upon returning from this call
// so increment reference count to track this assignment.
Kokkos::atomic_increment( &(dep->m_ref_count) );
}
}
//----------------------------------------
KOKKOS_INLINE_FUNCTION
int32_t reference_count() const
{ return *((int32_t volatile *)( & m_ref_count )); }
};
static_assert( sizeof(TaskBase<void,void,void>) == 48
, "Verifying expected sizeof(TaskBase<void,void,void>)" );
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< typename ResultType >
struct TaskResult {
enum : int32_t { size = sizeof(ResultType) };
using reference_type = ResultType & ;
KOKKOS_INLINE_FUNCTION static
ResultType * ptr( TaskBase<void,void,void> * task )
{
return reinterpret_cast< ResultType * >
( reinterpret_cast< char * >(task) + task->m_alloc_size - sizeof(ResultType) );
}
KOKKOS_INLINE_FUNCTION static
reference_type get( TaskBase<void,void,void> * task )
{ return *ptr( task ); }
};
template<>
struct TaskResult< void > {
enum : int32_t { size = 0 };
using reference_type = void ;
KOKKOS_INLINE_FUNCTION static
void * ptr( TaskBase<void,void,void> * ) { return (void*) 0 ; }
KOKKOS_INLINE_FUNCTION static
reference_type get( TaskBase<void,void,void> * ) {}
};
} /* namespace Impl */
} /* namespace Kokkos */
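Under the new scheme a task's result is no longer a member of a derived TaskBase; it lives at the tail of the task's allocation and TaskResult recovers it purely from m_alloc_size. A standalone, hedged illustration of that address arithmetic with made-up sizes:

#include <cassert>
#include <cstdint>

int main()
{
  // Pretend the task header plus functor occupy 64 bytes and the allocation
  // is 80 bytes; TaskResult<double>::ptr then lands 8 bytes before the end.
  const int32_t alloc_size  = 80 ;
  const int32_t result_size = (int32_t) sizeof(double) ;
  char allocation[ 80 ] = { 0 } ;

  char * const result = allocation + alloc_size - result_size ;
  assert( result == allocation + 72 ) ;
  return 0 ;
}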
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class TaskQueue< void > {};
/** \brief Manage task allocation, deallocation, and scheduling.
*
@ -95,7 +295,7 @@ class TaskQueueSpecialization ;
* All other aspects of task management have shared implementation.
*/
template< typename ExecSpace >
class TaskQueue {
class TaskQueue : public TaskQueue<void> {
private:
friend class TaskQueueSpecialization< ExecSpace > ;
@ -106,7 +306,7 @@ private:
using memory_space = typename specialization::memory_space ;
using device_type = Kokkos::Device< execution_space , memory_space > ;
using memory_pool = Kokkos::MemoryPool< device_type > ;
using task_root_type = Kokkos::Impl::TaskBase<execution_space,void,void> ;
using task_root_type = Kokkos::Impl::TaskBase<void,void,void> ;
struct Destroy {
TaskQueue * m_queue ;
@ -198,12 +398,10 @@ public:
}
// Assign task pointer with reference counting of assigned tasks
template< typename LV , typename RV >
KOKKOS_FUNCTION static
void assign( TaskBase< execution_space,LV,void> ** const lhs
, TaskBase< execution_space,RV,void> * const rhs )
void assign( task_root_type ** const lhs
, task_root_type * const rhs )
{
using task_lhs = TaskBase< execution_space,LV,void> ;
#if 0
{
printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n"
@ -225,7 +423,7 @@ public:
// Force write of *lhs
*static_cast< task_lhs * volatile * >(lhs) = rhs ;
*static_cast< task_root_type * volatile * >(lhs) = rhs ;
Kokkos::memory_fence();
}
@ -238,6 +436,38 @@ public:
KOKKOS_FUNCTION
void deallocate( void * p , size_t n ); ///< Deallocate to the memory pool
//----------------------------------------
/**\brief Allocation size for a spawned task */
template< typename FunctorType >
KOKKOS_FUNCTION
size_t spawn_allocation_size() const
{
using value_type = typename FunctorType::value_type ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
enum : size_t { align = ( 1 << 4 ) , align_mask = align - 1 };
enum : size_t { task_size = sizeof(task_type) };
enum : size_t { result_size = Impl::TaskResult< value_type >::size };
enum : size_t { alloc_size =
( ( task_size + align_mask ) & ~align_mask ) +
( ( result_size + align_mask ) & ~align_mask ) };
return m_memory.allocate_block_size( task_size );
}
/**\brief Allocation size for a when_all aggregate */
KOKKOS_FUNCTION
size_t when_all_allocation_size( int narg ) const
{
return m_memory.allocate_block_size( sizeof(task_root_type) + narg * sizeof(task_root_type*) );
}
};
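spawn_allocation_size rounds the task-plus-functor object and the result slot up to 16 bytes each before consulting the memory pool. A worked example of the enum arithmetic under assumed sizes:

#include <cstddef>

// Assumed sizes for illustration: a 56-byte task+functor and an 8-byte result.
enum : size_t { task_size = 56 , result_size = 8 };
enum : size_t { align = ( 1 << 4 ) , align_mask = align - 1 };
enum : size_t { alloc_size = ( ( task_size   + align_mask ) & ~align_mask )
                           + ( ( result_size + align_mask ) & ~align_mask ) };

static_assert( alloc_size == 64 + 16 , "rounds to a 64-byte task part plus a 16-byte result part" );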
} /* namespace Impl */
@ -249,261 +479,9 @@ public:
namespace Kokkos {
namespace Impl {
template<>
class TaskBase< void , void , void > {
public:
enum : int16_t { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 };
enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) };
};
/** \brief Base class for task management, access, and execution.
*
* Inheritance structure to allow static_cast from the task root type
* and a task's FunctorType.
*
* // Enable a Future to access result data
* TaskBase< Space , ResultType , void >
* : TaskBase< void , void , void >
* { ... };
*
* // Enable a functor to access the base class
* TaskBase< Space , ResultType , FunctorType >
* : TaskBase< Space , ResultType , void >
* , FunctorType
* { ... };
*
*
* States of a task:
*
* Constructing State, NOT IN a linked list
* m_wait == 0
* m_next == 0
*
* Scheduling transition : Constructing -> Waiting
* before:
* m_wait == 0
* m_next == this task's initial dependence, 0 if none
* after:
* m_wait == EndTag
* m_next == EndTag
*
* Waiting State, IN a linked list
* m_apply != 0
* m_queue != 0
* m_ref_count > 0
* m_wait == head of linked list of tasks waiting on this task
* m_next == next of linked list of tasks
*
* transition : Waiting -> Executing
* before:
* m_next == EndTag
* after::
* m_next == LockTag
*
* Executing State, NOT IN a linked list
* m_apply != 0
* m_queue != 0
* m_ref_count > 0
* m_wait == head of linked list of tasks waiting on this task
* m_next == LockTag
*
* Respawn transition : Executing -> Executing-Respawn
* before:
* m_next == LockTag
* after:
* m_next == this task's updated dependence, 0 if none
*
* Executing-Respawn State, NOT IN a linked list
* m_apply != 0
* m_queue != 0
* m_ref_count > 0
* m_wait == head of linked list of tasks waiting on this task
* m_next == this task's updated dependence, 0 if none
*
* transition : Executing -> Complete
* before:
* m_wait == head of linked list
* after:
* m_wait == LockTag
*
* Complete State, NOT IN a linked list
* m_wait == LockTag: cannot add dependence
* m_next == LockTag: not a member of a wait queue
*
*/
template< typename ExecSpace >
class TaskBase< ExecSpace , void , void >
{
public:
enum : int16_t { TaskTeam = TaskBase<void,void,void>::TaskTeam
, TaskSingle = TaskBase<void,void,void>::TaskSingle
, Aggregate = TaskBase<void,void,void>::Aggregate };
enum : uintptr_t { LockTag = TaskBase<void,void,void>::LockTag
, EndTag = TaskBase<void,void,void>::EndTag };
using execution_space = ExecSpace ;
using queue_type = TaskQueue< execution_space > ;
template< typename > friend class Kokkos::TaskScheduler ;
typedef void (* function_type) ( TaskBase * , void * );
// sizeof(TaskBase) == 48
function_type m_apply ; ///< Apply function pointer
queue_type * m_queue ; ///< Queue in which this task resides
TaskBase * m_wait ; ///< Linked list of tasks waiting on this
TaskBase * m_next ; ///< Waiting linked-list next
int32_t m_ref_count ; ///< Reference count
int32_t m_alloc_size ; ///< Allocation size
int32_t m_dep_count ; ///< Aggregate's number of dependences
int16_t m_task_type ; ///< Type of task
int16_t m_priority ; ///< Priority of runnable task
TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
TaskBase & operator = ( const TaskBase & ) = delete ;
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
// Constructor for a runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, TaskBase * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
) noexcept
: m_apply( arg_apply )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( arg_dependence )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( 0 )
, m_task_type( arg_task_type )
, m_priority( arg_priority )
{}
// Constructor for an aggregate task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( queue_type * arg_queue
, int arg_ref_count
, int arg_alloc_size
, int arg_dep_count
) noexcept
: m_apply( 0 )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( 0 )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( arg_dep_count )
, m_task_type( Aggregate )
, m_priority( 0 )
{}
//----------------------------------------
KOKKOS_INLINE_FUNCTION
TaskBase ** aggregate_dependences()
{ return reinterpret_cast<TaskBase**>( this + 1 ); }
KOKKOS_INLINE_FUNCTION
bool requested_respawn()
{
// This should only be called when a task has finished executing and is
// in the transition to either the complete or executing-respawn state.
TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag );
return lock != m_next;
}
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskBase* dep )
{
// Precondition: lock == m_next
TaskBase * const lock = (TaskBase *) LockTag ;
// Assign dependence to m_next. It will be processed in the subsequent
// call to schedule. Error if the dependence is reset.
if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
}
if ( 0 != dep ) {
// The future may be destroyed upon returning from this call
// so increment reference count to track this assignment.
Kokkos::atomic_increment( &(dep->m_ref_count) );
}
}
using get_return_type = void ;
KOKKOS_INLINE_FUNCTION
get_return_type get() const {}
};
template < typename ExecSpace , typename ResultType >
class TaskBase< ExecSpace , ResultType , void >
: public TaskBase< ExecSpace , void , void >
{
private:
using root_type = TaskBase<ExecSpace,void,void> ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
static_assert( sizeof(root_type) == 48 , "" );
TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
TaskBase & operator = ( const TaskBase & ) = delete ;
public:
ResultType m_result ;
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
)
: root_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, m_result()
{}
using get_return_type = ResultType const & ;
KOKKOS_INLINE_FUNCTION
get_return_type get() const { return m_result ; }
};
template< typename ExecSpace , typename ResultType , typename FunctorType >
template< class ExecSpace , typename ResultType , class FunctorType >
class TaskBase
: public TaskBase< ExecSpace , ResultType , void >
: public TaskBase< void , void , void >
, public FunctorType
{
private:
@ -516,50 +494,31 @@ private:
public:
using root_type = TaskBase< ExecSpace , void , void > ;
using base_type = TaskBase< ExecSpace , ResultType , void > ;
using specialization = TaskQueueSpecialization< ExecSpace > ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
using member_type = typename specialization::member_type ;
using root_type = TaskBase< void , void , void > ;
using functor_type = FunctorType ;
using result_type = ResultType ;
template< typename Type >
KOKKOS_INLINE_FUNCTION static
void apply_functor
( Type * const task
, typename std::enable_if
< std::is_same< typename Type::result_type , void >::value
, member_type * const
>::type member
)
{
using fType = typename Type::functor_type ;
static_cast<fType*>(task)->operator()( *member );
}
using specialization = TaskQueueSpecialization< ExecSpace > ;
using member_type = typename specialization::member_type ;
template< typename Type >
KOKKOS_INLINE_FUNCTION static
void apply_functor
( Type * const task
, typename std::enable_if
< ! std::is_same< typename Type::result_type , void >::value
, member_type * const
>::type member
)
{
using fType = typename Type::functor_type ;
static_cast<fType*>(task)->operator()( *member , task->m_result );
}
KOKKOS_INLINE_FUNCTION
void apply_functor( member_type * const member , void * )
{ functor_type::operator()( *member ); }
template< typename T >
KOKKOS_INLINE_FUNCTION
void apply_functor( member_type * const member
, T * const result )
{ functor_type::operator()( *member , *result ); }
KOKKOS_FUNCTION static
void apply( root_type * root , void * exec )
{
TaskBase * const task = static_cast< TaskBase * >( root );
member_type * const member = reinterpret_cast< member_type * >( exec );
result_type * const result = TaskResult< result_type >::ptr( task );
TaskBase::template apply_functor( task , member );
task->apply_functor( member , result );
// Task may be serial or team.
// If team then must synchronize before querying if respawn was requested.
@ -576,26 +535,9 @@ public:
}
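The rewritten apply() above no longer needs enable_if dispatch: it passes TaskResult<result_type>::ptr(task) to apply_functor, and overload resolution picks the plain void* overload for tasks without a result and the T* template for tasks with one. A standalone sketch of that overload-on-pointer-type dispatch (Member, Produce, SideEffect and TaskShim are invented names, not Kokkos types):

#include <iostream>

struct Member {};   // stand-in for the serial/team execution member

template <class Functor>
struct TaskShim : Functor {
  // Non-template overload: selected when the result pointer is a plain void*,
  // i.e. the task produces no value.
  void apply_functor(Member& m, void*) { Functor::operator()(m); }

  // Template overload: selected for any typed result pointer T*.
  template <class T>
  void apply_functor(Member& m, T* result) { Functor::operator()(m, *result); }
};

struct Produce    { void operator()(Member&, double& out) const { out = 42.0; } };
struct SideEffect { void operator()(Member&) const { std::cout << "ran\n"; } };

int main() {
  Member m;
  double r = 0.0;
  TaskShim<Produce> p;    p.apply_functor(m, &r);                           // T* overload
  TaskShim<SideEffect> s; s.apply_functor(m, static_cast<void*>(nullptr));  // void* overload
  std::cout << r << "\n";
  return 0;
}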
// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
, FunctorType && arg_functor
)
: base_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, functor_type( arg_functor )
{}
KOKKOS_INLINE_FUNCTION constexpr
TaskBase( FunctorType && arg_functor )
: root_type() , functor_type( std::move(arg_functor) ) {}
KOKKOS_INLINE_FUNCTION
~TaskBase() {}

View File

@ -44,6 +44,8 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )
#define KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING 0
namespace Kokkos {
namespace Impl {
@ -100,9 +102,11 @@ KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::decrement
( TaskQueue< ExecSpace >::task_root_type * task )
{
const int count = Kokkos::atomic_fetch_add(&(task->m_ref_count),-1);
task_root_type volatile & t = *task ;
#if 0
const int count = Kokkos::atomic_fetch_add(&(t.m_ref_count),-1);
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
if ( 1 == count ) {
printf( "decrement-destroy( 0x%lx { 0x%lx %d %d } )\n"
, uintptr_t( task )
@ -114,9 +118,13 @@ void TaskQueue< ExecSpace >::decrement
#endif
if ( ( 1 == count ) &&
( task->m_next == (task_root_type *) task_root_type::LockTag ) ) {
( t.m_next == (task_root_type *) task_root_type::LockTag ) ) {
// Reference count is zero and task is complete, deallocate.
task->m_queue->deallocate( task , task->m_alloc_size );
TaskQueue< ExecSpace > * const queue =
static_cast< TaskQueue< ExecSpace > * >( t.m_queue );
queue->deallocate( task , t.m_alloc_size );
}
else if ( count <= 1 ) {
Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" );
@ -171,7 +179,7 @@ bool TaskQueue< ExecSpace >::push_task
// Fail the push attempt if the queue is locked;
// otherwise retry until the push succeeds.
#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "push_task( 0x%lx { 0x%lx } 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
, uintptr_t(queue)
, uintptr_t(*queue)
@ -186,9 +194,9 @@ bool TaskQueue< ExecSpace >::push_task
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * volatile * const next = & task->m_next ;
task_root_type * volatile & next = task->m_next ;
if ( zero != *next ) {
if ( zero != next ) {
Kokkos::abort("TaskQueue::push_task ERROR: already a member of another queue" );
}
@ -196,9 +204,9 @@ bool TaskQueue< ExecSpace >::push_task
while ( lock != y ) {
*next = y ;
next = y ;
// Do not proceed until '*next' has been stored.
// Do not proceed until 'next' has been stored.
Kokkos::memory_fence();
task_root_type * const x = y ;
@ -211,9 +219,9 @@ bool TaskQueue< ExecSpace >::push_task
// Failed, replace 'task->m_next' value since 'task' remains
// not a member of a queue.
*next = zero ;
next = zero ;
// Do not proceed until '*next' has been stored.
// Do not proceed until 'next' has been stored.
Kokkos::memory_fence();
return false ;
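push_task above is a lock-free singly linked push: link the task to the current head, fence, then compare-and-swap the head pointer, retrying on contention and backing out if the head holds the lock sentinel. A condensed sketch of the same loop with std::atomic (illustrative only; kLock and the memory orders are my choices, not Kokkos code):

#include <atomic>
#include <cstdint>

struct Task {
  Task* m_next = nullptr;
};

// Sentinel meaning "queue is locked; pushes must fail".
static Task* const kLock = reinterpret_cast<Task*>(uintptr_t(~0));

bool push_task(std::atomic<Task*>& queue, Task* task) {
  Task* head = queue.load(std::memory_order_relaxed);
  while (head != kLock) {
    task->m_next = head;                       // link before publishing
    if (queue.compare_exchange_weak(head, task,
                                    std::memory_order_release,
                                    std::memory_order_relaxed)) {
      return true;                             // task is now the queue head
    }
    // CAS failed: 'head' was reloaded with the current value; retry.
  }
  task->m_next = nullptr;                      // queue is locked: undo the link
  return false;
}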
@ -270,11 +278,13 @@ TaskQueue< ExecSpace >::pop_ready_task
// This thread has exclusive access to
// the queue and the popped task's m_next.
*queue = task->m_next ; task->m_next = lock ;
task_root_type * volatile & next = task->m_next ;
*queue = next ; next = lock ;
Kokkos::memory_fence();
#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
, uintptr_t(queue)
, uintptr_t(task)
@ -323,7 +333,7 @@ void TaskQueue< ExecSpace >::schedule_runnable
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)
#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
@ -337,20 +347,22 @@ void TaskQueue< ExecSpace >::schedule_runnable
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
task_root_type volatile & t = *task ;
bool respawn = false ;
//----------------------------------------
if ( zero == task->m_wait ) {
if ( zero == t.m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread
task->m_wait = end ;
t.m_wait = end ;
// Task in Waiting state
}
else if ( lock != task->m_wait ) {
else if ( lock != t.m_wait ) {
// Task in Executing state with Respawn request
// - Update dependence
// - Transition to Waiting state
@ -373,7 +385,9 @@ void TaskQueue< ExecSpace >::schedule_runnable
// Exclusive access so don't need an atomic exchange
// task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
task_root_type * dep = task->m_next ; task->m_next = zero ;
task_root_type * dep = t.m_next ; t.m_next = zero ;
Kokkos::memory_fence();
const bool is_ready =
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
@ -398,7 +412,7 @@ void TaskQueue< ExecSpace >::schedule_runnable
Kokkos::atomic_increment( & m_ready_count );
task_root_type * volatile * const ready_queue =
& m_ready[ task->m_priority ][ task->m_task_type ];
& m_ready[ t.m_priority ][ t.m_task_type ];
// A push_task fails if the ready queue is locked.
// A ready queue is only locked during a push or pop;
@ -441,7 +455,7 @@ void TaskQueue< ExecSpace >::schedule_aggregate
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)
#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
@ -455,18 +469,20 @@ void TaskQueue< ExecSpace >::schedule_aggregate
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
task_root_type volatile & t = *task ;
//----------------------------------------
if ( zero == task->m_wait ) {
if ( zero == t.m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread
task->m_wait = end ;
t.m_wait = end ;
// Task in Waiting state
}
else if ( lock == task->m_wait ) {
else if ( lock == t.m_wait ) {
// Task in Complete state
Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete");
}
@ -477,14 +493,14 @@ void TaskQueue< ExecSpace >::schedule_aggregate
// (1) created or
// (2) being removed from a completed task's wait list.
task_root_type ** const aggr = task->aggregate_dependences();
task_root_type * volatile * const aggr = t.aggregate_dependences();
// Assume the 'when_all' is complete until a dependence is
// found that is not complete.
bool is_complete = true ;
for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
for ( int i = t.m_dep_count ; 0 < i && is_complete ; ) {
--i ;
@ -523,7 +539,7 @@ void TaskQueue< ExecSpace >::schedule_aggregate
// Complete the when_all 'task' to schedule other tasks
// that are waiting for the when_all 'task' to complete.
task->m_next = lock ;
t.m_next = lock ;
complete( task );
@ -573,7 +589,7 @@ void TaskQueue< ExecSpace >::complete
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
@ -584,11 +600,13 @@ void TaskQueue< ExecSpace >::complete
fflush( stdout );
#endif
const bool runnable = task_root_type::Aggregate != task->m_task_type ;
task_root_type volatile & t = *task ;
const bool runnable = task_root_type::Aggregate != t.m_task_type ;
//----------------------------------------
if ( runnable && lock != task->m_next ) {
if ( runnable && lock != t.m_next ) {
    // A runnable task has finished executing and requested respawn.
// Schedule the task for subsequent execution.
@ -607,7 +625,7 @@ void TaskQueue< ExecSpace >::complete
// Stop other tasks from adding themselves to this task's wait queue
// by locking the head of this task's wait queue.
task_root_type * x = Kokkos::atomic_exchange( & task->m_wait , lock );
task_root_type * x = Kokkos::atomic_exchange( & t.m_wait , lock );
if ( x != (task_root_type *) lock ) {
@ -627,9 +645,13 @@ void TaskQueue< ExecSpace >::complete
// Have exclusive access to 'x' until it is scheduled
// Set x->m_next = zero <= no dependence, not a respawn
task_root_type * const next = x->m_next ; x->m_next = 0 ;
task_root_type volatile & vx = *x ;
if ( task_root_type::Aggregate != x->m_task_type ) {
task_root_type * const next = vx.m_next ; vx.m_next = 0 ;
Kokkos::memory_fence();
if ( task_root_type::Aggregate != vx.m_task_type ) {
schedule_runnable( x );
}
else {

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -47,7 +47,6 @@
#include <Kokkos_Array.hpp>
namespace Kokkos {
namespace Experimental {
namespace Impl {
template< class DataType , class ArrayLayout , class V , size_t N , class P >
@ -94,13 +93,12 @@ public:
typedef typename ViewDataType< non_const_scalar_type , array_scalar_dimension >::type non_const_scalar_array_type ;
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief View mapping for non-specialized data type and standard layout */
@ -597,7 +595,7 @@ public:
}
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -96,6 +96,27 @@ struct is_view_label< const char[N] > : public std::true_type {};
template< typename ... P >
struct ViewCtorProp ;
// Forward declare
template< typename Specialize , typename T >
struct CommonViewAllocProp ;
/* Common value_type stored as ViewCtorProp
*/
template< typename Specialize , typename T >
struct ViewCtorProp< void , CommonViewAllocProp<Specialize,T> >
{
ViewCtorProp() = default ;
ViewCtorProp( const ViewCtorProp & ) = default ;
ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
using type = CommonViewAllocProp<Specialize,T> ;
ViewCtorProp( const type & arg ) : value( arg ) {}
ViewCtorProp( type && arg ) : value( arg ) {}
type value ;
};
/* std::integral_constant<unsigned,I> are dummy arguments
* that avoid duplicate base class errors
*/
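ViewCtorProp stores each view-constructor property, such as the CommonViewAllocProp added here, in its own ViewCtorProp<void,T> specialization, and the variadic pack then inherits one such base per property. A stripped-down sketch of that one-base-per-property pattern with invented property types (CtorProp and get_prop are not Kokkos names):

#include <iostream>
#include <string>

template <class... P> struct CtorProp;

// One base class per property kind.
template <> struct CtorProp<void, std::string> { std::string value; };
template <> struct CtorProp<void, int>         { int value = 0; };

// The pack inherits every property it was given.
template <class... P>
struct CtorProp : CtorProp<void, P>... {
  CtorProp(P const&... args) : CtorProp<void, P>{args}... {}
};

// Recover one property by slicing to the matching base.
template <class T, class... P>
T const& get_prop(CtorProp<P...> const& props) {
  return static_cast<CtorProp<void, T> const&>(props).value;
}

int main() {
  CtorProp<std::string, int> props{"my_view_label", 128};
  std::cout << get_prop<std::string>(props) << " " << get_prop<int>(props) << "\n";
}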

View File

@ -62,7 +62,6 @@
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template< unsigned I , size_t ... Args >
@ -250,7 +249,7 @@ struct ViewDimensionAssignable< ViewDimension< DstArgs ... >
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -266,14 +265,11 @@ struct ALL_t {
}} // namespace Kokkos::Impl
namespace Kokkos {
namespace Experimental {
namespace Impl {
using Kokkos::Impl::ALL_t ;
template< class T >
struct is_integral_extent_type
{ enum { value = std::is_same<T,Kokkos::Experimental::Impl::ALL_t>::value ? 1 : 0 }; };
{ enum { value = std::is_same<T,Kokkos::Impl::ALL_t>::value ? 1 : 0 }; };
template< class iType >
struct is_integral_extent_type< std::pair<iType,iType> >
@ -314,10 +310,10 @@ struct SubviewLegalArgsCompileTime;
template<int RankDest, int RankSrc, int CurrentArg, class Arg, class ... SubViewArgs>
struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg, Arg, SubViewArgs...> {
enum { value =(((CurrentArg==RankDest-1) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) ||
enum { value =(((CurrentArg==RankDest-1) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
((CurrentArg>=RankDest) && (std::is_integral<Arg>::value)) ||
((CurrentArg<RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value)) ||
((CurrentArg==0) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value))
((CurrentArg==0) && (Kokkos::Impl::is_integral_extent_type<Arg>::value))
) && (SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg+1, SubViewArgs...>::value)};
};
@ -331,7 +327,7 @@ struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankD
template<int RankDest, int RankSrc, int CurrentArg, class Arg, class ... SubViewArgs>
struct SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg, Arg, SubViewArgs...> {
enum { value =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) ||
enum { value =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
((CurrentArg<RankSrc-RankDest) && (std::is_integral<Arg>::value)) ||
((CurrentArg>=RankSrc-RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value))
) && (SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg+1, SubViewArgs...>::value)};
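The SubviewLegalArgsCompileTime traits above decide at compile time whether a particular mix of ALL / index / range arguments lets a subview keep the contiguous LayoutLeft or LayoutRight of its source instead of degrading to LayoutStride. A small usage sketch of the two rank-2 cases those rules describe (extents and indices are arbitrary):

#include <Kokkos_Core.hpp>

int main(int argc, char** argv) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double**, Kokkos::LayoutLeft> a("a", 100, 50);
    // Fixing the last (slow) index of a LayoutLeft view keeps a contiguous column:
    auto col = Kokkos::subview(a, Kokkos::ALL(), 7);   // rank-1, still LayoutLeft

    Kokkos::View<double**, Kokkos::LayoutRight> b("b", 100, 50);
    // Fixing the first (slow) index of a LayoutRight view keeps a contiguous row:
    auto row = Kokkos::subview(b, 7, Kokkos::ALL());   // rank-1, still LayoutRight

    (void)col; (void)row;
  }
  Kokkos::finalize();
  return 0;
}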
@ -403,7 +399,7 @@ private:
bool set( unsigned domain_rank
, unsigned range_rank
, const ViewDimension< DimArgs ... > & dim
, const Kokkos::Experimental::Impl::ALL_t
, const Kokkos::Impl::ALL_t
, Args ... args )
{
m_begin[ domain_rank ] = 0 ;
@ -519,7 +515,7 @@ private:
, unsigned domain_rank
, unsigned range_rank
, const ViewDimension< DimArgs ... > & dim
, const Kokkos::Experimental::Impl::ALL_t
, const Kokkos::Impl::ALL_t
, Args ... args ) const
{
const int n = std::min( buf_len ,
@ -670,13 +666,12 @@ public:
{ return unsigned(i) < InternalRangeRank ? m_index[i] : ~0u ; }
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief Given a value type and dimension generate the View data type */
@ -814,13 +809,12 @@ public:
typedef non_const_type non_const_scalar_array_type ;
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template < class Dimension , class Layout , typename Enable = void >
@ -1228,14 +1222,14 @@ private:
// If memory alignment is a multiple of the trivial scalar size then attempt to align.
enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
  enum { div_ok = div ? div : 1 }; // To avoid modulo by zero in constexpr
  enum { div_ok = (div != 0) ? div : 1 }; // To avoid modulo by zero in constexpr
KOKKOS_INLINE_FUNCTION
static constexpr size_t stride( size_t const N )
{
return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
? N + align - ( N % div_ok ) : N ;
}
{
return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) )
? N + align - ( N % div_ok ) : N ;
}
};
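The stride() helper above pads a leading extent N up to the next multiple of the per-scalar alignment (MEMORY_ALIGNMENT / sizeof(scalar)), but only once the extent exceeds a threshold, so consecutive rows or columns start on aligned addresses. A worked restatement of that rule with assumed numbers, 8 scalars per alignment boundary (e.g. 64-byte alignment and 8-byte doubles) and a threshold factor of 4 (pad_stride is an illustration, not the Kokkos class):

#include <cstddef>

// Illustrative re-statement of the padding rule.
constexpr std::size_t pad_stride(std::size_t N,
                                 std::size_t align,      // alignment / sizeof(scalar)
                                 std::size_t threshold)  // minimum extent worth padding
{
  return (align != 0 && threshold * align < N && N % align != 0)
             ? N + align - (N % align)   // round N up to the next multiple of 'align'
             : N;
}

static_assert(pad_stride(1000, 8, 4) == 1000, "already a multiple of 8: unchanged");
static_assert(pad_stride(1003, 8, 4) == 1008, "rounded up to the next multiple of 8");
static_assert(pad_stride(17,   8, 4) == 17,   "small extents are left unpadded");

int main() { return 0; }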
public:
@ -1707,12 +1701,12 @@ private:
// If memory alignment is a multiple of the trivial scalar size then attempt to align.
enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
  enum { div_ok = div ? div : 1 }; // To avoid modulo by zero in constexpr
  enum { div_ok = (div != 0) ? div : 1 }; // To avoid modulo by zero in constexpr
KOKKOS_INLINE_FUNCTION
static constexpr size_t stride( size_t const N )
{
return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) )
? N + align - ( N % div_ok ) : N ;
}
};
@ -2225,13 +2219,12 @@ public:
{}
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief ViewDataHandle provides the type of the 'data handle' which the view
@ -2422,13 +2415,12 @@ struct ViewDataHandle< Traits ,
return handle_type( arg_data_ptr + offset );
}
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
//----------------------------------------------------------------------------
@ -2451,8 +2443,9 @@ template< class ExecSpace , class ValueType >
struct ViewValueFunctor< ExecSpace , ValueType , false /* is_scalar */ >
{
typedef Kokkos::RangePolicy< ExecSpace > PolicyType ;
typedef typename ExecSpace::execution_space Exec;
ExecSpace space ;
Exec space ;
ValueType * ptr ;
size_t n ;
bool destroy ;
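ViewValueFunctor, whose members are shown in this hunk, is what runs the default constructor of every element of a freshly allocated View over a RangePolicy, and later the destructors when the allocation is released. A condensed sketch of that construct/destroy-in-parallel idea (ConstructDestroySketch is an invented stand-in, not the real class):

#include <Kokkos_Core.hpp>
#include <new>

template <class ExecSpace, class ValueType>
struct ConstructDestroySketch {
  ValueType*  ptr;
  std::size_t n;
  bool        destroy;

  KOKKOS_INLINE_FUNCTION
  void operator()(const std::size_t i) const {
    if (destroy) ptr[i].~ValueType();          // run destructors on deallocation
    else         new (ptr + i) ValueType();    // placement-new default construction
  }

  void run() const {
    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, n), *this);
    Kokkos::fence();
  }
};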
@ -2597,6 +2590,9 @@ private:
public:
typedef void printable_label_typedef;
enum { is_managed = Traits::is_managed };
//----------------------------------------
// Domain dimensions
@ -2944,7 +2940,7 @@ public:
Kokkos::abort("View Assignment: trying to assign runtime dimension to non matching compile time dimension.");
}
dst.m_offset = dst_offset_type( src.m_offset );
dst.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
dst.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
}
};
@ -3102,7 +3098,7 @@ public:
//----------------------------------------------------------------------------
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -3151,6 +3147,77 @@ void view_error_operator_bounds
view_error_operator_bounds<R+1>(buf+n,len-n,map,args...);
}
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
/* Check #3: is the View managed as determined by the MemoryTraits? */
template< class MapType,
bool is_managed = (MapType::is_managed != 0) >
struct OperatorBoundsErrorOnDevice;
template< class MapType >
struct OperatorBoundsErrorOnDevice< MapType, false > {
KOKKOS_INLINE_FUNCTION
static void run(MapType const&) {
Kokkos::abort("View bounds error");
}
};
template< class MapType >
struct OperatorBoundsErrorOnDevice< MapType, true > {
KOKKOS_INLINE_FUNCTION
static void run(MapType const& map) {
char const* const user_alloc_start = reinterpret_cast<char const*>(map.data());
char const* const header_start = user_alloc_start - sizeof(SharedAllocationHeader);
SharedAllocationHeader const* const header =
reinterpret_cast<SharedAllocationHeader const*>(header_start);
char const* const label = header->label();
enum { LEN = 128 };
char msg[LEN];
char const* const first_part = "View bounds error of view ";
char* p = msg;
char* const end = msg + LEN - 1;
for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) {
*p = *p2;
}
for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) {
*p = *p2;
}
*p = '\0';
Kokkos::abort(msg);
}
};
/* Check #2: does the ViewMapping have the printable_label_typedef defined?
   See above: only the non-specialized standard-layout ViewMapping defines
   this typedef by default.
   Its existence therefore implies that MapType::is_managed exists as well. */
template< class T, class Enable = void >
struct has_printable_label_typedef : public std::false_type {};
template<class T>
struct has_printable_label_typedef<
T, typename enable_if_type<typename T::printable_label_typedef>::type>
: public std::true_type
{};
template< class MapType >
KOKKOS_INLINE_FUNCTION
void operator_bounds_error_on_device(
MapType const&,
std::false_type) {
Kokkos::abort("View bounds error");
}
template< class MapType >
KOKKOS_INLINE_FUNCTION
void operator_bounds_error_on_device(
MapType const& map,
std::true_type) {
OperatorBoundsErrorOnDevice< MapType >::run(map);
}
#endif // ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
template< class MemorySpace , class MapType , class ... Args >
KOKKOS_INLINE_FUNCTION
void view_verify_operator_bounds
@ -3166,7 +3233,17 @@ void view_verify_operator_bounds
view_error_operator_bounds<0>( buffer + n , LEN - n , map , args ... );
Kokkos::Impl::throw_runtime_exception(std::string(buffer));
#else
Kokkos::abort("View bounds error");
/* Check #1: is there a SharedAllocationRecord?
   (we won't use it, but if it's not there then there isn't
a corresponding SharedAllocationHeader containing a label).
This check should cover the case of Views that don't
have the Unmanaged trait but were initialized by pointer. */
if (tracker.has_record()) {
operator_bounds_error_on_device<MapType>(
map, has_printable_label_typedef<MapType>());
} else {
Kokkos::abort("View bounds error");
}
#endif
}
}
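The device branch above now aborts with a message containing the view's label whenever the allocation is tracked, instead of the bare "View bounds error". A small host-side illustration of what trips this path, assuming a build with Kokkos' debug bounds checking enabled (e.g. via the KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK configuration):

#include <Kokkos_Core.hpp>

int main(int argc, char** argv) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> v("labeled_view", 10);
    // With debug bounds checking enabled, an out-of-range index aborts with a
    // message that includes the label "labeled_view"; on device builds the
    // label is recovered from the SharedAllocationHeader as shown above.
    // v(42) = 1.0;   // intentionally commented out: it would terminate the program
    v(9) = 1.0;       // in-bounds access is unaffected
  }
  Kokkos::finalize();
  return 0;
}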

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -48,7 +48,6 @@
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
// View mapping for rank two tiled array
@ -195,11 +194,9 @@ struct ViewMapping
};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
namespace Kokkos {
namespace Experimental {
template< typename T , unsigned N0 , unsigned N1 , class ... P >
KOKKOS_INLINE_FUNCTION
@ -217,7 +214,6 @@ tile_subview( const Kokkos::View<T**,Kokkos::LayoutTileLeft<N0,N1,true>,P...> &
( src , SrcLayout() , i_tile0 , i_tile1 );
}
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
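tile_subview above returns one N0 x N1 tile of a LayoutTileLeft view as an ordinary contiguous view. A short usage sketch based on the signature shown; whether the function is reached as Kokkos::Experimental::tile_subview or plain Kokkos::tile_subview depends on the namespace change in this file, so the qualification below is an assumption:

#include <Kokkos_Core.hpp>

int main(int argc, char** argv) {
  Kokkos::initialize(argc, argv);
  {
    // A 64x64 view stored as 8x8 tiles, each tile contiguous.
    Kokkos::View<double**, Kokkos::LayoutTileLeft<8, 8>, Kokkos::HostSpace>
        a("tiled", 64, 64);

    // Tile (2,3) covers rows 16..23 and columns 24..31 of 'a'.
    auto tile = Kokkos::Experimental::tile_subview(a, 2, 3);

    for (int i = 0; i < 8; ++i)
      for (int j = 0; j < 8; ++j)
        tile(i, j) = double(i * 8 + j);
  }
  Kokkos::finalize();
  return 0;
}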

View File

@ -1,183 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
#include <impl/Kokkos_spinwait.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_BitOps.hpp>
/*--------------------------------------------------------------------------*/
#if !defined( _WIN32 )
#if defined( KOKKOS_ENABLE_ASM )
#if defined( __arm__ ) || defined( __aarch64__ )
/* No-operation instruction to idle the thread. */
#define KOKKOS_INTERNAL_PAUSE
#else
/* Pause instruction to prevent excess processor bus usage */
#define KOKKOS_INTERNAL_PAUSE asm volatile("pause\n":::"memory")
#endif
#define KOKKOS_INTERNAL_NOP2 asm volatile("nop\n" "nop\n")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0u: KOKKOS_INTERNAL_NOP2; break;
case 1u: KOKKOS_INTERNAL_NOP4; break;
case 2u: KOKKOS_INTERNAL_NOP8; break;
case 3u: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
#else
#include <sched.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
sched_yield();
}
}
#endif
#else // defined( _WIN32 )
#if defined ( KOKKOS_ENABLE_WINTHREAD )
#include <process.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
Sleep(0);
}
}
#elif defined( _MSC_VER )
#define NOMINMAX
#include <winsock2.h>
#include <windows.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
YieldProcessor();
}
}
#else
#define KOKKOS_INTERNAL_PAUSE __asm__ __volatile__("pause\n":::"memory")
#define KOKKOS_INTERNAL_NOP2 __asm__ __volatile__("nop\n" "nop")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0: KOKKOS_INTERNAL_NOP2; break;
case 1: KOKKOS_INTERNAL_NOP4; break;
case 2: KOKKOS_INTERNAL_NOP8; break;
case 3: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
#endif
#endif
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
} /* namespace Impl */
} /* namespace Kokkos */
#else
void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {}
#endif
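The removed file above implemented spinwait_while_equal / spinwait_until_equal with an escalating nop/pause ladder between polls (its replacement lives elsewhere in this update). A portable analogue of the same spin-then-yield backoff, using std::atomic and std::this_thread::yield instead of inline assembly (a sketch, not the Kokkos replacement):

#include <atomic>
#include <cstdint>
#include <thread>

// Spin while 'flag' still holds 'value'; busy-poll briefly, then start
// yielding the core so a long wait does not burn a hardware thread.
inline void spinwait_while_equal(std::atomic<int32_t>& flag, const int32_t value) {
  unsigned i = 0;
  while (flag.load(std::memory_order_acquire) == value) {
    if (i < 64) {
      // short wait: keep polling for low wake-up latency
    } else {
      std::this_thread::yield();  // long wait: let the scheduler run someone else
    }
    ++i;
  }
}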