Updating Kokkos lib to 2.03.00

Stan Moore
2017-04-25 13:48:51 -06:00
parent 9f6e126a2f
commit 8910ec6e59
261 changed files with 27816 additions and 17799 deletions

File diff suppressed because it is too large.


@@ -56,12 +56,13 @@ int bit_scan_forward( unsigned i )
{
#if defined( __CUDA_ARCH__ )
return __ffs(i) - 1;
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#elif defined( __INTEL_COMPILER )
#elif defined( KOKKOS_COMPILER_INTEL )
return _bit_scan_forward(i);
#elif defined( KOKKOS_COMPILER_IBM )
return __cnttz4(i);
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#else
unsigned t = 1u;
int r = 0;
while ( i && ( i & t == 0 ) )
@@ -79,10 +80,12 @@ int bit_scan_reverse( unsigned i )
enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
#if defined( __CUDA_ARCH__ )
return shift - __clz(i);
#elif defined( KOKKOS_COMPILER_INTEL )
return _bit_scan_reverse(i);
#elif defined( KOKKOS_COMPILER_IBM )
return shift - __cntlz4(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return shift - __builtin_clz(i);
#elif defined( __INTEL_COMPILER )
return _bit_scan_reverse(i);
#else
unsigned t = 1u << shift;
int r = 0;
@@ -101,10 +104,12 @@ int bit_count( unsigned i )
{
#if defined( __CUDA_ARCH__ )
return __popc(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#elif defined ( __INTEL_COMPILER )
return _popcnt32(i);
#elif defined( KOKKOS_COMPILER_IBM )
return __popcnt4(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#else
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
i = i - ( ( i >> 1 ) & ~0u / 3u ); // temp
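For orientation, the fallback branch above is truncated in this hunk. A minimal sketch of the SWAR population count that the linked bithacks page describes (an illustration only, not the verbatim remainder of the file):

#include <climits>

inline int bit_count_fallback( unsigned i )
{
  // The divisions produce 0x55555555, 0x33333333, 0x0F0F0F0F and 0x01010101
  // portably for 32-bit unsigned.
  i = i - ( ( i >> 1 ) & ~0u / 3u );                              // 2-bit sums
  i = ( i & ~0u / 15u * 3u ) + ( ( i >> 2 ) & ~0u / 15u * 3u );   // 4-bit sums
  i = ( i + ( i >> 4 ) ) & ~0u / 255u * 15u ;                     // 8-bit sums
  return int( ( i * ( ~0u / 255u ) ) >> ( ( sizeof(unsigned) - 1 ) * CHAR_BIT ) );
}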


@@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@@ -147,7 +147,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
}
#endif
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
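The change from "#if (KOKKOS_ENABLE_PROFILING)" to "#if defined(KOKKOS_ENABLE_PROFILING)" matters when the macro is defined with no value (e.g. -DKOKKOS_ENABLE_PROFILING on the command line). A standalone sketch with a hypothetical FEATURE macro:

// Compile with:  c++ -DFEATURE demo.cpp
// An empty definition makes "#if (FEATURE)" expand to "#if ()", which is
// ill-formed; "#if defined(FEATURE)" only asks whether the macro exists.
#include <cstdio>

int main()
{
#if defined(FEATURE)   // robust for -DFEATURE as well as -DFEATURE=1
  std::puts("FEATURE enabled");
#else
  std::puts("FEATURE disabled");
#endif
  return 0;
}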
@@ -155,7 +155,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
void finalize_internal( const bool all_spaces = false )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
@@ -449,5 +449,323 @@ void fence()
Impl::fence_internal();
}
void print_configuration( std::ostream & out , const bool detail )
{
std::ostringstream msg;
msg << "Compiler:" << std::endl;
#ifdef KOKKOS_COMPILER_APPLECC
msg << " KOKKOS_COMPILER_APPLECC: " << KOKKOS_COMPILER_APPLECC << std::endl;
#endif
#ifdef KOKKOS_COMPILER_CLANG
msg << " KOKKOS_COMPILER_CLANG: " << KOKKOS_COMPILER_CLANG << std::endl;
#endif
#ifdef KOKKOS_COMPILER_CRAYC
msg << " KOKKOS_COMPILER_CRAYC: " << KOKKOS_COMPILER_CRAYC << std::endl;
#endif
#ifdef KOKKOS_COMPILER_GNU
msg << " KOKKOS_COMPILER_GNU: " << KOKKOS_COMPILER_GNU << std::endl;
#endif
#ifdef KOKKOS_COMPILER_IBM
msg << " KOKKOS_COMPILER_IBM: " << KOKKOS_COMPILER_IBM << std::endl;
#endif
#ifdef KOKKOS_COMPILER_INTEL
msg << " KOKKOS_COMPILER_INTEL: " << KOKKOS_COMPILER_INTEL << std::endl;
#endif
#ifdef KOKKOS_COMPILER_NVCC
msg << " KOKKOS_COMPILER_NVCC: " << KOKKOS_COMPILER_NVCC << std::endl;
#endif
#ifdef KOKKOS_COMPILER_PGI
msg << " KOKKOS_COMPILER_PGI: " << KOKKOS_COMPILER_PGI << std::endl;
#endif
msg << "Architecture:" << std::endl;
#ifdef KOKKOS_ENABLE_ISA_KNC
msg << " KOKKOS_ENABLE_ISA_KNC: yes" << std::endl;
#else
msg << " KOKKOS_ENABLE_ISA_KNC: no" << std::endl;
#endif
#ifdef KOKKOS_ENABLE_ISA_POWERPCLE
msg << " KOKKOS_ENABLE_ISA_POWERPCLE: yes" << std::endl;
#else
msg << " KOKKOS_ENABLE_ISA_POWERPCLE: no" << std::endl;
#endif
#ifdef KOKKOS_ENABLE_ISA_X86_64
msg << " KOKKOS_ENABLE_ISA_X86_64: yes" << std::endl;
#else
msg << " KOKKOS_ENABLE_ISA_X86_64: no" << std::endl;
#endif
msg << "Devices:" << std::endl;
msg << " KOKKOS_ENABLE_CUDA: ";
#ifdef KOKKOS_ENABLE_CUDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_OPENMP: ";
#ifdef KOKKOS_ENABLE_OPENMP
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PTHREAD: ";
#ifdef KOKKOS_ENABLE_PTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_STDTHREAD: ";
#ifdef KOKKOS_ENABLE_STDTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_WINTHREAD: ";
#ifdef KOKKOS_ENABLE_WINTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_QTHREADS: ";
#ifdef KOKKOS_ENABLE_QTHREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_SERIAL: ";
#ifdef KOKKOS_ENABLE_SERIAL
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Default Device:" << std::endl;
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Atomics:" << std::endl;
msg << " KOKKOS_ENABLE_CUDA_ATOMICS: ";
#ifdef KOKKOS_ENABLE_CUDA_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_GNU_ATOMICS: ";
#ifdef KOKKOS_ENABLE_GNU_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_INTEL_ATOMICS: ";
#ifdef KOKKOS_ENABLE_INTEL_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_OPENMP_ATOMICS: ";
#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_WINDOWS_ATOMICS: ";
#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Vectorization:" << std::endl;
msg << " KOKKOS_ENABLE_PRAGMA_IVDEP: ";
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_LOOPCOUNT: ";
#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_SIMD: ";
#ifdef KOKKOS_ENABLE_PRAGMA_SIMD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_UNROLL: ";
#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_VECTOR: ";
#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Memory:" << std::endl;
msg << " KOKKOS_ENABLE_HBWSPACE: ";
#ifdef KOKKOS_ENABLE_HBWSPACE
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_INTEL_MM_ALLOC: ";
#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_POSIX_MEMALIGN: ";
#ifdef KOKKOS_ENABLE_POSIX_MEMALIGN
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Options:" << std::endl;
msg << " KOKKOS_ENABLE_ASM: ";
#ifdef KOKKOS_ENABLE_ASM
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CXX1Z: ";
#ifdef KOKKOS_ENABLE_CXX1Z
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK: ";
#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_HWLOC: ";
#ifdef KOKKOS_ENABLE_HWLOC
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_LIBRT: ";
#ifdef KOKKOS_ENABLE_LIBRT
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_MPI: ";
#ifdef KOKKOS_ENABLE_MPI
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PROFILING: ";
#ifdef KOKKOS_ENABLE_PROFILING
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
#ifdef KOKKOS_ENABLE_CUDA
msg << "Cuda Options:" << std::endl;
msg << " KOKKOS_ENABLE_CUDA_LAMBDA: ";
#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: ";
#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: ";
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUDA_UVM: ";
#ifdef KOKKOS_ENABLE_CUDA_UVM
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUSPARSE: ";
#ifdef KOKKOS_ENABLE_CUSPARSE
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
#endif
msg << "\nRuntime Configuration:" << std::endl;
#ifdef KOKKOS_ENABLE_CUDA
Cuda::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_OPENMP
OpenMP::print_configuration(msg, detail);
#endif
#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( WINTHREAD )
Threads::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_QTHREADS
Qthreads::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_SERIAL
Serial::print_configuration(msg, detail);
#endif
out << msg.str() << std::endl;
}
} // namespace Kokkos
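A typical call site for the new print_configuration, sketched as an ordinary Kokkos program (argc/argv are whatever main received):

#include <Kokkos_Core.hpp>
#include <iostream>

int main( int argc , char * argv[] )
{
  Kokkos::initialize( argc , argv );

  // 'true' requests the detailed per-backend runtime report in addition
  // to the yes/no macro summary assembled above.
  Kokkos::print_configuration( std::cout , true );

  Kokkos::finalize();
  return 0;
}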


@@ -0,0 +1,653 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_FUNCTORANALYSIS_HPP
#define KOKKOS_FUNCTORANALYSIS_HPP
#include <cstddef>
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Reducer.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
struct FunctorPatternInterface {
struct FOR {};
struct REDUCE {};
struct SCAN {};
};
/** \brief Query Functor and execution policy argument tag for value type.
*
* If 'value_type' is not explicitly declared in the functor
* then attempt to deduce the type from FunctorType::operator()
* interface used by the pattern and policy.
*
* For the REDUCE pattern generate a Reducer and finalization function
* derived from what is available within the functor.
*/
template< typename PatternInterface , class Policy , class Functor >
struct FunctorAnalysis {
private:
using FOR = FunctorPatternInterface::FOR ;
using REDUCE = FunctorPatternInterface::REDUCE ;
using SCAN = FunctorPatternInterface::SCAN ;
//----------------------------------------
struct VOID {};
template< typename P = Policy , typename = std::false_type >
struct has_work_tag
{
using type = void ;
using wtag = VOID ;
};
template< typename P >
struct has_work_tag
< P , typename std::is_same< typename P::work_tag , void >::type >
{
using type = typename P::work_tag ;
using wtag = typename P::work_tag ;
};
using Tag = typename has_work_tag<>::type ;
using WTag = typename has_work_tag<>::wtag ;
//----------------------------------------
// Check for Functor::value_type, which is either a simple type T or T[]
template< typename F , typename = std::false_type >
struct has_value_type { using type = void ; };
template< typename F >
struct has_value_type
< F , typename std::is_same< typename F::value_type , void >::type >
{
using type = typename F::value_type ;
static_assert( ! std::is_reference< type >::value &&
std::rank< type >::value <= 1 &&
std::extent< type >::value == 0
, "Kokkos Functor::value_type is T or T[]" );
};
//----------------------------------------
// If Functor::value_type does not exist then evaluate operator(),
// depending upon the pattern and whether the policy has a work tag,
// to determine the reduction or scan value_type.
template< typename F
, typename P = PatternInterface
, typename V = typename has_value_type<F>::type
, bool T = std::is_same< Tag , void >::value
>
struct deduce_value_type { using type = V ; };
template< typename F >
struct deduce_value_type< F , REDUCE , void , true > {
template< typename M , typename A >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( M , A & ) const );
using type = decltype( deduce( & F::operator() ) );
};
template< typename F >
struct deduce_value_type< F , REDUCE , void , false > {
template< typename M , typename A >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag , M , A & ) const );
template< typename M , typename A >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag const & , M , A & ) const );
using type = decltype( deduce( & F::operator() ) );
};
template< typename F >
struct deduce_value_type< F , SCAN , void , true > {
template< typename M , typename A , typename I >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( M , A & , I ) const );
using type = decltype( deduce( & F::operator() ) );
};
template< typename F >
struct deduce_value_type< F , SCAN , void , false > {
template< typename M , typename A , typename I >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag , M , A & , I ) const );
template< typename M , typename A , typename I >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag const & , M , A & , I ) const );
using type = decltype( deduce( & F::operator() ) );
};
//----------------------------------------
using candidate_type = typename deduce_value_type< Functor >::type ;
enum { candidate_is_void = std::is_same< candidate_type , void >::value
, candidate_is_array = std::rank< candidate_type >::value == 1 };
//----------------------------------------
public:
using value_type = typename std::remove_extent< candidate_type >::type ;
static_assert( ! std::is_const< value_type >::value
, "Kokkos functor operator reduce argument cannot be const" );
private:
// Stub to avoid defining a type 'void &'
using ValueType = typename
std::conditional< candidate_is_void , VOID , value_type >::type ;
public:
using pointer_type = typename
std::conditional< candidate_is_void , void , ValueType * >::type ;
using reference_type = typename
std::conditional< candidate_is_array , ValueType * , typename
std::conditional< ! candidate_is_void , ValueType & , void >
::type >::type ;
private:
template< bool IsArray , class FF >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< IsArray , unsigned >::type
get_length( FF const & f ) { return f.value_count ; }
template< bool IsArray , class FF >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< ! IsArray , unsigned >::type
get_length( FF const & ) { return 1 ; }
public:
enum { StaticValueSize = ! candidate_is_void &&
! candidate_is_array
? sizeof(ValueType) : 0 };
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_count( const Functor & f )
{ return FunctorAnalysis::template get_length< candidate_is_array >(f); }
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_size( const Functor & f )
{ return FunctorAnalysis::template get_length< candidate_is_array >(f) * sizeof(ValueType); }
//----------------------------------------
template< class Unknown >
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_count( const Unknown & )
{ return 1 ; }
template< class Unknown >
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_size( const Unknown & )
{ return sizeof(ValueType); }
private:
enum INTERFACE : int
{ DISABLE = 0
, NO_TAG_NOT_ARRAY = 1
, NO_TAG_IS_ARRAY = 2
, HAS_TAG_NOT_ARRAY = 3
, HAS_TAG_IS_ARRAY = 4
, DEDUCED =
! std::is_same< PatternInterface , REDUCE >::value ? DISABLE : (
std::is_same<Tag,void>::value
? (candidate_is_array ? NO_TAG_IS_ARRAY : NO_TAG_NOT_ARRAY)
: (candidate_is_array ? HAS_TAG_IS_ARRAY : HAS_TAG_NOT_ARRAY) )
};
//----------------------------------------
// parallel_reduce join operator
template< class F , INTERFACE >
struct has_join_function ;
template< class F >
struct has_join_function< F , NO_TAG_NOT_ARRAY >
{
typedef volatile ValueType & vref_type ;
typedef volatile const ValueType & cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( *dst , *src ); }
};
template< class F >
struct has_join_function< F , NO_TAG_IS_ARRAY >
{
typedef volatile ValueType * vref_type ;
typedef volatile const ValueType * cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( dst , src ); }
};
template< class F >
struct has_join_function< F , HAS_TAG_NOT_ARRAY >
{
typedef volatile ValueType & vref_type ;
typedef volatile const ValueType & cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( WTag() , *dst , *src ); }
};
template< class F >
struct has_join_function< F , HAS_TAG_IS_ARRAY >
{
typedef volatile ValueType * vref_type ;
typedef volatile const ValueType * cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( WTag() , dst , src ); }
};
template< class F = Functor
, INTERFACE = DEDUCED
, typename = void >
struct DeduceJoin
{
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{
const int n = FunctorAnalysis::value_count( f );
for ( int i = 0 ; i < n ; ++i ) dst[i] += src[i];
}
};
template< class F >
struct DeduceJoin< F , DISABLE , void >
{
KOKKOS_INLINE_FUNCTION static
void join( F const &
, ValueType volatile *
, ValueType volatile const * ) {}
};
template< class F , INTERFACE I >
struct DeduceJoin< F , I ,
decltype( has_join_function<F,I>::enable_if( & F::join ) ) >
: public has_join_function<F,I> {};
//----------------------------------------
template< class , INTERFACE >
struct has_init_function ;
template< class F >
struct has_init_function< F , NO_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( *dst ); }
};
template< class F >
struct has_init_function< F , NO_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( dst ); }
};
template< class F >
struct has_init_function< F , HAS_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( WTag(), *dst ); }
};
template< class F >
struct has_init_function< F , HAS_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( WTag(), dst ); }
};
template< class F = Functor
, INTERFACE = DEDUCED
, typename = void >
struct DeduceInit
{
KOKKOS_INLINE_FUNCTION static
void init( F const & , ValueType * dst ) { new(dst) ValueType(); }
};
template< class F >
struct DeduceInit< F , DISABLE , void >
{
KOKKOS_INLINE_FUNCTION static
void init( F const & , ValueType * ) {}
};
template< class F , INTERFACE I >
struct DeduceInit< F , I ,
decltype( has_init_function<F,I>::enable_if( & F::init ) ) >
: public has_init_function<F,I> {};
//----------------------------------------
public:
struct Reducer
{
private:
Functor const & m_functor ;
ValueType * const m_result ;
int const m_length ;
public:
using reducer = Reducer ;
using value_type = FunctorAnalysis::value_type ;
using memory_space = void ;
using reference_type = FunctorAnalysis::reference_type ;
KOKKOS_INLINE_FUNCTION
void join( ValueType volatile * dst
, ValueType volatile const * src ) const noexcept
{ DeduceJoin<>::join( m_functor , dst , src ); }
KOKKOS_INLINE_FUNCTION
void init( ValueType * dst ) const noexcept
{ DeduceInit<>::init( m_functor , dst ); }
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( Functor const & arg_functor
, ValueType * arg_value = 0
, int arg_length = 0 ) noexcept
: m_functor( arg_functor ), m_result(arg_value), m_length(arg_length) {}
KOKKOS_INLINE_FUNCTION
constexpr int length() const noexcept { return m_length ; }
KOKKOS_INLINE_FUNCTION
ValueType & operator[]( int i ) const noexcept
{ return m_result[i]; }
private:
template< bool IsArray >
constexpr
typename std::enable_if< IsArray , ValueType * >::type
ref() const noexcept { return m_result ; }
template< bool IsArray >
constexpr
typename std::enable_if< ! IsArray , ValueType & >::type
ref() const noexcept { return *m_result ; }
public:
KOKKOS_INLINE_FUNCTION
auto result() const noexcept
-> decltype( Reducer::template ref< candidate_is_array >() )
{ return Reducer::template ref< candidate_is_array >(); }
};
//----------------------------------------
private:
template< class , INTERFACE >
struct has_final_function ;
// No tag, not array
template< class F >
struct has_final_function< F , NO_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( *dst ); }
};
// No tag, is array
template< class F >
struct has_final_function< F , NO_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( dst ); }
};
// Has tag, not array
template< class F >
struct has_final_function< F , HAS_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( WTag(), *dst ); }
};
// Has tag, is array
template< class F >
struct has_final_function< F , HAS_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( WTag(), dst ); }
};
template< class F = Functor
, INTERFACE = DEDUCED
, typename = void >
struct DeduceFinal
{
KOKKOS_INLINE_FUNCTION
static void final( F const & , ValueType * ) {}
};
template< class F , INTERFACE I >
struct DeduceFinal< F , I ,
decltype( has_final_function<F,I>::enable_if( & F::final ) ) >
: public has_init_function<F,I> {};
public:
static void final( Functor const & f , ValueType * result )
{ DeduceFinal<>::final( f , result ); }
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* KOKKOS_FUNCTORANALYSIS_HPP */
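Two functor shapes this analysis distinguishes, as a sketch (the struct names are illustrative, not part of Kokkos):

#include <Kokkos_Macros.hpp>

// (a) No 'value_type' member: for the REDUCE pattern the value type (double)
//     is deduced from the reduction argument of operator(), and
//     reference_type becomes double &.
struct DeducedSum {
  KOKKOS_INLINE_FUNCTION
  void operator()( const int i , double & update ) const { update += double(i); }
};

// (b) Explicit runtime-length array reduction: value_type is declared as T[]
//     and 'value_count' is what FunctorAnalysis::value_count()/value_size()
//     query at run time; reference_type becomes double *.
struct ArraySum {
  using value_type = double[] ;
  int value_count ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const int i , double dst[] ) const
    { for ( int j = 0 ; j < value_count ; ++j ) dst[j] += double(i); }
};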


@@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@@ -62,7 +62,7 @@
#include <memkind.h>
#endif
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
@@ -198,7 +198,7 @@ void * HBWSpace::allocate( const size_t arg_alloc_size ) const
case STD_MALLOC: msg << "STD_MALLOC" ; break ;
}
msg << " ]( " << arg_alloc_size << " ) FAILED" ;
if ( ptr == NULL ) { msg << " NULL" ; }
if ( ptr == NULL ) { msg << " NULL" ; }
else { msg << " NOT ALIGNED " << ptr ; }
std::cerr << msg.str() << std::endl ;
@@ -218,7 +218,7 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
if ( m_alloc_mech == STD_MALLOC ) {
void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
memkind_free(MEMKIND_TYPE, alloc_ptr );
}
}
}
}
@@ -249,7 +249,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::HBWSpace::name()),RecordBase::m_alloc_ptr->m_label,
@@ -278,7 +278,7 @@ SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
)
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
@@ -297,7 +297,7 @@ SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
void * SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
allocate_tracked( const Kokkos::Experimental::HBWSpace & arg_space
, const std::string & arg_alloc_label
, const std::string & arg_alloc_label
, const size_t arg_alloc_size )
{
if ( ! arg_alloc_size ) return (void *) 0 ;


@@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -36,14 +36,14 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <algorithm>
#include <Kokkos_Macros.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
/*--------------------------------------------------------------------------*/
@@ -292,7 +292,7 @@ void * HostSpace::allocate( const size_t arg_alloc_size ) const
case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC" ; break ;
}
msg << " ]( " << arg_alloc_size << " ) FAILED" ;
if ( ptr == NULL ) { msg << " NULL" ; }
if ( ptr == NULL ) { msg << " NULL" ; }
else { msg << " NOT ALIGNED " << ptr ; }
std::cerr << msg.str() << std::endl ;
@@ -312,7 +312,7 @@ void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_
if ( m_alloc_mech == STD_MALLOC ) {
void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
free( alloc_ptr );
}
}
#if defined( KOKKOS_ENABLE_INTEL_MM_ALLOC )
else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
@@ -359,7 +359,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
SharedAllocationRecord< Kokkos::HostSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::HostSpace::name()),RecordBase::m_alloc_ptr->m_label,
@@ -388,7 +388,7 @@ SharedAllocationRecord( const Kokkos::HostSpace & arg_space
)
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
@@ -406,7 +406,7 @@ SharedAllocationRecord( const Kokkos::HostSpace & arg_space
void * SharedAllocationRecord< Kokkos::HostSpace , void >::
allocate_tracked( const Kokkos::HostSpace & arg_space
, const std::string & arg_alloc_label
, const std::string & arg_alloc_label
, const size_t arg_alloc_size )
{
if ( ! arg_alloc_size ) return (void *) 0 ;


@@ -0,0 +1,463 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <limits>
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_spinwait.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void HostThreadTeamData::organize_pool
( HostThreadTeamData * members[] , const int size )
{
bool ok = true ;
// Verify not already a member of a pool:
for ( int rank = 0 ; rank < size && ok ; ++rank ) {
ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
}
if ( ok ) {
int64_t * const root_scratch = members[0]->m_scratch ;
for ( int i = m_pool_rendezvous ; i < m_pool_reduce ; ++i ) {
root_scratch[i] = 0 ;
}
{
HostThreadTeamData ** const pool =
(HostThreadTeamData **) (root_scratch + m_pool_members);
// team size == 1, league size == pool_size
for ( int rank = 0 ; rank < size ; ++rank ) {
HostThreadTeamData * const mem = members[ rank ] ;
mem->m_pool_scratch = root_scratch ;
mem->m_team_scratch = mem->m_scratch ;
mem->m_pool_rank = rank ;
mem->m_pool_size = size ;
mem->m_team_base = rank ;
mem->m_team_rank = 0 ;
mem->m_team_size = 1 ;
mem->m_team_alloc = 1 ;
mem->m_league_rank = rank ;
mem->m_league_size = size ;
mem->m_pool_rendezvous_step = 0 ;
mem->m_team_rendezvous_step = 0 ;
pool[ rank ] = mem ;
}
}
Kokkos::memory_fence();
}
else {
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_pool ERROR pool already exists");
}
}
void HostThreadTeamData::disband_pool()
{
m_work_range.first = -1 ;
m_work_range.second = -1 ;
m_pool_scratch = 0 ;
m_team_scratch = 0 ;
m_pool_rank = 0 ;
m_pool_size = 1 ;
m_team_base = 0 ;
m_team_rank = 0 ;
m_team_size = 1 ;
m_team_alloc = 1 ;
m_league_rank = 0 ;
m_league_size = 1 ;
m_pool_rendezvous_step = 0 ;
m_team_rendezvous_step = 0 ;
}
int HostThreadTeamData::organize_team( const int team_size )
{
// Pool is initialized
const bool ok_pool = 0 != m_pool_scratch ;
// Team is not set
const bool ok_team =
m_team_scratch == m_scratch &&
m_team_base == m_pool_rank &&
m_team_rank == 0 &&
m_team_size == 1 &&
m_team_alloc == 1 &&
m_league_rank == m_pool_rank &&
m_league_size == m_pool_size ;
if ( ok_pool && ok_team ) {
if ( team_size <= 0 ) return 0 ; // No teams to organize
if ( team_size == 1 ) return 1 ; // Already organized in teams of one
HostThreadTeamData * const * const pool =
(HostThreadTeamData **) (m_pool_scratch + m_pool_members);
// "league_size" in this context is the number of concurrent teams
// that the pool can accommodate. Excess threads are idle.
const int league_size = m_pool_size / team_size ;
const int team_alloc_size = m_pool_size / league_size ;
const int team_alloc_rank = m_pool_rank % team_alloc_size ;
const int league_rank = m_pool_rank / team_alloc_size ;
const int team_base_rank = league_rank * team_alloc_size ;
m_team_scratch = pool[ team_base_rank ]->m_scratch ;
m_team_base = team_base_rank ;
// This needs to check overflow: if m_pool_size % team_alloc_size != 0
// there are two corner cases:
// (i) if team_alloc_size == team_size there might be a non-full
// zombie team around (for example m_pool_size = 5 and team_size = 2)
// (ii) if team_alloc > team_size then the last team might have fewer
// threads than the others
m_team_rank = ( team_base_rank + team_size <= m_pool_size ) &&
( team_alloc_rank < team_size ) ?
team_alloc_rank : -1;
m_team_size = team_size ;
m_team_alloc = team_alloc_size ;
m_league_rank = league_rank ;
m_league_size = league_size ;
m_team_rendezvous_step = 0 ;
if ( team_base_rank == m_pool_rank ) {
// Initialize team's rendezvous memory
for ( int i = m_team_rendezvous ; i < m_pool_reduce ; ++i ) {
m_scratch[i] = 0 ;
}
// Make sure team's rendezvous memory initialized
// is written before proceeding.
Kokkos::memory_fence();
}
// Organizing threads into a team performs a barrier across the
// entire pool to ensure proper initialization of the team
// rendezvous mechanism before a team rendezvous can be performed.
if ( pool_rendezvous() ) {
pool_rendezvous_release();
}
}
else {
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_team ERROR");
}
return 0 <= m_team_rank ;
}
void HostThreadTeamData::disband_team()
{
m_team_scratch = m_scratch ;
m_team_base = m_pool_rank ;
m_team_rank = 0 ;
m_team_size = 1 ;
m_team_alloc = 1 ;
m_league_rank = m_pool_rank ;
m_league_size = m_pool_size ;
m_team_rendezvous_step = 0 ;
}
//----------------------------------------------------------------------------
/* pattern for rendezvous
*
* if ( rendezvous() ) {
* ... all other threads are still in team_rendezvous() ...
* rendezvous_release();
* ... all other threads are released from team_rendezvous() ...
* }
*/
int HostThreadTeamData::rendezvous( int64_t * const buffer
, int & rendezvous_step
, int const size
, int const rank ) noexcept
{
enum : int { shift_byte = 3 };
enum : int { size_byte = ( 01 << shift_byte ) }; // == 8
enum : int { mask_byte = size_byte - 1 };
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
// Cycle step values: 1 <= step <= size_val_cycle
// An odd multiple of memory cycle so that when a memory location
// is reused it has a different value.
// Must be representable within a single byte: size_val_cycle < 16
enum : int { size_val_cycle = 3 * size_mem_cycle };
// Requires:
// Called by rank = [ 0 .. size )
// buffer aligned to int64_t[4]
// A sequence of rendezvous uses four cycled locations in memory
// and non-equal cycled synchronization values to
// 1) prevent rendezvous from overtaking one another and
// 2) give each spin wait location an int64_t[4] span
// so that it has its own cache line.
const int step = ( rendezvous_step % size_val_cycle ) + 1 ;
rendezvous_step = step ;
// The leading int64_t[4] span is for thread 0 to write
// and all other threads to read spin-wait.
// sync_offset is the index into this array for this step.
const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ;
union {
int64_t full ;
int8_t byte[8] ;
} value ;
if ( rank ) {
const int group_begin = rank << shift_byte ; // == rank * size_byte
if ( group_begin < size ) {
// This thread waits for threads
// [ group_begin .. group_begin + 8 )
// [ rank*8 .. rank*8 + 8 )
// to write to their designated bytes.
const int end = group_begin + size_byte < size
? size_byte : size - group_begin ;
value.full = 0 ;
for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step );
store_fence(); // This should not be needed but fixes #742
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value.full );
}
{
// This thread sets its designated byte.
// ( rank % size_byte ) +
// ( ( rank / size_byte ) * size_byte * size_mem_cycle ) +
// ( sync_offset * size_byte )
const int offset = ( rank & mask_byte )
+ ( ( rank & ~mask_byte ) << shift_mem_cycle )
+ ( sync_offset << shift_byte );
// All of this thread's previous memory stores must be complete before
// this thread stores the step value at this thread's designated byte
// in the shared synchronization array.
Kokkos::memory_fence();
((volatile int8_t*) buffer)[ offset ] = int8_t( step );
// Memory fence to push the previous store out
Kokkos::memory_fence();
}
// Wait for thread 0 to release all other threads
spinwait_until_equal( buffer[ step & mask_mem_cycle ] , int64_t(step) );
}
else {
// Thread 0 waits for threads [1..7]
// to write to their designated bytes.
const int end = size_byte < size ? 8 : size ;
value.full = 0 ;
for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step );
spinwait_until_equal( buffer[ sync_offset ], value.full );
}
return rank ? 0 : 1 ;
}
void HostThreadTeamData::
rendezvous_release( int64_t * const buffer
, int const rendezvous_step ) noexcept
{
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
// Requires:
// Called after team_rendezvous
// Called only by true == team_rendezvous(root)
// Memory fence to be sure all previous writes are complete:
Kokkos::memory_fence();
((volatile int64_t*) buffer)[ rendezvous_step & mask_mem_cycle ] =
int64_t( rendezvous_step );
// Memory fence to push the store out
Kokkos::memory_fence();
}
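The comment block before rendezvous() describes the intended usage: the root thread returns true, runs a serial section while every other thread spins, then releases the team. A self-contained analogue of that pattern built on std::atomic (an illustration only; the Kokkos version cycles buffer slots and per-rank byte lanes as documented above):

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

std::atomic<int> arrived{0};    // non-root threads that have checked in
std::atomic<int> released{0};   // generation counter bumped by the root

void worker( int rank , int size )
{
  const int gen = released.load();
  if ( 0 == rank ) {                                     // rank 0 plays the root
    while ( arrived.load() != size - 1 ) { /* spin */ }  // wait for the team
    std::printf( "rank 0 runs the serial section\n" );
    arrived.store( 0 );
    released.store( gen + 1 );                           // release the team
  }
  else {
    arrived.fetch_add( 1 );                              // check in ...
    while ( released.load() == gen ) { /* spin */ }      // ... and wait
  }
}

int main()
{
  std::vector<std::thread> pool;
  for ( int r = 0 ; r < 4 ; ++r ) pool.emplace_back( worker , r , 4 );
  for ( auto & t : pool ) t.join();
  return 0;
}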
//----------------------------------------------------------------------------
int HostThreadTeamData::get_work_stealing() noexcept
{
pair_int_t w( -1 , -1 );
if ( 1 == m_team_size || team_rendezvous() ) {
// Attempt first from beginning of my work range
for ( int attempt = m_work_range.first < m_work_range.second ; attempt ; ) {
// Query and attempt to update m_work_range
// from: [ w.first , w.second )
// to: [ w.first + 1 , w.second ) = w_new
//
// If w is invalid then is just a query.
const pair_int_t w_new( w.first + 1 , w.second );
w = Kokkos::atomic_compare_exchange( & m_work_range, w, w_new );
if ( w.first < w.second ) {
// m_work_range is viable
// If steal is successful then don't repeat attempt to steal
attempt = ! ( w_new.first == w.first + 1 &&
w_new.second == w.second );
}
else {
// m_work_range is not viable
w.first = -1 ;
w.second = -1 ;
attempt = 0 ;
}
}
if ( w.first == -1 && m_steal_rank != m_pool_rank ) {
HostThreadTeamData * const * const pool =
(HostThreadTeamData**)( m_pool_scratch + m_pool_members );
// Attempt from beginning failed, try to steal from end of neighbor
pair_int_t volatile * steal_range =
& ( pool[ m_steal_rank ]->m_work_range );
for ( int attempt = true ; attempt ; ) {
// Query and attempt to update steal_work_range
// from: [ w.first , w.second )
// to: [ w.first , w.second - 1 ) = w_new
//
// If w is invalid then is just a query.
const pair_int_t w_new( w.first , w.second - 1 );
w = Kokkos::atomic_compare_exchange( steal_range, w, w_new );
if ( w.first < w.second ) {
// steal_work_range is viable
// If steal is successful then don't repeat attempt to steal
attempt = ! ( w_new.first == w.first &&
w_new.second == w.second - 1 );
}
else {
// steal_work_range is not viable, move to next member
w.first = -1 ;
w.second = -1 ;
// We need to figure out whether the next team is active
// m_steal_rank + m_team_alloc could be the next base_rank to steal from
// but only if there are another m_team_size threads available so that that
// base rank has a full team.
m_steal_rank = m_steal_rank + m_team_alloc + m_team_size <= m_pool_size ?
m_steal_rank + m_team_alloc : 0;
steal_range = & ( pool[ m_steal_rank ]->m_work_range );
// If tried all other members then don't repeat attempt to steal
attempt = m_steal_rank != m_pool_rank ;
}
}
if ( w.first != -1 ) w.first = w.second - 1 ;
}
if ( 1 < m_team_size ) {
// Must share the work index
*((int volatile *) team_reduce()) = w.first ;
team_rendezvous_release();
}
}
else if ( 1 < m_team_size ) {
w.first = *((int volatile *) team_reduce());
}
// May exit because successfully stole work and w is good.
// May exit because no work left to steal and w = (-1,-1).
#if 0
fprintf(stdout,"HostThreadTeamData::get_work_stealing() pool(%d of %d) %d\n"
, m_pool_rank , m_pool_size , w.first );
fflush(stdout);
#endif
return w.first ;
}
} // namespace Impl
} // namespace Kokkos
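A backend consumes get_work_stealing() by repeatedly asking for a work index until -1 signals that nothing is left to claim or steal, roughly as follows (a sketch; 'data' and 'process_chunk' are placeholders, not Kokkos names):

for ( int w = data.get_work_stealing() ; 0 <= w ; w = data.get_work_stealing() ) {
  process_chunk( w );   // hypothetical: execute the claimed or stolen work index
}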

File diff suppressed because it is too large.


@@ -52,6 +52,10 @@ void memory_fence()
{
#if defined( __CUDA_ARCH__ )
__threadfence();
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"mfence" ::: "memory"
);
#elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \
( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) )
__sync_synchronize();
@@ -76,8 +80,8 @@ void store_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"sfence" ::: "memory"
);
"sfence" ::: "memory"
);
#else
memory_fence();
#endif
@@ -93,8 +97,8 @@ void load_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"lfence" ::: "memory"
);
"lfence" ::: "memory"
);
#else
memory_fence();
#endif
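These fences back the usual publish/consume pattern also visible in the HostThreadTeam rendezvous above: write the payload, fence, then set the flag. A minimal sketch ('payload' and 'ready' are illustrative globals, not Kokkos names):

#include <Kokkos_Core.hpp>

volatile int payload = 0 ;
volatile int ready   = 0 ;

void producer()
{
  payload = 42 ;
  Kokkos::memory_fence();   // payload must be visible before the flag
  ready = 1 ;
}

void consumer()
{
  while ( 0 == ready ) { /* spin */ }
  Kokkos::memory_fence();   // flag observed; do not hoist the payload load
  // 'payload' now reads 42
}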


@@ -129,8 +129,8 @@
#endif
#ifdef KOKKOS_HAVE_CUDA_RDC
#ifndef KOKKOS_ENABLE_CUDA_RDC
#define KOKKOS_ENABLE_CUDA_RDC KOKKOS_HAVE_CUDA_RDC
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE KOKKOS_HAVE_CUDA_RDC
#endif
#endif
@@ -242,9 +242,9 @@
#endif
#endif
#ifdef KOKKOS_HAVE_QTHREAD
#ifndef KOKKOS_ENABLE_QTHREAD
#define KOKKOS_ENABLE_QTHREAD KOKKOS_HAVE_QTHREAD
#ifdef KOKKOS_HAVE_QTHREADS
#ifndef KOKKOS_ENABLE_QTHREADS
#define KOKKOS_ENABLE_QTHREADS KOKKOS_HAVE_QTHREADS
#endif
#endif


@@ -43,7 +43,7 @@
#include <impl/Kokkos_Profiling_Interface.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <string.h>
namespace Kokkos {
@@ -84,21 +84,21 @@ namespace Kokkos {
(*endScanCallee)(kernelID);
}
}
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginReduceCallee) {
Kokkos::fence();
(*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
}
}
void endParallelReduce(const uint64_t kernelID) {
if(NULL != endReduceCallee) {
Kokkos::fence();
(*endReduceCallee)(kernelID);
}
}
void pushRegion(const std::string& kName) {
if( NULL != pushRegionCallee ) {


@@ -50,7 +50,7 @@
#include <string>
#include <cinttypes>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_DeviceInfo.hpp>
#include <dlfcn.h>
#include <iostream>
@@ -59,7 +59,7 @@
#define KOKKOSP_INTERFACE_VERSION 20150628
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
namespace Kokkos {
namespace Profiling {


@@ -0,0 +1,317 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_REDUCER_HPP
#define KOKKOS_IMPL_REDUCER_HPP
#include <impl/Kokkos_Traits.hpp>
//----------------------------------------------------------------------------
/* Reducer abstraction:
* 1) Provides 'join' operation
* 2) Provides 'init' operation
* 3) Provides 'copy' operation
* 4) Optionally provides result value in a memory space
*
* Created from:
* 1) Functor::operator()( destination , source )
* 2) Functor::{ join , init }
*/
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< typename value_type >
struct ReduceSum
{
KOKKOS_INLINE_FUNCTION static
void copy( value_type & dest
, value_type const & src ) noexcept
{ dest = src ; }
KOKKOS_INLINE_FUNCTION static
void init( value_type & dest ) noexcept
{ new( &dest ) value_type(); }
KOKKOS_INLINE_FUNCTION static
void join( value_type volatile & dest
, value_type const volatile & src ) noexcept
{ dest += src ; }
KOKKOS_INLINE_FUNCTION static
void join( value_type & dest
, value_type const & src ) noexcept
{ dest += src ; }
};
template< typename T
, class ReduceOp = ReduceSum< T >
, typename MemorySpace = void >
struct Reducer
: private ReduceOp
, private integral_nonzero_constant
< int , ( std::rank<T>::value == 1 ? std::extent<T>::value : 1 )>
{
private:
// Determine if T is simple array
enum : int { rank = std::rank<T>::value };
static_assert( rank <= 1 , "Kokkos::Impl::Reducer type is at most rank-one" );
using length_t =
integral_nonzero_constant<int,( rank == 1 ? std::extent<T>::value : 1 )> ;
public:
using reducer = Reducer ;
using memory_space = MemorySpace ;
using value_type = typename std::remove_extent<T>::type ;
using reference_type =
typename std::conditional< ( rank != 0 )
, value_type *
, value_type &
>::type ;
private:
//--------------------------------------------------------------------------
// Determine what functions 'ReduceOp' provides:
// copy( destination , source )
// init( destination )
//
// operator()( destination , source )
// join( destination , source )
//
// Provide defaults for missing optional operations
template< class R , typename = void>
struct COPY {
KOKKOS_INLINE_FUNCTION static
void copy( R const &
, value_type * dst
, value_type const * src ) { *dst = *src ; }
};
template< class R >
struct COPY< R , decltype( ((R*)0)->copy( *((value_type*)0)
, *((value_type const *)0) ) ) >
{
KOKKOS_INLINE_FUNCTION static
void copy( R const & r
, value_type * dst
, value_type const * src ) { r.copy( *dst , *src ); }
};
template< class R , typename = void >
struct INIT {
KOKKOS_INLINE_FUNCTION static
void init( R const & , value_type * dst ) { new(dst) value_type(); }
};
template< class R >
struct INIT< R , decltype( ((R*)0)->init( *((value_type*)0 ) ) ) >
{
KOKKOS_INLINE_FUNCTION static
void init( R const & r , value_type * dst ) { r.init( *dst ); }
};
template< class R , typename V , typename = void > struct JOIN
{
// If no join function then try operator()
KOKKOS_INLINE_FUNCTION static
void join( R const & r , V * dst , V const * src )
{ r.operator()(*dst,*src); }
};
template< class R , typename V >
struct JOIN< R , V , decltype( ((R*)0)->join ( *((V *)0) , *((V const *)0) ) ) >
{
// If has join function use it
KOKKOS_INLINE_FUNCTION static
void join( R const & r , V * dst , V const * src )
{ r.join(*dst,*src); }
};
//--------------------------------------------------------------------------
value_type * const m_result ;
template< int Rank >
KOKKOS_INLINE_FUNCTION
static constexpr
typename std::enable_if< ( 0 != Rank ) , reference_type >::type
ref( value_type * p ) noexcept { return p ; }
template< int Rank >
KOKKOS_INLINE_FUNCTION
static constexpr
typename std::enable_if< ( 0 == Rank ) , reference_type >::type
ref( value_type * p ) noexcept { return *p ; }
public:
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
constexpr int length() const noexcept
{ return length_t::value ; }
KOKKOS_INLINE_FUNCTION
value_type * data() const noexcept
{ return m_result ; }
KOKKOS_INLINE_FUNCTION
reference_type reference() const noexcept
{ return Reducer::template ref< rank >( m_result ); }
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
void copy( value_type * const dest
, value_type const * const src ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template COPY<ReduceOp>::copy( (ReduceOp &) *this , dest + i , src + i );
}
}
KOKKOS_INLINE_FUNCTION
void init( value_type * dest ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template INIT<ReduceOp>::init( (ReduceOp &) *this , dest + i );
}
}
KOKKOS_INLINE_FUNCTION
void join( value_type * const dest
, value_type const * const src ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template JOIN<ReduceOp,value_type>::join( (ReduceOp &) *this , dest + i , src + i );
}
}
KOKKOS_INLINE_FUNCTION
void join( value_type volatile * const dest
, value_type volatile const * const src ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template JOIN<ReduceOp,value_type volatile>::join( (ReduceOp &) *this , dest + i , src + i );
}
}
//--------------------------------------------------------------------------
template< typename ArgT >
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer
( ArgT * arg_value
, typename std::enable_if
< std::is_same<ArgT,value_type>::value &&
std::is_default_constructible< ReduceOp >::value
, int >::type arg_length = 1
) noexcept
: ReduceOp(), length_t( arg_length ), m_result( arg_value ) {}
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( ReduceOp const & arg_op
, value_type * arg_value = 0
, int arg_length = 1 ) noexcept
: ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( ReduceOp && arg_op
, value_type * arg_value = 0
, int arg_length = 1 ) noexcept
: ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
Reducer( Reducer const & ) = default ;
Reducer( Reducer && ) = default ;
Reducer & operator = ( Reducer const & ) = default ;
Reducer & operator = ( Reducer && ) = default ;
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< typename ValueType >
constexpr
Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >
Sum( ValueType & arg_value )
{
static_assert( std::is_trivial<ValueType>::value
, "Kokkos reducer requires trivial value type" );
return Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >( & arg_value );
}
template< typename ValueType >
constexpr
Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >
Sum( ValueType * arg_value , int arg_length )
{
static_assert( std::is_trivial<ValueType>::value
, "Kokkos reducer requires trivial value type" );
return Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >( arg_value , arg_length );
}
//----------------------------------------------------------------------------
template< typename ValueType , class JoinType >
Impl::Reducer< ValueType , JoinType >
reducer( ValueType & value , JoinType const & lambda )
{
return Impl::Reducer< ValueType , JoinType >( lambda , & value );
}
} // namespace Kokkos
#endif /* #ifndef KOKKOS_IMPL_REDUCER_HPP */
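A brief host-side sketch of how the factory functions above might be called directly. This is illustrative only: it assumes Impl::ReduceSum (defined earlier in this header) joins by addition and that the code runs on the host.

double value = 0.0 ;
auto sum = Kokkos::Sum( value );          // rank-0 reducer referencing 'value'
double partial = 42.0 ;
sum.init( sum.data() );                   // reset 'value' to the identity (0.0)
sum.join( sum.data() , & partial );       // value += partial, assuming ReduceSum adds
// sum.reference() now refers to 42.0

double max_value = 0.0 ;
auto custom = Kokkos::reducer( max_value ,
    [] ( double & dst , double const & src ) { if ( dst < src ) dst = src ; } );
custom.join( custom.data() , & partial ); // no join member on the lambda, so operator() is used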

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -53,63 +53,126 @@
namespace Kokkos {
namespace Impl {
namespace SerialImpl {
namespace {
Sentinel::Sentinel() : m_scratch(0), m_reduce_end(0), m_shared_end(0) {}
HostThreadTeamData g_serial_thread_team_data ;
Sentinel::~Sentinel()
{
if ( m_scratch ) { free( m_scratch ); }
m_scratch = 0 ;
m_reduce_end = 0 ;
m_shared_end = 0 ;
}
Sentinel & Sentinel::singleton()
// Resize thread team data scratch memory
void serial_resize_thread_team_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes )
{
static Sentinel s ; return s ;
if ( pool_reduce_bytes < 512 ) pool_reduce_bytes = 512 ;
if ( team_reduce_bytes < 512 ) team_reduce_bytes = 512 ;
const size_t old_pool_reduce = g_serial_thread_team_data.pool_reduce_bytes();
const size_t old_team_reduce = g_serial_thread_team_data.team_reduce_bytes();
const size_t old_team_shared = g_serial_thread_team_data.team_shared_bytes();
const size_t old_thread_local = g_serial_thread_team_data.thread_local_bytes();
const size_t old_alloc_bytes = g_serial_thread_team_data.scratch_bytes();
// Allocate if any of the old allocations is too small:
const bool allocate = ( old_pool_reduce < pool_reduce_bytes ) ||
( old_team_reduce < team_reduce_bytes ) ||
( old_team_shared < team_shared_bytes ) ||
( old_thread_local < thread_local_bytes );
if ( allocate ) {
Kokkos::HostSpace space ;
if ( old_alloc_bytes ) {
g_serial_thread_team_data.disband_team();
g_serial_thread_team_data.disband_pool();
space.deallocate( g_serial_thread_team_data.scratch_buffer()
, g_serial_thread_team_data.scratch_bytes() );
}
if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
const size_t alloc_bytes =
HostThreadTeamData::scratch_size( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
void * const ptr = space.allocate( alloc_bytes );
g_serial_thread_team_data.
scratch_assign( ((char *)ptr)
, alloc_bytes
, pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
HostThreadTeamData * pool[1] = { & g_serial_thread_team_data };
g_serial_thread_team_data.organize_pool( pool , 1 );
g_serial_thread_team_data.organize_team(1);
}
}
inline
unsigned align( unsigned n )
// Get the thread team data structure for the serial execution space
HostThreadTeamData * serial_get_thread_team_data()
{
enum { ALIGN = 0x0100 /* 256 */ , MASK = ALIGN - 1 };
return ( n + MASK ) & ~MASK ;
return & g_serial_thread_team_data ;
}
} // namespace
SerialTeamMember::SerialTeamMember( int arg_league_rank
, int arg_league_size
, int arg_shared_size
)
: m_space( ((char *) SerialImpl::Sentinel::singleton().m_scratch) + SerialImpl::Sentinel::singleton().m_reduce_end
, arg_shared_size )
, m_league_rank( arg_league_rank )
, m_league_size( arg_league_size )
{}
} // namespace Impl
} // namespace Kokkos
void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
/*--------------------------------------------------------------------------*/
namespace Kokkos {
int Serial::is_initialized()
{
static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton();
return 1 ;
}
reduce_size = Impl::SerialImpl::align( reduce_size );
shared_size = Impl::SerialImpl::align( shared_size );
void Serial::initialize( unsigned threads_count
, unsigned use_numa_count
, unsigned use_cores_per_numa
, bool allow_asynchronous_threadpool )
{
(void) threads_count;
(void) use_numa_count;
(void) use_cores_per_numa;
(void) allow_asynchronous_threadpool;
if ( ( s.m_reduce_end < reduce_size ) ||
( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
if ( s.m_scratch ) { free( s.m_scratch ); }
void Serial::finalize()
{
if ( Impl::g_serial_thread_team_data.scratch_buffer() ) {
Impl::g_serial_thread_team_data.disband_team();
Impl::g_serial_thread_team_data.disband_pool();
if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
Kokkos::HostSpace space ;
s.m_scratch = malloc( s.m_shared_end );
space.deallocate( Impl::g_serial_thread_team_data.scratch_buffer()
, Impl::g_serial_thread_team_data.scratch_bytes() );
Impl::g_serial_thread_team_data.scratch_assign( (void*) 0, 0, 0, 0, 0, 0 );
}
return s.m_scratch ;
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
} // namespace Kokkos

View File

@ -62,11 +62,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member exec ;
Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
Member exec( *data );
// Loop until all queues are empty
while ( 0 < queue->m_ready_count ) {
@ -75,13 +77,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
}
}
if ( end != task ) {
// pop_task resulted in lock == task->m_next
// pop_ready_task resulted in lock == task->m_next
// In the executing state
(*task->m_apply)( task , & exec );
@ -113,11 +115,13 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member exec ;
Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
Member exec( *data );
// Loop until no runnable task
@ -129,7 +133,7 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
}
}

View File

@ -65,6 +65,7 @@ public:
using memory_space = Kokkos::HostSpace ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
static
void iff_single_thread_recursive_execute( queue_type * const );
@ -72,237 +73,19 @@ public:
static
void execute( queue_type * const );
template< typename FunctorType >
template< typename TaskType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::Serial
, typename FunctorType::value_type
, FunctorType
> ;
*ptr = TaskType::apply ;
}
typename TaskType::function_type
get_function_pointer() { return TaskType::apply ; }
};
extern template class TaskQueue< Kokkos::Serial > ;
//----------------------------------------------------------------------------
template<>
class TaskExec< Kokkos::Serial >
{
public:
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
};
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
{
typedef iType index_type;
const iType start ;
const iType end ;
enum {increment = 1};
//const TaskExec< Kokkos::Serial > & thread;
TaskExec< Kokkos::Serial > & thread;
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct
//( const TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
: start(0)
, end(arg_count)
, thread(arg_thread)
{}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct
//( const TaskExec< Kokkos::Serial > & arg_thread
( TaskExec< Kokkos::Serial > & arg_thread
, const iType& arg_start
, const iType & arg_end
)
: start( arg_start )
, end( arg_end)
, thread( arg_thread )
{}
};
//----------------------------------------------------------------------------
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
{
typedef iType index_type;
const iType start ;
const iType end ;
enum {increment = 1};
TaskExec< Kokkos::Serial > & thread;
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct
( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
: start( 0 )
, end(arg_count)
, thread(arg_thread)
{}
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
// OMP version needs non-const TaskExec
template< typename iType >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >
TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >( thread, count );
}
// OMP version needs non-const TaskExec
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::TaskExec< Kokkos::Serial > >
TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType1 & start, const iType2 & end )
{
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >(
thread, iType(start), iType(end) );
}
// OMP version needs non-const TaskExec
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >
ThreadVectorRange
( Impl::TaskExec< Kokkos::Serial > & thread
, const iType & count )
{
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >(thread,count);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
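A hypothetical call site for the loop above, as it would appear inside a task's operator() that receives a TaskExec< Kokkos::Serial > team member; 'member', 'N' and 'results' are placeholders and not part of this header:

Kokkos::parallel_for( Kokkos::TeamThreadRange( member , N ) ,
                      [&] ( int i ) { results[i] = 2.0 * i ; } );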
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i, result);
initialized_result = result;
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i, result);
initialized_result = result;
}
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
initialized_result = ValueType();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
initialized_result+=tmp;
}
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
ValueType result = initialized_result;
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
initialized_result = result;
}
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda)
{
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_SERIAL_TASK_HPP */

View File

@ -1,693 +0,0 @@
/*
Copyright (c) 2014, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef KOKKOS_SYNCHRONIC_HPP
#define KOKKOS_SYNCHRONIC_HPP
#include <impl/Kokkos_Synchronic_Config.hpp>
#include <atomic>
#include <chrono>
#include <thread>
#include <functional>
#include <algorithm>
namespace Kokkos {
namespace Impl {
enum notify_hint {
notify_all,
notify_one,
notify_none
};
enum expect_hint {
expect_urgent,
expect_delay
};
namespace Details {
template <class S, class T>
bool __synchronic_spin_wait_for_update(S const& arg, T const& nval, int attempts) noexcept {
int i = 0;
for(;i < __SYNCHRONIC_SPIN_RELAX(attempts); ++i)
if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
return true;
else
__synchronic_relax();
for(;i < attempts; ++i)
if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
return true;
else
__synchronic_yield();
return false;
}
struct __exponential_backoff {
__exponential_backoff(int arg_maximum=512) : maximum(arg_maximum), microseconds(8), x(123456789), y(362436069), z(521288629) {
}
static inline void sleep_for(std::chrono::microseconds const& time) {
auto t = time.count();
if(__builtin_expect(t > 75,0)) {
portable_sleep(time);
}
else if(__builtin_expect(t > 25,0))
__synchronic_yield();
else
__synchronic_relax();
}
void sleep_for_step() {
sleep_for(step());
}
std::chrono::microseconds step() {
float const f = ranfu();
int const t = int(microseconds * f);
if(__builtin_expect(f >= 0.95f,0))
microseconds = 8;
else
microseconds = (std::min)(microseconds>>1,maximum);
return std::chrono::microseconds(t);
}
private :
int maximum, microseconds, x, y, z;
int xorshf96() {
int t;
x ^= x << 16; x ^= x >> 5; x ^= x << 1;
t = x; x = y; y = z; z = t ^ x ^ y;
return z;
}
float ranfu() {
return (float)(xorshf96()&(~0UL>>1)) / (float)(~0UL>>1);
}
};
template <class T, class Enable = void>
struct __synchronic_base {
protected:
std::atomic<T> atom;
void notify(notify_hint = notify_all) noexcept {
}
void notify(notify_hint = notify_all) volatile noexcept {
}
public :
__synchronic_base() noexcept = default;
constexpr __synchronic_base(T v) noexcept : atom(v) { }
__synchronic_base(const __synchronic_base&) = delete;
~__synchronic_base() { }
__synchronic_base& operator=(const __synchronic_base&) = delete;
__synchronic_base& operator=(const __synchronic_base&) volatile = delete;
void expect_update(T val, expect_hint = expect_urgent) const noexcept {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
while(atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
}
}
void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
while(atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
remains = then - std::chrono::high_resolution_clock::now();
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
remains = then - std::chrono::high_resolution_clock::now();
}
}
};
#ifdef __SYNCHRONIC_COMPATIBLE
template <class T>
struct __synchronic_base<T, typename std::enable_if<__SYNCHRONIC_COMPATIBLE(T)>::type> {
public:
std::atomic<T> atom;
void notify(notify_hint hint = notify_all) noexcept {
if(__builtin_expect(hint == notify_none,1))
return;
auto const x = count.fetch_add(0,std::memory_order_acq_rel);
if(__builtin_expect(x,0)) {
if(__builtin_expect(hint == notify_all,1))
__synchronic_wake_all(&atom);
else
__synchronic_wake_one(&atom);
}
}
void notify(notify_hint hint = notify_all) volatile noexcept {
if(__builtin_expect(hint == notify_none,1))
return;
auto const x = count.fetch_add(0,std::memory_order_acq_rel);
if(__builtin_expect(x,0)) {
if(__builtin_expect(hint == notify_all,1))
__synchronic_wake_all_volatile(&atom);
else
__synchronic_wake_one_volatile(&atom);
}
}
public :
__synchronic_base() noexcept : count(0) { }
constexpr __synchronic_base(T v) noexcept : atom(v), count(0) { }
__synchronic_base(const __synchronic_base&) = delete;
~__synchronic_base() { }
__synchronic_base& operator=(const __synchronic_base&) = delete;
__synchronic_base& operator=(const __synchronic_base&) volatile = delete;
void expect_update(T val, expect_hint = expect_urgent) const noexcept {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait(&atom,val);
count.fetch_add(-1,std::memory_order_acquire);
}
}
void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait_volatile(&atom,val);
count.fetch_add(-1,std::memory_order_acquire);
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait_timed(&atom,val,remains);
count.fetch_add(-1,std::memory_order_acquire);
remains = then - std::chrono::high_resolution_clock::now();
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait_timed_volatile(&atom,val,remains);
count.fetch_add(-1,std::memory_order_acquire);
remains = then - std::chrono::high_resolution_clock::now();
}
}
private:
mutable std::atomic<int> count;
};
#endif
template <class T, class Enable = void>
struct __synchronic : public __synchronic_base<T> {
__synchronic() noexcept = default;
constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
__synchronic(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) volatile = delete;
};
template <class T>
struct __synchronic<T,typename std::enable_if<std::is_integral<T>::value>::type> : public __synchronic_base<T> {
T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_and(v,m);
this->notify(n);
return t;
}
T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_and(v,m);
this->notify(n);
return t;
}
T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_or(v,m);
this->notify(n);
return t;
}
T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_or(v,m);
this->notify(n);
return t;
}
T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_xor(v,m);
this->notify(n);
return t;
}
T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_xor(v,m);
this->notify(n);
return t;
}
__synchronic() noexcept = default;
constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
__synchronic(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) volatile = delete;
T operator=(T v) volatile noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T operator=(T v) noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T operator++(int) volatile noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T operator++(int) noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T operator--(int) volatile noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T operator--(int) noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T operator++() volatile noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T operator++() noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T operator--() volatile noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T operator--() noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T operator+=(T v) volatile noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T operator+=(T v) noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T operator-=(T v) volatile noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
T operator-=(T v) noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
T operator&=(T v) volatile noexcept {
auto const t = this->atom &= v;
this->notify();
return t;
}
T operator&=(T v) noexcept {
auto const t = this->atom &= v;
this->notify();
return t;
}
T operator|=(T v) volatile noexcept {
auto const t = this->atom |= v;
this->notify();
return t;
}
T operator|=(T v) noexcept {
auto const t = this->atom |= v;
this->notify();
return t;
}
T operator^=(T v) volatile noexcept {
auto const t = this->atom ^= v;
this->notify();
return t;
}
T operator^=(T v) noexcept {
auto const t = this->atom ^= v;
this->notify();
return t;
}
};
template <class T>
struct __synchronic<T*> : public __synchronic_base<T*> {
T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
__synchronic() noexcept = default;
constexpr __synchronic(T* v) noexcept : __synchronic_base<T*>(v) { }
__synchronic(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) volatile = delete;
T* operator=(T* v) volatile noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T* operator=(T* v) noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T* operator++(int) volatile noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T* operator++(int) noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T* operator--(int) volatile noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T* operator--(int) noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T* operator++() volatile noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T* operator++() noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T* operator--() volatile noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T* operator--() noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T* operator+=(ptrdiff_t v) volatile noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T* operator+=(ptrdiff_t v) noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T* operator-=(ptrdiff_t v) volatile noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
T* operator-=(ptrdiff_t v) noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
};
} //namespace Details
template <class T>
struct synchronic : public Details::__synchronic<T> {
bool is_lock_free() const volatile noexcept { return this->atom.is_lock_free(); }
bool is_lock_free() const noexcept { return this->atom.is_lock_free(); }
void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
this->atom.store(v,m);
this->notify(n);
}
void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
this->atom.store(v,m);
this->notify(n);
}
T load(std::memory_order m = std::memory_order_seq_cst) const volatile noexcept { return this->atom.load(m); }
T load(std::memory_order m = std::memory_order_seq_cst) const noexcept { return this->atom.load(m); }
operator T() const volatile noexcept { return (T)this->atom; }
operator T() const noexcept { return (T)this->atom; }
T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.exchange(v,m);
this->notify(n);
return t;
}
T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.exchange(v,m);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m1,m2);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m1, m2);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m);
this->notify(n);
return t;
}
synchronic() noexcept = default;
constexpr synchronic(T val) noexcept : Details::__synchronic<T>(val) { }
synchronic(const synchronic&) = delete;
~synchronic() { }
synchronic& operator=(const synchronic&) = delete;
synchronic& operator=(const synchronic&) volatile = delete;
T operator=(T val) noexcept {
return Details::__synchronic<T>::operator=(val);
}
T operator=(T val) volatile noexcept {
return Details::__synchronic<T>::operator=(val);
}
T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
Details::__synchronic<T>::expect_update(val,h);
return load(order);
}
T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
Details::__synchronic<T>::expect_update(val,h);
return load(order);
}
T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
Details::__synchronic<T>::expect_update(nval,h);
return load(order);
}
T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
expect_update(nval,h);
return load(order);
}
template <class Rep, class Period>
void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const {
Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
}
template < class Rep, class Period>
void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const volatile {
Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
}
};
#include <inttypes.h>
typedef synchronic<char> synchronic_char;
typedef synchronic<signed char> synchronic_schar;
typedef synchronic<unsigned char> synchronic_uchar;
typedef synchronic<short> synchronic_short;
typedef synchronic<unsigned short> synchronic_ushort;
typedef synchronic<int> synchronic_int;
typedef synchronic<unsigned int> synchronic_uint;
typedef synchronic<long> synchronic_long;
typedef synchronic<unsigned long> synchronic_ulong;
typedef synchronic<long long> synchronic_llong;
typedef synchronic<unsigned long long> synchronic_ullong;
//typedef synchronic<char16_t> synchronic_char16_t;
//typedef synchronic<char32_t> synchronic_char32_t;
typedef synchronic<wchar_t> synchronic_wchar_t;
typedef synchronic<int_least8_t> synchronic_int_least8_t;
typedef synchronic<uint_least8_t> synchronic_uint_least8_t;
typedef synchronic<int_least16_t> synchronic_int_least16_t;
typedef synchronic<uint_least16_t> synchronic_uint_least16_t;
typedef synchronic<int_least32_t> synchronic_int_least32_t;
typedef synchronic<uint_least32_t> synchronic_uint_least32_t;
//typedef synchronic<int_least_64_t> synchronic_int_least_64_t;
typedef synchronic<uint_least64_t> synchronic_uint_least64_t;
typedef synchronic<int_fast8_t> synchronic_int_fast8_t;
typedef synchronic<uint_fast8_t> synchronic_uint_fast8_t;
typedef synchronic<int_fast16_t> synchronic_int_fast16_t;
typedef synchronic<uint_fast16_t> synchronic_uint_fast16_t;
typedef synchronic<int_fast32_t> synchronic_int_fast32_t;
typedef synchronic<uint_fast32_t> synchronic_uint_fast32_t;
typedef synchronic<int_fast64_t> synchronic_int_fast64_t;
typedef synchronic<uint_fast64_t> synchronic_uint_fast64_t;
typedef synchronic<intptr_t> synchronic_intptr_t;
typedef synchronic<uintptr_t> synchronic_uintptr_t;
typedef synchronic<size_t> synchronic_size_t;
typedef synchronic<ptrdiff_t> synchronic_ptrdiff_t;
typedef synchronic<intmax_t> synchronic_intmax_t;
typedef synchronic<uintmax_t> synchronic_uintmax_t;
}
}
#endif //KOKKOS_SYNCHRONIC_HPP

View File

@ -1,169 +0,0 @@
/*
Copyright (c) 2014, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef KOKKOS_SYNCHRONIC_CONFIG_H
#define KOKKOS_SYNCHRONIC_CONFIG_H
#include <thread>
#include <chrono>
namespace Kokkos {
namespace Impl {
//the default yield function used inside the implementation is the Standard one
#define __synchronic_yield std::this_thread::yield
#define __synchronic_relax __synchronic_yield
#if defined(_MSC_VER)
//this is a handy GCC optimization that I use inside the implementation
#define __builtin_expect(condition,common) condition
#if _MSC_VER <= 1800
//using certain keywords that VC++ temporarily doesn't support
#define _ALLOW_KEYWORD_MACROS
#define noexcept
#define constexpr
#endif
//yes, I define multiple assignment operators
#pragma warning(disable:4522)
//I don't understand how Windows is so bad at timing functions, but is OK
//with straight-up yield loops
#define __do_backoff(b) __synchronic_yield()
#else
#define __do_backoff(b) b.sleep_for_step()
#endif
//certain platforms have efficient support for spin-waiting built into the operating system
#if defined(__linux__) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0602)
#if defined(_WIN32_WINNT)
#include <winsock2.h>
#include <Windows.h>
//the combination of WaitOnAddress and WakeByAddressAll is supported on Windows 8.1+
#define __synchronic_wait(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
#define __synchronic_wait_timed(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
#define __synchronic_wake_one(x) WakeByAddressSingle((PVOID)x)
#define __synchronic_wake_all(x) WakeByAddressAll((PVOID)x)
#define __synchronic_wait_volatile(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
#define __synchronic_wait_timed_volatile(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
#define __synchronic_wake_one_volatile(x) WakeByAddressSingle((PVOID)x)
#define __synchronic_wake_all_volatile(x) WakeByAddressAll((PVOID)x)
#define __SYNCHRONIC_COMPATIBLE(x) (std::is_pod<x>::value && (sizeof(x) <= 8))
inline void native_sleep(unsigned long microseconds)
{
// What to do if microseconds is < 1000?
Sleep(microseconds / 1000);
}
inline void native_yield()
{
SwitchToThread();
}
#elif defined(__linux__)
#include <chrono>
#include <time.h>
#include <unistd.h>
#include <pthread.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <climits>
#include <cassert>
template < class Rep, class Period>
inline timespec to_timespec(std::chrono::duration<Rep,Period> const& delta) {
struct timespec ts;
ts.tv_sec = static_cast<long>(std::chrono::duration_cast<std::chrono::seconds>(delta).count());
assert(!ts.tv_sec);
ts.tv_nsec = static_cast<long>(std::chrono::duration_cast<std::chrono::nanoseconds>(delta).count());
return ts;
}
inline long futex(void const* addr1, int op, int val1) {
return syscall(SYS_futex, addr1, op, val1, 0, 0, 0);
}
inline long futex(void const* addr1, int op, int val1, struct timespec timeout) {
return syscall(SYS_futex, addr1, op, val1, &timeout, 0, 0);
}
inline void native_sleep(unsigned long microseconds)
{
usleep(microseconds);
}
inline void native_yield()
{
pthread_yield();
}
//the combination of SYS_futex(WAIT) and SYS_futex(WAKE) is supported on all recent Linux distributions
#define __synchronic_wait(x,v) futex(x, FUTEX_WAIT_PRIVATE, v)
#define __synchronic_wait_timed(x,v,t) futex(x, FUTEX_WAIT_PRIVATE, v, to_timespec(t))
#define __synchronic_wake_one(x) futex(x, FUTEX_WAKE_PRIVATE, 1)
#define __synchronic_wake_all(x) futex(x, FUTEX_WAKE_PRIVATE, INT_MAX)
#define __synchronic_wait_volatile(x,v) futex(x, FUTEX_WAIT, v)
#define __synchronic_wait_volatile_timed(x,v,t) futex(x, FUTEX_WAIT, v, to_timespec(t))
#define __synchronic_wake_one_volatile(x) futex(x, FUTEX_WAKE, 1)
#define __synchronic_wake_all_volatile(x) futex(x, FUTEX_WAKE, INT_MAX)
#define __SYNCHRONIC_COMPATIBLE(x) (std::is_integral<x>::value && (sizeof(x) <= 4))
//the yield function on Linux is better replaced by sched_yield, which is tuned for spin-waiting
#undef __synchronic_yield
#define __synchronic_yield sched_yield
//for extremely short wait times, just let another hyper-thread run
#undef __synchronic_relax
#define __synchronic_relax() asm volatile("rep; nop" ::: "memory")
#endif
#endif
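For reference, a minimal standalone illustration of the Linux futex wait/wake pair that the macros above wrap; this is a hypothetical example and not part of this header:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <climits>
#include <atomic>

static std::atomic<int> flag(0);

inline void wait_until_set()
{
  // Sleep in the kernel while 'flag' still holds the expected value 0.
  while ( flag.load( std::memory_order_acquire ) == 0 )
    syscall( SYS_futex , &flag , FUTEX_WAIT_PRIVATE , 0 , nullptr , nullptr , 0 );
}

inline void set_and_wake_all()
{
  flag.store( 1 , std::memory_order_release );
  syscall( SYS_futex , &flag , FUTEX_WAKE_PRIVATE , INT_MAX , nullptr , nullptr , 0 );
}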
#ifdef _GLIBCXX_USE_NANOSLEEP
inline void portable_sleep(std::chrono::microseconds const& time)
{ std::this_thread::sleep_for(time); }
#else
inline void portable_sleep(std::chrono::microseconds const& time)
{ native_sleep(time.count()); }
#endif
#ifdef _GLIBCXX_USE_SCHED_YIELD
inline void portable_yield()
{ std::this_thread::yield(); }
#else
inline void portable_yield()
{ native_yield(); }
#endif
//this is the number of times we initially spin, on the first wait attempt
#define __SYNCHRONIC_SPIN_COUNT_A 16
//this is how we decide to yield instead of just spinning, 'c' is the current trip count
//#define __SYNCHRONIC_SPIN_YIELD(c) true
#define __SYNCHRONIC_SPIN_RELAX(c) (c>>3)
//this is the number of times we normally spin, on every subsequent wait attempt
#define __SYNCHRONIC_SPIN_COUNT_B 8
}
}
#endif //KOKKOS_SYNCHRONIC_CONFIG_H
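A standalone sketch of how these tuning knobs are typically consumed by the spin-wait in Kokkos_Synchronic.hpp: relax for the first fraction of the attempts, then yield between polls, and let the caller fall back to a real wait once the budget is exhausted (illustrative only, not part of this header):

#include <atomic>

inline bool spin_for_change( std::atomic<int> const & a , int expected_old )
{
  int i = 0 ;
  for ( ; i < __SYNCHRONIC_SPIN_RELAX(__SYNCHRONIC_SPIN_COUNT_A) ; ++i ) {
    if ( a.load( std::memory_order_relaxed ) != expected_old ) return true ;
    __synchronic_relax();   // cheapest wait: pause or hyper-thread yield
  }
  for ( ; i < __SYNCHRONIC_SPIN_COUNT_A ; ++i ) {
    if ( a.load( std::memory_order_relaxed ) != expected_old ) return true ;
    __synchronic_yield();   // give up the time slice between polls
  }
  return false ;            // caller escalates to futex wait or sleep
}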

View File

@ -1,162 +0,0 @@
/*
Copyright (c) 2014, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef KOKKOS_SYNCHRONIC_N3998_HPP
#define KOKKOS_SYNCHRONIC_N3998_HPP
#include <impl/Kokkos_Synchronic.hpp>
#include <functional>
/*
In the section below, a synchronization point represents a point at which a
thread may block until a given synchronization condition has been reached or
at which it may notify other threads that a synchronization condition has
been achieved.
*/
namespace Kokkos { namespace Impl {
/*
A latch maintains an internal counter that is initialized when the latch
is created. The synchronization condition is reached when the counter is
decremented to 0. Threads may block at a synchronization point waiting
for the condition to be reached. When the condition is reached, any such
blocked threads will be released.
*/
struct latch {
latch(int val) : count(val), released(false) { }
latch(const latch&) = delete;
latch& operator=(const latch&) = delete;
~latch( ) { }
void arrive( ) {
__arrive( );
}
void arrive_and_wait( ) {
if(!__arrive( ))
wait( );
}
void wait( ) {
while(!released.load_when_not_equal(false,std::memory_order_acquire))
;
}
bool try_wait( ) {
return released.load(std::memory_order_acquire);
}
private:
bool __arrive( ) {
if(count.fetch_add(-1,std::memory_order_release)!=1)
return false;
released.store(true,std::memory_order_release);
return true;
}
std::atomic<int> count;
synchronic<bool> released;
};
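A hypothetical host-only use of the latch above: the main thread blocks until both workers have signalled arrival. Illustrative only; it assumes the synchronic machinery in this header builds for the target platform (std::thread is available through the includes above).

inline void latch_example()
{
  latch ready( 2 );                                  // two workers must arrive
  std::thread w1( [&ready]() { /* ...one-time setup... */ ready.arrive(); } );
  std::thread w2( [&ready]() { /* ...one-time setup... */ ready.arrive(); } );
  ready.wait();                                      // released once the counter reaches 0
  w1.join(); w2.join();
}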
/*
A barrier is created with an initial value representing the number of threads
that can arrive at the synchronization point. When that many threads have
arrived, the synchronization condition is reached and the threads are
released. The barrier will then reset, and may be reused for a new cycle, in
which the same set of threads may arrive again at the synchronization point.
The same set of threads shall arrive at the barrier in each cycle, otherwise
the behaviour is undefined.
*/
struct barrier {
barrier(int val) : expected(val), arrived(0), nexpected(val), epoch(0) { }
barrier(const barrier&) = delete;
barrier& operator=(const barrier&) = delete;
~barrier() { }
void arrive_and_wait() {
int const myepoch = epoch.load(std::memory_order_relaxed);
if(!__arrive(myepoch))
while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
;
}
void arrive_and_drop() {
nexpected.fetch_add(-1,std::memory_order_relaxed);
__arrive(epoch.load(std::memory_order_relaxed));
}
private:
bool __arrive(int const myepoch) {
int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
if(__builtin_expect(myresult == expected,0)) {
expected = nexpected.load(std::memory_order_relaxed);
arrived.store(0,std::memory_order_relaxed);
epoch.store(myepoch+1,std::memory_order_release);
return true;
}
return false;
}
int expected;
std::atomic<int> arrived, nexpected;
synchronic<int> epoch;
};
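A similar hypothetical sketch for the barrier above: both threads meet at the synchronization point once per step, so neither can begin step k+1 before the other has finished step k (illustrative only):

inline void barrier_example()
{
  barrier sync( 2 );
  auto work = [&sync]() {
    for ( int step = 0 ; step < 10 ; ++step ) {
      /* ...compute phase 'step'... */
      sync.arrive_and_wait();                        // all participants leave together
    }
  };
  std::thread a( work ), b( work );
  a.join(); b.join();
}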
/*
A notifying barrier behaves as a barrier, but is constructed with a callable
completion function that is invoked after all threads have arrived at the
synchronization point, and before the synchronization condition is reached.
The completion may modify the set of threads that arrives at the barrier in
each cycle.
*/
struct notifying_barrier {
template <typename T>
notifying_barrier(int val, T && f) : expected(val), arrived(0), nexpected(val), epoch(0), completion(std::forward<T>(f)) { }
notifying_barrier(const notifying_barrier&) = delete;
notifying_barrier& operator=(const notifying_barrier&) = delete;
~notifying_barrier( ) { }
void arrive_and_wait() {
int const myepoch = epoch.load(std::memory_order_relaxed);
if(!__arrive(myepoch))
while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
;
}
void arrive_and_drop() {
nexpected.fetch_add(-1,std::memory_order_relaxed);
__arrive(epoch.load(std::memory_order_relaxed));
}
private:
bool __arrive(int const myepoch) {
int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
if(__builtin_expect(myresult == expected,0)) {
int const newexpected = completion();
expected = newexpected ? newexpected : nexpected.load(std::memory_order_relaxed);
arrived.store(0,std::memory_order_relaxed);
epoch.store(myepoch+1,std::memory_order_release);
return true;
}
return false;
}
int expected;
std::atomic<int> arrived, nexpected;
synchronic<int> epoch;
std::function<int()> completion;
};
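And for the notifying_barrier: the completion function runs exactly once per cycle, after every thread has arrived and before any is released; returning 0 leaves the expected count unchanged (hypothetical example):

inline void notifying_barrier_example()
{
  int cycles = 0 ;
  notifying_barrier sync( 2 , [&cycles]() -> int { ++cycles ; return 0 ; } );
  auto work = [&sync]() { sync.arrive_and_wait(); sync.arrive_and_wait(); };
  std::thread a( work ), b( work );
  a.join(); b.join();                                // cycles == 2 afterwards
}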
}}
#endif //KOKKOS_SYNCHRONIC_N3998_HPP

View File

@ -76,9 +76,6 @@ namespace Impl {
template< typename Space , typename ResultType , typename FunctorType >
class TaskBase ;
template< typename Space >
class TaskExec ;
} /* namespace Impl */
} /* namespace Kokkos */
@ -149,8 +146,8 @@ private:
// task->m_next is the dependence or zero
// Postcondition:
// task->m_next is linked list membership
KOKKOS_FUNCTION
void schedule( task_root_type * const );
KOKKOS_FUNCTION void schedule_runnable( task_root_type * const );
KOKKOS_FUNCTION void schedule_aggregate( task_root_type * const );
// Reschedule a task
// Precondition:
@ -178,7 +175,7 @@ private:
, task_root_type * const );
KOKKOS_FUNCTION
static task_root_type * pop_task( task_root_type * volatile * const );
static task_root_type * pop_ready_task( task_root_type * volatile * const );
KOKKOS_FUNCTION static
void decrement( task_root_type * task );
@ -368,6 +365,7 @@ public:
int16_t m_task_type ; ///< Type of task
int16_t m_priority ; ///< Priority of runnable task
TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
@ -375,17 +373,43 @@ public:
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
// Constructor for a runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase() noexcept
: m_apply(0)
, m_queue(0)
, m_wait(0)
, m_next(0)
, m_ref_count(0)
, m_alloc_size(0)
, m_dep_count(0)
, m_task_type( TaskSingle )
, m_priority( 1 /* TaskRegularPriority */ )
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, TaskBase * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
) noexcept
: m_apply( arg_apply )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( arg_dependence )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( 0 )
, m_task_type( arg_task_type )
, m_priority( arg_priority )
{}
// Constructor for an aggregate task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( queue_type * arg_queue
, int arg_ref_count
, int arg_alloc_size
, int arg_dep_count
) noexcept
: m_apply( 0 )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( 0 )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( arg_dep_count )
, m_task_type( Aggregate )
, m_priority( 0 )
{}
//----------------------------------------
@ -406,9 +430,13 @@ public:
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskBase* dep )
{
// Precondition: lock == m_next
TaskBase * const lock = (TaskBase *) LockTag ;
// Assign dependence to m_next. It will be processed in the subsequent
// call to schedule. Error if the dependence is reset.
if ( 0 != Kokkos::atomic_exchange( & m_next, dep ) ) {
if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
}
@ -431,8 +459,13 @@ class TaskBase< ExecSpace , ResultType , void >
{
private:
static_assert( sizeof(TaskBase<ExecSpace,void,void>) == 48 , "" );
using root_type = TaskBase<ExecSpace,void,void> ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
static_assert( sizeof(root_type) == 48 , "" );
TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
@ -444,9 +477,24 @@ public:
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
TaskBase()
: TaskBase< ExecSpace , void , void >()
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
)
: root_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, m_result()
{}
@ -471,11 +519,14 @@ private:
public:
using root_type = TaskBase< ExecSpace , void , void > ;
using base_type = TaskBase< ExecSpace , ResultType , void > ;
using member_type = TaskExec< ExecSpace > ;
using functor_type = FunctorType ;
using result_type = ResultType ;
using root_type = TaskBase< ExecSpace , void , void > ;
using base_type = TaskBase< ExecSpace , ResultType , void > ;
using specialization = TaskQueueSpecialization< ExecSpace > ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
using member_type = typename specialization::member_type ;
using functor_type = FunctorType ;
using result_type = ResultType ;
template< typename Type >
KOKKOS_INLINE_FUNCTION static
@ -522,13 +573,30 @@ public:
if ( 0 == member->team_rank() && !(task->requested_respawn()) ) {
// Did not respawn, destroy the functor to free memory.
static_cast<functor_type*>(task)->~functor_type();
// Cannot destroy the task until its dependences have been processed.
// Cannot destroy and deallocate the task until its dependences
// have been processed.
}
}
// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
TaskBase( functor_type const & arg_functor )
: base_type()
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
, FunctorType && arg_functor
)
: base_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, functor_type( arg_functor )
{}

View File

@ -170,6 +170,7 @@ bool TaskQueue< ExecSpace >::push_task
)
{
// Push task into a concurrently pushed and popped queue.
// The queue can be either a ready task queue or a waiting task queue.
// The queue is a linked list where 'task->m_next' form the links.
// Fail the push attempt if the queue is locked;
// otherwise retry until the push succeeds.
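A generic standalone sketch of that push protocol (not Kokkos code): the head pointer doubles as a lock word, a push is refused while the head holds the lock sentinel, and the caller retries until the compare-and-swap succeeds.

#include <atomic>
#include <cstdint>

struct Node { Node * m_next ; };

static Node * const Lock = reinterpret_cast< Node * >( ~std::uintptr_t(0) );

inline bool try_push( std::atomic< Node * > & queue , Node * const task )
{
  Node * old_head = queue.load( std::memory_order_relaxed );
  if ( old_head == Lock ) return false ;             // queue is locked: fail this attempt
  task->m_next = old_head ;                          // link before publishing
  return queue.compare_exchange_weak( old_head , task ,
                                      std::memory_order_release ,
                                      std::memory_order_relaxed );
}
// Caller: while ( ! try_push( queue , task ) ) { /* retry */ }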
@ -227,13 +228,12 @@ bool TaskQueue< ExecSpace >::push_task
template< typename ExecSpace >
KOKKOS_FUNCTION
typename TaskQueue< ExecSpace >::task_root_type *
TaskQueue< ExecSpace >::pop_task
TaskQueue< ExecSpace >::pop_ready_task
( TaskQueue< ExecSpace >::task_root_type * volatile * const queue )
{
// Pop task from a concurrently pushed and popped queue.
// Pop task from a concurrently pushed and popped ready task queue.
// The queue is a linked list where 'task->m_next' form the links.
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
@ -252,85 +252,201 @@ TaskQueue< ExecSpace >::pop_task
// (1) lock, (2) end, or (3) a valid task.
// Thus zero will never appear in the queue.
//
// If queue is locked then just read by guaranteeing
// the CAS will fail.
// If queue is locked then just read by guaranteeing the CAS will fail.
if ( lock == task ) task = 0 ;
task_root_type * const x = task ;
task = Kokkos::atomic_compare_exchange(queue,task,lock);
task = Kokkos::atomic_compare_exchange(queue,x,lock);
if ( x == task ) break ; // CAS succeeded and queue is locked
}
if ( x == task ) {
// CAS succeeded and queue is locked
//
// This thread has locked the queue and removed 'task' from the queue.
// Extract the next entry of the queue from 'task->m_next'
// and mark 'task' as popped from a queue by setting
// 'task->m_next = lock'.
//
// Place the next entry in the head of the queue,
// which also unlocks the queue.
//
// This thread has exclusive access to
// the queue and the popped task's m_next.
if ( end != task ) {
*queue = task->m_next ; task->m_next = lock ;
// This thread has locked the queue and removed 'task' from the queue.
// Extract the next entry of the queue from 'task->m_next'
// and mark 'task' as popped from a queue by setting
// 'task->m_next = lock'.
Kokkos::memory_fence();
task_root_type * const next =
Kokkos::atomic_exchange( & task->m_next , lock );
#if 0
printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
, uintptr_t(queue)
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
, int(task->m_task_type)
, int(task->m_priority)
, int(task->m_ref_count) );
#endif
// Place the next entry in the head of the queue,
// which also unlocks the queue.
task_root_type * const unlock =
Kokkos::atomic_exchange( queue , next );
if ( next == zero || next == lock || lock != unlock ) {
Kokkos::abort("TaskQueue::pop_task ERROR");
return task ;
}
}
#if 0
if ( end != task ) {
printf( "pop_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
, uintptr_t(queue)
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
, int(task->m_task_type)
, int(task->m_priority)
, int(task->m_ref_count) );
}
#endif
return task ;
return end ;
}
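// Illustrative summary (not part of this commit) of the pop protocol above,
// written as a linear sequence for one successful pop:
//
//   1. task = *queue ; if task == LockTag, treat it as 0 so the CAS must fail.
//   2. CAS( queue , task , LockTag )             // success: queue is now locked
//   3. next = atomic_exchange( & task->m_next , LockTag )
//                                                // mark 'task' as popped
//   4. atomic_exchange( queue , next )           // publish new head, unlock queue
//
// Steps 3 and 4 run while this thread holds the queue lock, so no other
// thread can observe the queue between the exchange of m_next and the
// restoration of the head.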
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::schedule
void TaskQueue< ExecSpace >::schedule_runnable
( TaskQueue< ExecSpace >::task_root_type * const task )
{
// Schedule a runnable or when_all task upon construction / spawn
// Schedule a runnable task upon construction / spawn
// and upon completion of other tasks that 'task' is waiting on.
// Precondition on runnable task state:
// task is either constructing or executing
//
// Precondition:
// - called by a single thread for the input task
// - calling thread has exclusive access to the task
// - task is not a member of a queue
// - if runnable then task is either constructing or respawning
//
// Constructing state:
// task->m_wait == 0
// task->m_next == dependence
// Executing-respawn state:
// task->m_wait == head of linked list
// task->m_next == dependence
// task->m_next == dependence or 0
// Respawn state:
// task->m_wait == head of linked list: 'end' or valid task
// task->m_next == dependence or 0
//
// Task state transition:
// Constructing -> Waiting
// Executing-respawn -> Waiting
// Constructing -> Waiting
// Respawn -> Waiting
//
// Postcondition on task state:
// task->m_wait == head of linked list
// task->m_next == member of linked list
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)
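// Illustrative summary (not part of this commit): the branches below decode
// the task state from m_wait alone.
//
//   task->m_wait == 0        -> Constructing      (first schedule)
//   task->m_wait == LockTag  -> Complete          (scheduling is an error)
//   task->m_wait == other    -> Executing-respawn (wait queue already exists)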
#if 0
printf( "schedule( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
, task->m_task_type
, task->m_priority
, task->m_ref_count );
#endif
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
bool respawn = false ;
//----------------------------------------
if ( zero == task->m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread
task->m_wait = end ;
// Task in Waiting state
}
else if ( lock != task->m_wait ) {
// Task in Executing state with Respawn request
// - Update dependence
// - Transition to Waiting state
respawn = true ;
}
else {
// Task in Complete state
Kokkos::abort("TaskQueue::schedule_runnable ERROR: task is complete");
}
//----------------------------------------
// Scheduling a runnable task which may have a dependency 'dep'.
// Extract dependence, if any, from task->m_next.
// If 'dep' is not null then attempt to push 'task'
// into the wait queue of 'dep'.
// If the push succeeds then 'task' may be
// processed or executed by another thread at any time.
// If the push fails then 'dep' is complete and 'task'
// is ready to execute.
// Exclusive access so don't need an atomic exchange
// task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
task_root_type * dep = task->m_next ; task->m_next = zero ;
const bool is_ready =
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
if ( ( 0 != dep ) && respawn ) {
// Reference count for dep was incremented when
// respawn assigned dependency to task->m_next
// so that if dep completed prior to the
// above push_task dep would not be destroyed.
// dep reference count can now be decremented,
// which may deallocate the task.
TaskQueue::assign( & dep , (task_root_type *)0 );
}
if ( is_ready ) {
// No dependence or 'dep' is complete so push task into ready queue.
// Increment the ready count before pushing into ready queue
// to track number of ready + executing tasks.
// The ready count will be decremented when the task is complete.
Kokkos::atomic_increment( & m_ready_count );
task_root_type * volatile * const ready_queue =
& m_ready[ task->m_priority ][ task->m_task_type ];
// A push_task fails if the ready queue is locked.
// A ready queue is only locked during a push or pop;
// i.e., it is never permanently locked.
// Retry push to ready queue until it succeeds.
// When the push succeeds then 'task' may be
// processed or executed by another thread at any time.
while ( ! push_task( ready_queue , task ) );
}
//----------------------------------------
// Postcondition:
// - A runnable 'task' was pushed into a wait or ready queue.
// - Concurrent execution may have already popped 'task'
// from a queue and processed it as appropriate.
}
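// Illustrative note (not part of this commit): the is_ready test above relies
// on the invariant that a task's m_wait is set to LockTag exactly when the
// task completes. A hypothetical helper making that explicit:
//
//   bool dep_is_complete_sketch( task_root_type * const dep )
//   { return ((task_root_type *) task_root_type::LockTag) == dep->m_wait ; }
//
// push_task( & dep->m_wait , task ) fails only when dep->m_wait holds
// LockTag, i.e. only when 'dep' is already complete, so a failed push means
// 'task' has no outstanding dependence and can go straight to a ready queue.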
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::schedule_aggregate
( TaskQueue< ExecSpace >::task_root_type * const task )
{
// Schedule an aggregate task upon construction
// and upon completion of other tasks that 'task' is waiting on.
//
// Precondition:
// - called by a single thread for the input task
// - calling thread has exclusive access to the task
// - task is not a member of a queue
//
// Constructing state:
// task->m_wait == 0
// task->m_next == dependence or 0
//
// Task state transition:
// Constructing -> Waiting
//
// Postcondition on task state:
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)
#if 0
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
@ -344,134 +460,85 @@ void TaskQueue< ExecSpace >::schedule
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
//----------------------------------------
{
// If Constructing then task->m_wait == 0
// Change to waiting by task->m_wait = EndTag
task_root_type * const init =
Kokkos::atomic_compare_exchange( & task->m_wait , zero , end );
if ( zero == task->m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread
// Precondition
if ( lock == init ) {
Kokkos::abort("TaskQueue::schedule ERROR: task is complete");
}
// if ( init == 0 ) Constructing -> Waiting
// else Executing-Respawn -> Waiting
task->m_wait = end ;
// Task in Waiting state
}
else if ( lock == task->m_wait ) {
// Task in Complete state
Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete");
}
//----------------------------------------
// Scheduling a 'when_all' task with multiple dependences.
// This scheduling may be called when the 'when_all' is
// (1) created or
// (2) being removed from a completed task's wait list.
if ( task_root_type::Aggregate != task->m_task_type ) {
task_root_type ** const aggr = task->aggregate_dependences();
// Scheduling a runnable task which may have a dependency 'dep'.
// Extract dependence, if any, from task->m_next.
// If 'dep' is not null then attempt to push 'task'
// into the wait queue of 'dep'.
// If the push succeeds then 'task' may be
// processed or executed by another thread at any time.
// If the push fails then 'dep' is complete and 'task'
// is ready to execute.
// Assume the 'when_all' is complete until a dependence is
// found that is not complete.
task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
bool is_complete = true ;
const bool is_ready =
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
// Reference count for dep was incremented when assigned
// to task->m_next so that if it completed prior to the
// above push_task dep would not be destroyed.
// dep reference count can now be decremented,
// which may deallocate the task.
TaskQueue::assign( & dep , (task_root_type *)0 );
--i ;
if ( is_ready ) {
// Loop dependences looking for an incomplete task.
// Add this task to the incomplete task's wait queue.
// No dependence or 'dep' is complete so push task into ready queue.
// Increment the ready count before pushing into ready queue
// to track number of ready + executing tasks.
// The ready count will be decremented when the task is complete.
// Remove a task 'x' from the dependence list.
// The reference count of 'x' was incremented when
// it was assigned into the dependence list.
Kokkos::atomic_increment( & m_ready_count );
// Exclusive access so don't need an atomic exchange
// task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
task_root_type * x = aggr[i] ; aggr[i] = zero ;
task_root_type * volatile * const queue =
& m_ready[ task->m_priority ][ task->m_task_type ];
if ( x ) {
// A push_task fails if the ready queue is locked.
// A ready queue is only locked during a push or pop;
// i.e., it is never permanently locked.
// Retry push to ready queue until it succeeds.
// When the push succeeds then 'task' may be
// processed or executed by another thread at any time.
// If x->m_wait is not locked then push succeeds
// and the aggregate is not complete.
// If the push succeeds then this when_all 'task' may be
// processed by another thread at any time.
// For example, 'x' may be completed by another
// thread and then re-schedule this when_all 'task'.
while ( ! push_task( queue , task ) );
is_complete = ! push_task( & x->m_wait , task );
// Decrement reference count which had been incremented
// when 'x' was added to the dependence list.
TaskQueue::assign( & x , zero );
}
}
//----------------------------------------
else {
// Scheduling a 'when_all' task with multiple dependences.
// This scheduling may be called when the 'when_all' is
// (1) created or
// (2) being removed from a completed task's wait list.
task_root_type ** const aggr = task->aggregate_dependences();
if ( is_complete ) {
// The when_all 'task' was not added to a wait queue because
// all dependences were complete so this aggregate is complete.
// Complete the when_all 'task' to schedule other tasks
// that are waiting for the when_all 'task' to complete.
// Assume the 'when_all' is complete until a dependence is
// found that is not complete.
task->m_next = lock ;
bool is_complete = true ;
complete( task );
for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
--i ;
// Loop dependences looking for an incomplete task.
// Add this task to the incomplete task's wait queue.
// Remove a task 'x' from the dependence list.
// The reference count of 'x' was incremented when
// it was assigned into the dependence list.
task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
if ( x ) {
// If x->m_wait is not locked then push succeeds
// and the aggregate is not complete.
// If the push succeeds then this when_all 'task' may be
// processed by another thread at any time.
// For example, 'x' may be completed by another
// thread and then re-schedule this when_all 'task'.
is_complete = ! push_task( & x->m_wait , task );
// Decrement reference count which had been incremented
// when 'x' was added to the dependence list.
TaskQueue::assign( & x , zero );
}
}
if ( is_complete ) {
// The when_all 'task' was not added to a wait queue because
// all dependences were complete so this aggregate is complete.
// Complete the when_all 'task' to schedule other tasks
// that are waiting for the when_all 'task' to complete.
task->m_next = lock ;
complete( task );
// '*task' may have been deleted upon completion
}
// '*task' may have been deleted upon completion
}
//----------------------------------------
// Postcondition:
// A runnable 'task' was pushed into a wait or ready queue.
// An aggregate 'task' was either pushed to a wait queue
// or completed.
// Concurrent execution may have already popped 'task'
// from a queue and processed it as appropriate.
// - An aggregate 'task' was either pushed to a wait queue or completed.
// - Concurrent execution may have already popped 'task'
// from a queue and processed it as appropriate.
}
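// Worked example (not part of this commit): a when_all task with three
// dependences d2, d1, d0 stored in aggregate_dependences().
//
//   - schedule_aggregate scans i = 2, 1, 0. Suppose d2 is already complete:
//     the push into d2->m_wait fails, d2's slot is cleared, and the scan
//     continues with d1.
//   - Suppose d1 is still running: the push into d1->m_wait succeeds, the
//     scan stops, and the when_all now waits on d1.
//   - When d1 completes, its wait queue is drained and schedule_aggregate is
//     called again on the when_all; already-cleared slots are skipped and the
//     scan proceeds to d0.
//   - Once every slot has been cleared with all pushes failing, is_complete
//     remains true, m_next is set to LockTag, and complete() propagates to
//     any tasks waiting on the when_all itself.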
//----------------------------------------------------------------------------
@ -529,7 +596,7 @@ void TaskQueue< ExecSpace >::complete
// If a runnable task has finished executing and requested respawn,
// schedule the task for subsequent execution.
schedule( task );
schedule_runnable( task );
}
//----------------------------------------
else {
@ -556,18 +623,22 @@ void TaskQueue< ExecSpace >::complete
TaskQueue::assign( & task , zero );
// This thread has exclusive access to the wait list so
// the concurrency-safe pop_task function is not needed.
// the concurrency-safe pop_ready_task function is not needed.
// Schedule the tasks that have been waiting on the input 'task',
// which may have been deleted.
while ( x != end ) {
// Have exclusive access to 'x' until it is scheduled
// Set x->m_next = zero <= no dependence, not a respawn
// Set x->m_next = zero <= no dependence
task_root_type * const next = x->m_next ; x->m_next = 0 ;
task_root_type * const next =
(task_root_type *) Kokkos::atomic_exchange( & x->m_next , zero );
schedule( x );
if ( task_root_type::Aggregate != x->m_task_type ) {
schedule_runnable( x );
}
else {
schedule_aggregate( x );
}
x = next ;
}

View File

@ -45,6 +45,7 @@
#define KOKKOS_CORE_IMPL_UTILITIES_HPP
#include <Kokkos_Macros.hpp>
#include <stdint.h>
#include <type_traits>
//----------------------------------------------------------------------------

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,52 +36,144 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_BitOps.hpp>
/*--------------------------------------------------------------------------*/
#if ( KOKKOS_ENABLE_ASM )
#if defined( __arm__ ) || defined( __aarch64__ )
/* No-operation instruction to idle the thread. */
#define YIELD asm volatile("nop")
#if !defined( _WIN32 )
#if defined( KOKKOS_ENABLE_ASM )
#if defined( __arm__ ) || defined( __aarch64__ )
/* No-operation instruction to idle the thread. */
#define KOKKOS_INTERNAL_PAUSE
#else
/* Pause instruction to prevent excess processor bus usage */
#define KOKKOS_INTERNAL_PAUSE asm volatile("pause\n":::"memory")
#endif
#define KOKKOS_INTERNAL_NOP2 asm volatile("nop\n" "nop\n")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0u: KOKKOS_INTERNAL_NOP2; break;
case 1u: KOKKOS_INTERNAL_NOP4; break;
case 2u: KOKKOS_INTERNAL_NOP8; break;
case 3u: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
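// Worked example (not part of this commit): the backoff grows roughly
// logarithmically with the iteration count i, since the case label is
// bit_scan_reverse( (i >> 2) + 1 ):
//
//   i =  0 ..  3   -> (i>>2)+1 = 1        -> case 0  -> 2  nops
//   i =  4 .. 11   -> (i>>2)+1 = 2 .. 3   -> case 1  -> 4  nops
//   i = 12 .. 27   -> (i>>2)+1 = 4 .. 7   -> case 2  -> 8  nops
//   i = 28 .. 59   -> (i>>2)+1 = 8 .. 15  -> case 3  -> 16 nops
//   i >= 60        -> (i>>2)+1 >= 16      -> default -> 32 nops
//
// followed in every case by KOKKOS_INTERNAL_PAUSE.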
#else
/* Pause instruction to prevent excess processor bus usage */
#define YIELD asm volatile("pause\n":::"memory")
#include <sched.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
sched_yield();
}
}
#endif
#else // defined( _WIN32 )
#if defined ( KOKKOS_ENABLE_WINTHREAD )
#include <process.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
Sleep(0);
}
}
#elif defined( _MSC_VER )
#define NOMINMAX
#include <winsock2.h>
#include <windows.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
YieldProcessor();
}
}
#else
#define KOKKOS_INTERNAL_PAUSE __asm__ __volatile__("pause\n":::"memory")
#define KOKKOS_INTERNAL_NOP2 __asm__ __volatile__("nop\n" "nop")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0: KOKKOS_INTERNAL_NOP2; break;
case 1: KOKKOS_INTERNAL_NOP4; break;
case 2: KOKKOS_INTERNAL_NOP8; break;
case 3: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
#endif
#elif defined ( KOKKOS_ENABLE_WINTHREAD )
#include <process.h>
#define YIELD Sleep(0)
#elif defined ( _WIN32) && defined (_MSC_VER)
/* Windows w/ Visual Studio */
#define NOMINMAX
#include <winsock2.h>
#include <windows.h>
#define YIELD YieldProcessor();
#elif defined ( _WIN32 )
/* Windows w/ Intel*/
#define YIELD __asm__ __volatile__("pause\n":::"memory")
#else
#include <sched.h>
#define YIELD sched_yield()
#endif
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void spinwait( volatile int & flag , const int value )
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
YIELD ;
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
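// Illustrative usage sketch (not part of this commit): a consumer thread
// blocks while a shared flag still holds its initial value and resumes once
// a producer publishes a new one.
//
//   volatile int32_t flag = 0 ;
//
//   // consumer thread:
//   Kokkos::Impl::spinwait_while_equal( flag , 0 );   // returns once flag != 0
//
//   // producer thread:
//   flag = 1 ;                                        // releases the consumer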
#endif
} /* namespace Impl */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -47,14 +47,30 @@
#include <Kokkos_Macros.hpp>
#include <cstdint>
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void spinwait( volatile int & flag , const int value );
void spinwait_while_equal( volatile int32_t & flag , const int32_t value );
void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
#else
KOKKOS_INLINE_FUNCTION
void spinwait( volatile int & , const int ) {}
void spinwait_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
#endif
} /* namespace Impl */