Updating Kokkos lib to v2.03.05

2017-06-08 10:42:08 -06:00
parent 286d4f2743
commit 5c1d17d1c0
474 changed files with 50972 additions and 10897 deletions
--- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
+++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
@ -44,14 +44,12 @@
 #ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
 #define KOKKOS_CUDA_EXP_ITERATE_TILE_HPP

+#include <Kokkos_Macros.hpp>
+#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
+
 #include <iostream>
 #include <algorithm>
-#include <stdio.h>
-
-#include <Kokkos_Macros.hpp>
-
-/* only compile this file if CUDA is enabled for Kokkos */
-#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
+#include <cstdio>

 #include <utility>

@ -1298,3 +1296,4 @@ protected:

 #endif
 #endif
+
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -45,11 +45,10 @@
 #define KOKKOS_CUDAEXEC_HPP

 #include <Kokkos_Macros.hpp>
-
-/* only compile this file if CUDA is enabled for Kokkos */
 #ifdef KOKKOS_ENABLE_CUDA

 #include <string>
+#include <cstdint>
 #include <Kokkos_Parallel.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <Cuda/Kokkos_Cuda_abort.hpp>
@ -99,6 +98,8 @@ CudaSpace::size_type cuda_internal_maximum_warp_count();
 CudaSpace::size_type cuda_internal_maximum_grid_count();
 CudaSpace::size_type cuda_internal_maximum_shared_words();

+CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();
+
 CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
 CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
 CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
@ -146,7 +147,7 @@ Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;

 namespace Kokkos {
 namespace Impl {
-  void* cuda_resize_scratch_space(size_t bytes, bool force_shrink = false);
+  void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
 }
 }

@ -319,3 +320,4 @@ struct CudaParallelLaunch< DriverType , false > {
 #endif /* defined( __CUDACC__ ) */
 #endif /* defined( KOKKOS_ENABLE_CUDA ) */
 #endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
+
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@ -41,16 +41,15 @@
 //@HEADER
 */

-#include <stdlib.h>
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <cstdlib>
 #include <iostream>
 #include <sstream>
 #include <stdexcept>
 #include <algorithm>
 #include <atomic>
-#include <Kokkos_Macros.hpp>
-
-/* only compile this file if CUDA is enabled for Kokkos */
-#ifdef KOKKOS_ENABLE_CUDA

 #include <Kokkos_Core.hpp>
 #include <Kokkos_Cuda.hpp>
@ -890,9 +889,9 @@ void init_lock_arrays_cuda_space() {
  }
 }

-void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
+void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
  static void* ptr = NULL;
-  static size_t current_size = 0;
+  static std::int64_t current_size = 0;
  if(current_size == 0) {
    current_size = bytes;
    ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
@ -911,5 +910,7 @@ void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {

 }
 }
+#else
+// Empty placeholder compiled only when KOKKOS_ENABLE_CUDA is not defined.
+// NOTE(review): judging by its name it exists so this translation unit still
+// defines a symbol and does not trigger a link error -- confirm.
+void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
 #endif // KOKKOS_ENABLE_CUDA

--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -45,8 +45,6 @@
 #define KOKKOS_CUDA_ALLOCATION_TRACKING_HPP

 #include <Kokkos_Macros.hpp>
-
-/* only compile this file if CUDA is enabled for Kokkos */
 #ifdef KOKKOS_ENABLE_CUDA

 #include <impl/Kokkos_Traits.hpp>
@ -75,7 +73,7 @@ shared_allocation_record( Kokkos::CudaSpace const & arg_space
  new( functor ) DestructFunctor( arg_destruct );

  record->m_destruct_functor = & shared_allocation_destroy< DestructFunctor > ;
-  
+
  return record ;
 }

--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -45,8 +45,6 @@
 #define KOKKOS_CUDA_ERROR_HPP

 #include <Kokkos_Macros.hpp>
-
-/* only compile this file if CUDA is enabled for Kokkos */
 #ifdef KOKKOS_ENABLE_CUDA

 namespace Kokkos { namespace Impl {
@ -67,3 +65,4 @@ inline void cuda_internal_safe_call( cudaError e , const char * name, const char

 #endif //KOKKOS_ENABLE_CUDA
 #endif //KOKKOS_CUDA_ERROR_HPP
+
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
@ -44,11 +44,11 @@
 /*--------------------------------------------------------------------------*/
 /* Kokkos interfaces */

-#include <Kokkos_Core.hpp>
-
-/* only compile this file if CUDA is enabled for Kokkos */
+#include <Kokkos_Macros.hpp>
 #ifdef KOKKOS_ENABLE_CUDA

+#include <Kokkos_Core.hpp>
+
 #include <Cuda/Kokkos_Cuda_Error.hpp>
 #include <Cuda/Kokkos_Cuda_Internal.hpp>
 #include <impl/Kokkos_Error.hpp>
@ -56,7 +56,7 @@

 /*--------------------------------------------------------------------------*/
 /* Standard 'C' libraries */
-#include <stdlib.h>
+#include <cstdlib>

 /* Standard 'C++' libraries */
 #include <vector>
@ -404,9 +404,23 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
    // Query what compute capability architecture a kernel executes:
    m_cudaArch = cuda_kernel_arch();

-    if ( m_cudaArch != cudaProp.major * 100 + cudaProp.minor * 10 ) {
+    int compiled_major = m_cudaArch / 100;
+    int compiled_minor = ( m_cudaArch % 100 ) / 10;
+
+    if ( compiled_major < 5 && cudaProp.major >= 5 ) {
+      std::stringstream ss;
+      ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for compute capability "
+         << compiled_major << "." << compiled_minor
+         << " (< 5.0) on device with compute capability "
+         << cudaProp.major << "." << cudaProp.minor
+         << " (>=5.0), this would give incorrect results!"
+         << std::endl ;
+      std::string msg = ss.str();
+      Kokkos::abort( msg.c_str() );
+    }
+    if ( compiled_major != cudaProp.major || compiled_minor != cudaProp.minor ) {
      std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
-                << ( m_cudaArch / 100 ) << "." << ( ( m_cudaArch % 100 ) / 10 )
+                << compiled_major << "." << compiled_minor
                << " on device with compute capability "
                << cudaProp.major << "." << cudaProp.minor
                << " , this will likely reduce potential performance."
@ -661,6 +675,15 @@ void CudaInternal::finalize()
 Cuda::size_type cuda_internal_multiprocessor_count()
 { return CudaInternal::singleton().m_multiProcCount ; }

+// Returns the multiprocessor count recorded in the CudaInternal singleton
+// multiplied by the fixed per-multiprocessor resident block limit below.
+CudaSpace::size_type cuda_internal_maximum_concurrent_block_count()
+{
+  // Limit for compute capability 5.0 through 6.2.
+  // NOTE(review): the constant is hard-coded; confirm it is also appropriate
+  // for devices outside that range before relying on the result there.
+  enum : int { max_resident_blocks_per_multiprocessor = 32 };
+
+  return CudaInternal::singleton().m_multiProcCount
+         * max_resident_blocks_per_multiprocessor ;
+}
+
 Cuda::size_type cuda_internal_maximum_warp_count()
 { return CudaInternal::singleton().m_maxWarpCount ; }

@ -772,8 +795,10 @@ void Cuda::fence()
  Kokkos::Impl::cuda_device_synchronize();
 }

+// Returns the constant string "Cuda".
+const char* Cuda::name()
+{
+  return "Cuda";
+}
+
 } // namespace Kokkos
-
+#else
+// Empty placeholder compiled only when KOKKOS_ENABLE_CUDA is not defined.
+// NOTE(review): judging by its name it exists so this translation unit still
+// defines a symbol and does not trigger a link error -- confirm.
+void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {}
 #endif // KOKKOS_ENABLE_CUDA
-//----------------------------------------------------------------------------

--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,19 +36,18 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */

 #ifndef KOKKOS_CUDA_INTERNAL_HPP
 #define KOKKOS_CUDA_INTERNAL_HPP
-#include<iostream>
-#include <Kokkos_Macros.hpp>

-/* only compile this file if CUDA is enabled for Kokkos */
+#include <Kokkos_Macros.hpp>
 #ifdef KOKKOS_ENABLE_CUDA

+#include<iostream>
 #include <Cuda/Kokkos_Cuda_Error.hpp>

 namespace Kokkos { namespace Impl {
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -45,8 +45,6 @@
 #define KOKKOS_CUDA_REDUCESCAN_HPP

 #include <Kokkos_Macros.hpp>
-
-/* only compile this file if CUDA is enabled for Kokkos */
 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )

 #include <utility>
@ -63,6 +61,7 @@ namespace Kokkos {
 namespace Impl {

 //----------------------------------------------------------------------------
+// Shuffle operations require input to be a register (stack) variable

 template< typename T >
 __device__ inline
@ -140,77 +139,6 @@ void cuda_shfl_up( T & out , T const & in , int delta ,
  }
 }

-//----------------------------------------------------------------------------
-/** \brief  Reduce within a warp over blockDim.x, the "vector" dimension.
- *
- *  This will be called within a nested, intra-team parallel operation.
- *  Use shuffle operations to avoid conflicts with shared memory usage.
- *
- *  Requires:
- *    blockDim.x is power of 2
- *    blockDim.x <= 32 (one warp)
- *
- *  Cannot use "butterfly" pattern because floating point
- *  addition is non-associative.  Therefore, must broadcast
- *  the final result.
- */
-template< class Reducer >
-__device__ inline
-void cuda_intra_warp_vector_reduce( Reducer const & reducer )
-{
-  static_assert(
-    std::is_reference< typename Reducer::reference_type >::value , "" );
-
-  if ( 1 < blockDim.x ) {
-
-    typename Reducer::value_type tmp ;
-
-    for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
-
-      cuda_shfl_down( tmp , reducer.reference() , i , blockDim.x );
-
-      if ( threadIdx.x < i ) { reducer.join( reducer.data() , & tmp ); }
-    }
-
-    // Broadcast from root "lane" to all other "lanes"
-
-    cuda_shfl( reducer.reference() , reducer.reference() , 0 , blockDim.x );
-  }
-}
-
-/** \brief  Inclusive scan over blockDim.x, the "vector" dimension.
- *
- *  This will be called within a nested, intra-team parallel operation.
- *  Use shuffle operations to avoid conflicts with shared memory usage.
- *
- *  Algorithm is concurrent bottom-up reductions in triangular pattern
- *  where each CUDA thread is the root of a reduction tree from the
- *  zeroth CUDA thread to itself.
- *
- *  Requires:
- *    blockDim.x is power of 2
- *    blockDim.x <= 32 (one warp)
- */
-template< typename ValueType >
-__device__ inline
-void cuda_intra_warp_vector_inclusive_scan( ValueType & local )
-{
-  ValueType tmp ;
-
-  // Bottom up:
-  //   [t] += [t-1] if t >= 1
-  //   [t] += [t-2] if t >= 2
-  //   [t] += [t-4] if t >= 4
-  // ...
-
-  for ( int i = 1 ; i < blockDim.x ; i <<= 1 ) {
-
-    cuda_shfl_up( tmp , local , i , blockDim.x );
-
-    if ( i <= threadIdx.x ) { local += tmp ; }
-  }
-}
-
 //----------------------------------------------------------------------------
 /*
 *  Algorithmic constraints:
@ -247,12 +175,12 @@ inline void cuda_inter_warp_reduction( ValueType& value,

  #define STEP_WIDTH 4
  // Depending on the ValueType _shared__ memory must be aligned up to 8byte boundaries
-  // The reason not to use ValueType directly is that for types with constructors it 
+  // The reason not to use ValueType directly is that for types with constructors it
  // could lead to race conditions
  __shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
  ValueType* result = (ValueType*) & sh_result;
-  const unsigned step = 32 / blockDim.x;
-  unsigned shift = STEP_WIDTH;
+  const int step = 32 / blockDim.x;
+  int shift = STEP_WIDTH;
  const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
  if(id < STEP_WIDTH ) {
    result[id] = value;
@ -297,7 +225,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
  //Do the intra-block reduction with shfl operations and static shared memory
  cuda_intra_block_reduction(value,join,max_active_thread);

-  const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+  const int id = threadIdx.y*blockDim.x + threadIdx.x;

  //One thread in the block writes block result to global scratch_memory
  if(id == 0 ) {
@ -329,35 +257,35 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT

      //Reduce all global values with splitting work over threads in one warp
      const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
-      for(int i=id; i<gridDim.x; i+=step_size) {
+      for(int i=id; i<(int)gridDim.x; i+=step_size) {
        value_type tmp = global[i];
        join(value, tmp);
      }

      //Perform shfl reductions within the warp only join if contribution is valid (allows gridDim.x non power of two and <32)
-      if (blockDim.x*blockDim.y > 1) {
+      if (int(blockDim.x*blockDim.y) > 1) {
        value_type tmp = Kokkos::shfl_down(value, 1,32);
-        if( id + 1 < gridDim.x )
+        if( id + 1 < int(gridDim.x) )
          join(value, tmp);
      }
-      if (blockDim.x*blockDim.y > 2) {
+      if (int(blockDim.x*blockDim.y) > 2) {
        value_type tmp = Kokkos::shfl_down(value, 2,32);
-        if( id + 2 < gridDim.x )
+        if( id + 2 < int(gridDim.x) )
          join(value, tmp);
      }
-      if (blockDim.x*blockDim.y > 4) {
+      if (int(blockDim.x*blockDim.y) > 4) {
        value_type tmp = Kokkos::shfl_down(value, 4,32);
-        if( id + 4 < gridDim.x )
+        if( id + 4 < int(gridDim.x) )
          join(value, tmp);
      }
-      if (blockDim.x*blockDim.y > 8) {
+      if (int(blockDim.x*blockDim.y) > 8) {
        value_type tmp = Kokkos::shfl_down(value, 8,32);
-        if( id + 8 < gridDim.x )
+        if( id + 8 < int(gridDim.x) )
          join(value, tmp);
      }
-      if (blockDim.x*blockDim.y > 16) {
+      if (int(blockDim.x*blockDim.y) > 16) {
        value_type tmp = Kokkos::shfl_down(value, 16,32);
-        if( id + 16 < gridDim.x )
+        if( id + 16 < int(gridDim.x) )
          join(value, tmp);
      }
    }
@ -370,6 +298,166 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
 #endif
 }

+// Reducer overload of the intra-warp reduction.
+//
+// Starts from reducer.reference(), then repeatedly fetches a partner value
+// with shfl_down at a distance of blockDim.x*shift lanes (shift = 1, 2, 4, ...
+// for as long as blockDim.x*shift < 32) and merges it via reducer.join().
+// Afterwards the value of lane 0 is fetched with shfl(result,0,32) by every
+// thread and stored back through reducer.reference().
+//
+// max_active_thread limits which threadIdx.y rows contribute a value; it
+// defaults to blockDim.y.
+template< class ReducerType >
+__device__ inline
+typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
+cuda_intra_warp_reduction( const ReducerType& reducer,
+                           const int max_active_thread = blockDim.y) {
+
+  typedef typename ReducerType::value_type ValueType;
+
+  // Distance between partners measured in threadIdx.y rows; doubled each step.
+  unsigned int shift = 1;
+
+  // Work on a local copy so the shuffle operates on a plain local variable.
+  ValueType result = reducer.reference();
+  //Reduce over values from threads with different threadIdx.y
+  while(blockDim.x * shift < 32 ) {
+    // Every thread executes the shuffle; only the join below is conditional.
+    const ValueType tmp = shfl_down(result, blockDim.x*shift,32u);
+    //Only join if upper thread is active (this allows non power of two for blockDim.y)
+    // NOTE(review): the left-hand side is unsigned while max_active_thread
+    // is int, so this is a mixed signed/unsigned comparison.
+    if(threadIdx.y + shift < max_active_thread)
+      reducer.join(result , tmp);
+    shift*=2;
+  }
+
+  // Take lane 0's accumulated value on every thread and publish it.
+  result = shfl(result,0,32);
+  reducer.reference() = result;
+}
+
+// Reducer overload of the inter-warp reduction.
+//
+// Combines the per-warp values held in reducer.reference() through a small
+// __shared__ staging buffer with STEP_WIDTH slots:
+//   1. threads with id < STEP_WIDTH store their value into slot id;
+//   2. in rounds of STEP_WIDTH, threads with id in [shift, shift+STEP_WIDTH)
+//      and threadIdx.x == 0 join their value into slot id % STEP_WIDTH;
+//   3. every thread reads slot 0, joins the remaining populated slots and
+//      writes the outcome back through reducer.reference().
+//
+// max_active_thread bounds the number of rounds and the number of slots
+// folded in step 3; it defaults to blockDim.y.
+template< class ReducerType >
+__device__ inline
+typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
+cuda_inter_warp_reduction( const ReducerType& reducer,
+                           const int max_active_thread = blockDim.y) {
+
+  typedef typename ReducerType::value_type ValueType;
+
+  // Number of staging slots. The macro is not #undef'd inside this function.
+  #define STEP_WIDTH 4
+  // Depending on the ValueType, __shared__ memory must be aligned up to 8-byte boundaries.
+  // The reason not to use ValueType directly is that for types with constructors it
+  // could lead to race conditions
+  __shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
+  ValueType* result = (ValueType*) & sh_result;
+  ValueType value = reducer.reference();
+  // Stride, in threadIdx.y rows, between the threads that take part here.
+  const int step = 32 / blockDim.x;
+  int shift = STEP_WIDTH;
+  // Threads whose threadIdx.y is a multiple of step get a compact index;
+  // all others get 65000, which fails the id tests below.
+  const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
+  if(id < STEP_WIDTH ) {
+    result[id] = value;
+  }
+  __syncthreads();
+  while (shift<=max_active_thread/step) {
+    if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) {
+      reducer.join(result[id%STEP_WIDTH],value);
+    }
+    // Barrier after each round, before the slots are joined into again or read.
+    __syncthreads();
+    shift+=STEP_WIDTH;
+  }
+
+
+  // Fold the populated slots into a single value on every thread.
+  value = result[0];
+  for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
+    reducer.join(value,result[i]);
+
+  reducer.reference() = value;
+}
+
+// Reducer overload of the intra-block reduction: runs the intra-warp
+// reduction followed by the inter-warp reduction on the same reducer,
+// forwarding max_active_thread (default blockDim.y) to both.
+template< class ReducerType >
+__device__ inline
+typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
+cuda_intra_block_reduction( const ReducerType& reducer,
+                            const int max_active_thread = blockDim.y) {
+  cuda_intra_warp_reduction(reducer,max_active_thread);
+  cuda_inter_warp_reduction(reducer,max_active_thread);
+}
+
+// Reducer overload of the inter-block reduction.
+//
+// Device path (__CUDA_ARCH__ defined):
+//   1. reduce within the block via cuda_intra_block_reduction;
+//   2. thread 0 of the block stores the block's value at index blockIdx.x of
+//      m_scratch_space (viewed as an array of value_type);
+//   3. thread 0 increments the counter at m_scratch_flags; the block that
+//      observes gridDim.x - 1 is the last one to arrive. Its first 32
+//      threads reset the counter, reload every per-block value and combine
+//      them with reducer.join() and warp shuffles.
+//
+// Parameters:
+//   reducer           supplies value_type, reference(), init() and join().
+//   m_scratch_space   global scratch holding one value_type per block.
+//   m_scratch_flags   global counter used to detect the last block.
+//   max_active_thread forwarded to the intra-block reduction
+//                     (default blockDim.y).
+//
+// Returns true only in the block that performed the final combination; the
+// host path always returns true.
+//
+// NOTE(review): the combined result ends up in the local variable `value`
+// and is not stored back through reducer.reference() by this function --
+// confirm how callers are expected to obtain it.
+template< class ReducerType>
+__device__ inline
+typename std::enable_if< Kokkos::is_reducer<ReducerType>::value , bool >::type
+cuda_inter_block_reduction( const ReducerType& reducer,
+                            Cuda::size_type * const m_scratch_space,
+                            Cuda::size_type * const m_scratch_flags,
+                            const int max_active_thread = blockDim.y) {
+#ifdef __CUDA_ARCH__
+  typedef typename ReducerType::value_type* pointer_type;
+  typedef typename ReducerType::value_type value_type;
+
+  //Do the intra-block reduction with shfl operations and static shared memory
+  cuda_intra_block_reduction(reducer,max_active_thread);
+
+  value_type value = reducer.reference();
+
+  const int id = threadIdx.y*blockDim.x + threadIdx.x;
+
+  //One thread in the block writes block result to global scratch_memory
+  if(id == 0 ) {
+    pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
+    *global = value;
+    // Make the partial result visible device-wide before this thread bumps
+    // the arrival counter below; otherwise the last block could read a
+    // stale slot.
+    __threadfence();
+  }
+
+  //One warp of last block performs inter block reduction through loading the block values from global scratch_memory
+  bool last_block = false;
+
+  __syncthreads();
+  if ( id < 32 ) {
+    Cuda::size_type count;
+
+    //Figure out whether this is the last block
+    if(id == 0)
+      count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
+    // Hand thread 0's counter value to the other threads of the warp.
+    count = Kokkos::shfl(count,0,32);
+
+    //Last block does the inter block reduction
+    if( count == gridDim.x - 1) {
+      //set flag back to zero
+      if(id == 0)
+        *m_scratch_flags = 0;
+      last_block = true;
+      // Start from the reducer's initial value before folding in the
+      // per-block partial results.
+      reducer.init(value);
+
+      pointer_type const volatile global = (pointer_type) m_scratch_space ;
+
+      //Reduce all global values with splitting work over threads in one warp
+      const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
+      for(int i=id; i<(int)gridDim.x; i+=step_size) {
+        value_type tmp = global[i];
+        reducer.join(value, tmp);
+      }
+
+      //Perform shfl reductions within the warp; only join if the contribution is valid (allows gridDim.x non power of two and <32)
+      if (int(blockDim.x*blockDim.y) > 1) {
+        value_type tmp = Kokkos::shfl_down(value, 1,32);
+        if( id + 1 < int(gridDim.x) )
+          reducer.join(value, tmp);
+      }
+      if (int(blockDim.x*blockDim.y) > 2) {
+        value_type tmp = Kokkos::shfl_down(value, 2,32);
+        if( id + 2 < int(gridDim.x) )
+          reducer.join(value, tmp);
+      }
+      if (int(blockDim.x*blockDim.y) > 4) {
+        value_type tmp = Kokkos::shfl_down(value, 4,32);
+        if( id + 4 < int(gridDim.x) )
+          reducer.join(value, tmp);
+      }
+      if (int(blockDim.x*blockDim.y) > 8) {
+        value_type tmp = Kokkos::shfl_down(value, 8,32);
+        if( id + 8 < int(gridDim.x) )
+          reducer.join(value, tmp);
+      }
+      if (int(blockDim.x*blockDim.y) > 16) {
+        value_type tmp = Kokkos::shfl_down(value, 16,32);
+        if( id + 16 < int(gridDim.x) )
+          reducer.join(value, tmp);
+      }
+    }
+  }
+
+  //The last block has in its thread=0 the global reduction value through "value"
+  return last_block;
+#else
+  return true;
+#endif
+}
+
 //----------------------------------------------------------------------------
 // See section B.17 of Cuda C Programming Guide Version 3.2
 // for discussion of
@ -529,11 +617,11 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType     & functor ,
    size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
    size_type * const global = global_data + word_count.value * block_id ;

-#if (__CUDA_ARCH__ < 500)
-    for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
-#else
-    for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
-#endif
+//#if (__CUDA_ARCH__ < 500)
+    for ( int i = int(threadIdx.y) ; i < int(word_count.value) ; i += int(blockDim.y) ) { global[i] = shared[i] ; }
+//#else
+//    for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
+//#endif

  }

--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,15 +36,16 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */

-#include <Kokkos_Core.hpp>
-
+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ENABLE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG )

+#include <Kokkos_Core.hpp>
+
 #include <impl/Kokkos_TaskQueue_impl.hpp>

 //----------------------------------------------------------------------------
@ -120,7 +121,7 @@ printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
      }

      if ( 0 == warp_lane ) {
-        queue->complete( task.ptr ); 
+        queue->complete( task.ptr );
      }
    }
  } while(1);
@ -157,7 +158,7 @@ printf("cuda_task_queue_execute before\n");
  // If not large enough then set the stack size, in bytes:
  //
  // CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
- 
+
  cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );

  CUDA_SAFE_CALL( cudaGetLastError() );
@ -173,7 +174,7 @@ printf("cuda_task_queue_execute after\n");
 }} /* namespace Kokkos::Impl */

 //----------------------------------------------------------------------------
-
+#else
+// Empty placeholder compiled only when KOKKOS_ENABLE_CUDA and
+// KOKKOS_ENABLE_TASKDAG are not both defined.
+// NOTE(review): judging by its name it exists so this translation unit still
+// defines a symbol and does not trigger a link error -- confirm.
+void KOKKOS_CORE_SRC_CUDA_KOKKOS_CUDA_TASK_PREVENT_LINK_ERROR() {}
 #endif /* #if defined( KOKKOS_ENABLE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG ) */

-
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@ -44,6 +44,7 @@
 #ifndef KOKKOS_IMPL_CUDA_TASK_HPP
 #define KOKKOS_IMPL_CUDA_TASK_HPP

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ENABLE_TASKDAG )

 //----------------------------------------------------------------------------
@ -325,7 +326,7 @@ ValueType shfl_warp_broadcast
  return Kokkos::shfl(val, src_lane, width);
 }

-// all-reduce across corresponding vector lanes between team members within warp
+/*// all-reduce across corresponding vector lanes between team members within warp
 // assume vec_length*team_size == warp_size
 // blockDim.x == vec_length == stride
 // blockDim.y == team_size
@ -351,7 +352,7 @@ void parallel_reduce
                          loop_boundaries.thread.team_size(),
                          blockDim.x);
  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
-}
+}*/

 // all-reduce across corresponding vector lanes between team members within warp
 // if no join() provided, use sum
@ -382,13 +383,36 @@ void parallel_reduce
  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
 }

+// parallel_reduce over a TeamThreadRange for TaskExec<Cuda>, reducer variant.
+//
+// Each thread accumulates lambda(i, result) over its share of
+// [loop_boundaries.start, loop_boundaries.end) into a local value that was
+// initialised with reducer.init(). The per-thread values are then combined
+// by strided_shfl_warp_reduction using reducer.join() as the join operation,
+// and the outcome is stored through reducer.reference().
+template< typename iType, class Lambda, typename ReducerType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   const ReducerType& reducer) {
+
+  typedef typename ReducerType::value_type ValueType;
+  //TODO what is the point of creating this temporary?
+  ValueType result = ValueType();
+  reducer.init(result);
+
+  // Thread-local accumulation over this thread's iterations.
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+
+  // Combine across team members; the stride passed is blockDim.x.
+  strided_shfl_warp_reduction(
+                          [&] (ValueType& val1, const ValueType& val2) { reducer.join(val1,val2); },
+                          result,
+                          loop_boundaries.thread.team_size(),
+                          blockDim.x);
+  // Each thread takes the value held by lane threadIdx.x of its warp.
+  reducer.reference() = shfl_warp_broadcast<ValueType>( result, threadIdx.x, Impl::CudaTraits::WarpSize );
+}
 // all-reduce within team members within warp
 // assume vec_length*team_size == warp_size
 // blockDim.x == vec_length == stride
 // blockDim.y == team_size
 // threadIdx.x == position in vec
 // threadIdx.y == member number
-template< typename iType, class Lambda, typename ValueType, class JoinType >
+/*template< typename iType, class Lambda, typename ValueType, class JoinType >
 KOKKOS_INLINE_FUNCTION
 void parallel_reduce
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
@ -404,7 +428,7 @@ void parallel_reduce

  multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x);
  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
-}
+}*/

 // all-reduce within team members within warp
 // if no join() provided, use sum
@ -436,6 +460,28 @@ void parallel_reduce
  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
 }

+// parallel_reduce over a ThreadVectorRange for TaskExec<Cuda>, reducer
+// variant.
+//
+// Each thread accumulates lambda(i, result) over its share of
+// [loop_boundaries.start, loop_boundaries.end) into a local value that was
+// initialised with reducer.init(). The per-thread values are then combined
+// by multi_shfl_warp_reduction over a width of blockDim.x using
+// reducer.join(), and the value of lane 0 within that width is stored
+// through reducer.reference().
+template< typename iType, class Lambda, typename ReducerType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   const ReducerType& reducer) {
+
+  typedef typename ReducerType::value_type ValueType;
+
+  ValueType result = ValueType();
+  reducer.init(result);
+
+  // Thread-local accumulation over this thread's iterations.
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+
+  // Combine across the vector lanes (width blockDim.x).
+  multi_shfl_warp_reduction(
+                          [&] (ValueType& val1, const ValueType& val2) { reducer.join(val1, val2); },
+                          result,
+                          blockDim.x);
+  // Every lane takes lane 0's value of its blockDim.x-wide group.
+  reducer.reference() = shfl_warp_broadcast<ValueType>( result, 0, blockDim.x );
+}
 // scan across corresponding vector lanes between team members within warp
 // assume vec_length*team_size == warp_size
 // blockDim.x == vec_length == stride
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
@ -0,0 +1,982 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_TEAM_HPP
+#define KOKKOS_CUDA_TEAM_HPP
+
+#include <iostream>
+#include <algorithm>
+#include <stdio.h>
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+
+#include <utility>
+#include <Kokkos_Parallel.hpp>
+
+#include <Cuda/Kokkos_CudaExec.hpp>
+#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <Kokkos_Vectorization.hpp>
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/**\brief  Stateless join functor that combines two values with operator+=.
+ *
+ *  CudaTeamMember::team_scan hands an instance of this functor to
+ *  cuda_intra_block_reduce_scan as the join operation.
+ */
+template< typename Type >
+struct CudaJoinFunctor {
+  using value_type = Type ;
+
+  /** \brief  Accumulate input into update ( update += input ). */
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    volatile const value_type & input )
+    {
+      update += input ;
+    }
+};
+
+/**\brief  Team member_type passed to TeamPolicy or TeamTask closures.
+ *
+ *  Cuda thread blocks for team closures are dimensioned as:
+ *    blockDim.x == number of "vector lanes" per "thread"
+ *    blockDim.y == number of "threads" per team
+ *    blockDim.z == number of teams in a block
+ *  where
+ *    A set of teams exactly fill a warp OR a team is the whole block
+ *      ( 0 == WarpSize % ( blockDim.x * blockDim.y ) )
+ *      OR
+ *      ( 1 == blockDim.z )
+ *
+ *  Thus when 1 < blockDim.z the team is warp-synchronous
+ *  and __syncthreads should not be called in team collectives.
+ *
+ *  When multiple teams are mapped onto a single block then the
+ *  total available shared memory must be partitioned among teams.
+ */
+class CudaTeamMember {
+private:
+
+  typedef Kokkos::Cuda                           execution_space ;
+  typedef execution_space::scratch_memory_space  scratch_memory_space ;
+
+  void                * m_team_reduce ;
+  scratch_memory_space  m_team_shared ;
+  int                   m_team_reduce_size ;
+  int                   m_league_rank ;
+  int                   m_league_size ;
+
+public:
+
+  /** \brief  Scratch memory handle of the team.
+   *
+   *  Returns m_team_shared after set_team_thread_mode(0,1,0), i.e. the
+   *  same call that team_scratch(0) makes.
+   */
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space & team_shmem() const
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
+
+  /** \brief  Scratch memory handle of the team for the given scratch level.
+   *
+   *  Returns m_team_shared after set_team_thread_mode(level,1,0).
+   *  NOTE(review): the trailing (1,0) presumably selects a single partition
+   *  shared by the whole team -- confirm against scratch_memory_space.
+   */
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space &
+    team_scratch(const int& level) const
+      { return m_team_shared.set_team_thread_mode(level,1,0) ; }
+
+  /** \brief  Scratch memory handle of the calling thread for the given level.
+   *
+   *  Returns m_team_shared after
+   *  set_team_thread_mode(level,team_size(),team_rank()).
+   *  NOTE(review): presumably this partitions the scratch space into
+   *  team_size() pieces and selects the caller's piece -- confirm against
+   *  scratch_memory_space.
+   */
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space &
+    thread_scratch(const int& level) const
+      { return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; }
+
+  // League rank and league size exactly as handed to the constructor.
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  // Rank of the calling thread within its team: threadIdx.y in device code,
+  // 0 when compiled for the host (no __CUDA_ARCH__).
+  KOKKOS_INLINE_FUNCTION int team_rank() const
+    {
+      #ifdef __CUDA_ARCH__
+      return threadIdx.y ;
+      #else
+      return 0;
+      #endif
+    }
+
+  // Number of threads in the team: blockDim.y in device code,
+  // 1 when compiled for the host (no __CUDA_ARCH__).
+  KOKKOS_INLINE_FUNCTION int team_size() const
+    {
+      #ifdef __CUDA_ARCH__
+      return blockDim.y ;
+      #else
+      return 1;
+      #endif
+    }
+
+  // Team-wide barrier.
+  // When the team is the whole block (blockDim.z == 1) this is a
+  // __syncthreads(); when several teams share a block only a
+  // __threadfence_block() is issued, because the team then fits inside a
+  // warp and __syncthreads() must not be called (see the class comment).
+  // Compiles to nothing on the host.
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+    {
+      #ifdef __CUDA_ARCH__
+      if ( 1 == blockDim.z ) __syncthreads();       // team == block
+      else                   __threadfence_block(); // team <= warp
+      #endif
+    }
+
+  //--------------------------------------------------------------------------
+
+  /** \brief  Broadcast val from team thread 'thread_id' to the whole team.
+   *
+   *  On return every caller's val holds the value that vector lane 0 of
+   *  thread 'thread_id' passed in.  Must be called by all threads of the
+   *  team, since both paths are collective.  No-op on the host.
+   *
+   *  team == block : the value is staged through the m_team_reduce buffer,
+   *                  bracketed by __syncthreads().
+   *  team <= warp  : the value is exchanged with cuda_shfl.
+   *
+   *  NOTE(review): the block path does not check that sizeof(ValueType)
+   *  fits in m_team_reduce_size -- confirm callers guarantee this.
+   */
+  template<class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast( ValueType & val, const int& thread_id) const
+    {
+      #ifdef __CUDA_ARCH__
+      if ( 1 == blockDim.z ) { // team == block
+        __syncthreads();
+        // Wait for shared data write until all threads arrive here
+        if ( threadIdx.x == 0 && threadIdx.y == thread_id ) {
+          *((ValueType*) m_team_reduce) = val ;
+        }
+        __syncthreads(); // Wait for shared data read until root thread writes
+        val = *((ValueType*) m_team_reduce);
+      }
+      else { // team <= warp
+        ValueType tmp( val ); // input might not be a register variable
+        // Source lane is vector lane 0 of thread 'thread_id'; the shuffle
+        // width is the number of CUDA threads in one team.
+        cuda_shfl( val, tmp, blockDim.x * thread_id, blockDim.x * blockDim.y );
+      }
+      #endif
+    }
+
+  //--------------------------------------------------------------------------
+  /**\brief  Reduction across a team
+   *
+   *  On entry reducer.reference() holds the calling thread's contribution;
+   *  on return it holds the team-wide result on the calling thread.
+   *  Contributions are combined with reducer.join().
+   *  Collective: must be called by every thread of the team.
+   *  Compiles to nothing on the host (no __CUDA_ARCH__).
+   *
+   *  Mapping of teams onto blocks:
+   *    blockDim.x  is "vector lanes"
+   *    blockDim.y  is team "threads"
+   *    blockDim.z  is number of teams per block
+   *
+   *  Requires:
+   *    blockDim.x is a power of two
+   *    blockDim.x <= CudaTraits::WarpSize
+   *    ( 0 == CudaTraits::WarpSize % ( blockDim.x * blockDim.y ) )
+   *      OR
+   *    ( 1 == blockDim.z )
+   */
+  template< typename ReducerType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< is_reducer< ReducerType >::value >::type
+  team_reduce( ReducerType const & reducer ) const noexcept
+    {
+      #ifdef __CUDA_ARCH__
+
+      typedef typename ReducerType::value_type value_type ;
+
+      // Running partial result of this CUDA thread.
+      value_type tmp( reducer.reference() );
+
+      // reduce within the warp using shuffle
+
+      // Lane index of this CUDA thread within its warp.
+      const int wx =
+        ( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
+
+      // Halve the shuffle distance each step, stopping once it would drop
+      // below the vector length (vector lanes are not reduced here).
+      for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
+
+        // Presumably fetches 'tmp' of the lane 'i' positions higher into
+        // reducer.reference() -- confirm against cuda_shfl_down.
+        cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
+
+        // Root of each vector lane reduces:
+        if ( 0 == threadIdx.x && wx < i ) {
+          reducer.join( tmp , reducer.reference() );
+        }
+      }
+
+      if ( 1 < blockDim.z ) { // team <= warp
+        // broadcast result from root vector lane of root thread
+
+        cuda_shfl( reducer.reference() , tmp
+                 , blockDim.x * threadIdx.y , CudaTraits::WarpSize );
+
+      }
+      else { // team == block
+        // Reduce across warps using shared memory
+        // Broadcast result within block
+
+        // Number of warps, blockDim.y may not be power of two:
+        const int nw  = ( blockDim.x * blockDim.y + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
+
+        // Warp index:
+        const int wy = ( blockDim.x * threadIdx.y ) >> CudaTraits::WarpIndexShift ;
+
+        // Number of shared memory entries for the reduction:
+        int nsh = m_team_reduce_size / sizeof(value_type);
+
+        // Using at most one entry per warp:
+        if ( nw < nsh ) nsh = nw ;
+
+        __syncthreads(); // Wait before shared data write
+
+        // Lane 0 of the first 'nsh' warps seeds one shared entry each.
+        if ( 0 == wx && wy < nsh ) {
+          ((value_type*) m_team_reduce)[wy] = tmp ;
+        }
+
+        // When more warps than shared entries:
+        // the remaining warps take turns, 'nsh' warps per round, joining
+        // their partial result into entry ( wy - i ).
+        for ( int i = nsh ; i < nw ; i += nsh ) {
+
+          __syncthreads();
+
+          if ( 0 == wx && i <= wy ) {
+            const int k = wy - i ;
+            if ( k < nsh ) {
+              reducer.join( *((value_type*) m_team_reduce + k) , tmp );
+            }
+          }
+        }
+
+        __syncthreads();
+
+        // One warp performs the inter-warp reduction:
+
+        if ( 0 == wy ) {
+
+          // Start at power of two covering nsh
+          // NOTE(review): assumes 1 <= nsh; nsh == 0 would make the shift
+          // count 32 -- confirm m_team_reduce_size >= sizeof(value_type).
+
+          for ( int i = 1 << ( 32 - __clz(nsh-1) ) ; ( i >>= 1 ) ; ) {
+            const int k = wx + i ;
+            if ( wx < i && k < nsh ) {
+              reducer.join( ((value_type*)m_team_reduce)[wx]
+                          , ((value_type*)m_team_reduce)[k] );
+              __threadfence_block();
+            }
+          }
+        }
+
+        __syncthreads(); // Wait for reduction
+
+        // Broadcast result to all threads
+        reducer.reference() = *((value_type*)m_team_reduce);
+      }
+
+      #endif /* #ifdef __CUDA_ARCH__ */
+    }
+
+  //--------------------------------------------------------------------------
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   *
+   *  \param value         contribution of the calling thread
+   *  \param global_accum  optional global accumulator; when non-null the
+   *                       team total is atomically added to it and the
+   *                       value it held before is added to every result
+   *  \return the exclusive scan value for the calling thread
+   *          ( Type() when compiled for the host )
+   *
+   *  Uses the m_team_reduce buffer as Type[ blockDim.y + 1 ] and calls
+   *  __syncthreads(), so every thread of the block must take part.
+   *  NOTE(review): __syncthreads() is issued regardless of blockDim.z --
+   *  confirm this is only used when the team is the whole block.
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION
+  Type team_scan( const Type & value , Type * const global_accum ) const
+    {
+      #ifdef __CUDA_ARCH__
+      Type * const base_data = (Type *) m_team_reduce ;
+
+      __syncthreads(); // Don't write in to shared data until all threads have entered this function
+
+      // Slot 0 is the exclusive-scan base; thread t's value goes in slot t+1.
+      if ( 0 == threadIdx.y ) { base_data[0] = 0 ; }
+
+      base_data[ threadIdx.y + 1 ] = value ;
+
+      // Scan of slots [1..blockDim.y] with += as the join, so that
+      // afterwards base_data[t] is read as the exclusive result of thread t.
+      Impl::cuda_intra_block_reduce_scan<true,Impl::CudaJoinFunctor<Type>,void>( Impl::CudaJoinFunctor<Type>() , base_data + 1 );
+
+      if ( global_accum ) {
+        // The highest-rank thread publishes the team total and stores the
+        // accumulator's previous value in the last slot.
+        if ( blockDim.y == threadIdx.y + 1 ) {
+          base_data[ blockDim.y ] = atomic_fetch_add( global_accum , base_data[ blockDim.y ] );
+        }
+        __syncthreads(); // Wait for atomic
+        base_data[ threadIdx.y ] += base_data[ blockDim.y ] ;
+      }
+
+      return base_data[ threadIdx.y ];
+      #else
+      return Type();
+      #endif
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  Forwards to team_scan( value , global_accum ) with a null
+   *  global accumulator, so no inter-team accumulation takes place.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   *
+   *  \param value  contribution of the calling thread
+   *  \return the exclusive scan value for the calling thread
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const {
+    // nullptr rather than the literal 0: the argument is a pointer.
+    return this->template team_scan<Type>( value , nullptr );
+  }
+
+  //----------------------------------------
+
+  /** \brief  Reduction across the vector lanes of one thread.
+   *
+   *  On entry reducer.reference() holds the calling lane's contribution;
+   *  on return it holds the value broadcast from lane 0.
+   *  Returns immediately when blockDim.x == 1; compiles to nothing on the
+   *  host (no __CUDA_ARCH__).
+   */
+  template< typename ReducerType >
+  KOKKOS_INLINE_FUNCTION static
+  typename std::enable_if< is_reducer< ReducerType >::value >::type
+  vector_reduce( ReducerType const & reducer )
+    {
+
+      #ifdef __CUDA_ARCH__
+      if(blockDim.x == 1) return;
+
+      // Intra vector lane shuffle reduction:
+      typename ReducerType::value_type tmp ( reducer.reference() );
+
+      // Halve the shuffle distance each step; only lanes below the current
+      // distance join the value they received.
+      for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
+        cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x );
+        if ( threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
+      }
+
+      // Broadcast from root lane to all other lanes.
+      // Cannot use "butterfly" algorithm to avoid the broadcast
+      // because floating point summation is not associative
+      // and thus different threads could have different results.
+
+      cuda_shfl( reducer.reference() , tmp , 0 , blockDim.x );
+      #endif
+    }
+
+  //--------------------------------------------------------------------------
+  /**\brief  Global reduction across all blocks
+   *
+   *  Return !0 if reducer contains the final value
+   *
+   *  \param reducer               holds the calling thread's contribution
+   *  \param global_scratch_flags  counter of blocks that have arrived;
+   *                               reset to 0 by the last block
+   *  \param global_scratch_space  one reducer.length() sized slot per block
+   *                               ( indexed by blockIdx.x )
+   *  \param shmem                 shared memory work space
+   *  \param shmem_size            size of shmem in bytes
+   *
+   *  Collective over the whole block ( uses __syncthreads ).
+   *  Always returns 0 when compiled for the host.
+   */
+  template< typename ReducerType >
+  KOKKOS_INLINE_FUNCTION static
+  typename std::enable_if< is_reducer< ReducerType >::value , int >::type
+  global_reduce( ReducerType const & reducer
+               , int  * const global_scratch_flags
+               , void * const global_scratch_space
+               , void * const shmem
+               , int    const shmem_size
+               )
+    {
+    #ifdef __CUDA_ARCH__
+
+      typedef typename ReducerType::value_type value_type ;
+      typedef value_type volatile * pointer_type ;
+
+      // Number of shared memory entries for the reduction:
+      const int nsh = shmem_size / sizeof(value_type);
+
+      // Number of CUDA threads in the block, rank within the block
+      const int nid = blockDim.x * blockDim.y * blockDim.z ;
+      const int tid = threadIdx.x + blockDim.x * (
+                      threadIdx.y + blockDim.y * threadIdx.z );
+
+      // Reduces within block using all available shared memory
+      // Contributes if it is the root "vector lane"
+
+      // wn == number of warps in the block
+      // wx == which lane within the warp
+      // wy == which warp within the block
+
+      const int wn = ( nid + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
+      const int wx = tid &  CudaTraits::WarpIndexMask ;
+      const int wy = tid >> CudaTraits::WarpIndexShift ;
+
+      //------------------------
+      { // Intra warp shuffle reduction from contributing CUDA threads
+
+        value_type tmp( reducer.reference() );
+
+        for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
+
+          cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize );
+
+          // Root of each vector lane reduces "thread" contribution
+          // NOTE(review): join is called here with pointers
+          // ( & tmp , reducer.data() ) whereas team_reduce passes
+          // references -- confirm the reducer provides both forms.
+          if ( 0 == threadIdx.x && wx < i ) {
+            reducer.join( & tmp , reducer.data() );
+          }
+        }
+
+        // Reduce across warps using shared memory.
+        // Number of warps may not be power of two.
+
+        __syncthreads(); // Wait before shared data write
+
+        // Number of shared memory entries for the reduction
+        // is at most one per warp
+        const int nentry = wn < nsh ? wn : nsh ;
+
+        if ( 0 == wx && wy < nentry ) {
+          // Root thread of warp 'wy' has warp's value to contribute
+          ((value_type*) shmem)[wy] = tmp ;
+        }
+
+        __syncthreads(); // Wait for write to be visible to block
+
+        // When more warps than shared entries
+        // then warps must take turns joining their contribution
+        // to the designated shared memory entry.
+        for ( int i = nentry ; i < wn ; i += nentry ) {
+
+          const int k = wy - i ;
+
+          if ( 0 == wx && i <= wy && k < nentry ) {
+            // Root thread of warp 'wy' has warp's value to contribute
+            reducer.join( ((value_type*) shmem) + k , & tmp );
+          }
+
+          __syncthreads(); // Wait for write to be visible to block
+        }
+
+        // One warp performs the inter-warp reduction:
+
+        if ( 0 == wy ) {
+
+          // Start fan-in at power of two covering nentry
+
+          for ( int i = ( 1 << ( 32 - __clz(nentry-1) ) ) ; ( i >>= 1 ) ; ) {
+            const int k = wx + i ;
+            if ( wx < i && k < nentry ) {
+              reducer.join( ((pointer_type)shmem) + wx
+                          , ((pointer_type)shmem) + k );
+              __threadfence_block(); // Wait for write to be visible to warp
+            }
+          }
+        }
+      }
+      //------------------------
+      { // Write block's value to global_scratch_memory
+
+        int last_block = 0 ;
+
+        // NOTE(review): this branch is taken by lane 0 of EVERY warp
+        // ( 0 == wx ), not only by tid 0, so with more than one warp per
+        // block the slot is written and the flag incremented once per warp.
+        // It also copies reducer.data() rather than the block result that
+        // the fan-in above left in shmem.  Confirm both are intended.
+        if ( 0 == wx ) {
+          reducer.copy( ((pointer_type)global_scratch_space)
+                        + blockIdx.x * reducer.length()
+                      , reducer.data() );
+
+          __threadfence(); // Wait until global write is visible.
+
+          last_block = gridDim.x ==
+                       1 + Kokkos::atomic_fetch_add(global_scratch_flags,1);
+
+          // If last block then reset count
+          if ( last_block ) *global_scratch_flags = 0 ;
+        }
+
+        // Make the decision uniform across the block.
+        last_block = __syncthreads_or( last_block );
+
+        if ( ! last_block ) return 0 ;
+
+      }
+      //------------------------
+      // Last block reads global_scratch_memory into shared memory.
+
+      const int nentry = nid < gridDim.x ?
+                       ( nid       < nsh ? nid       : nsh ) :
+                       ( gridDim.x < nsh ? gridDim.x : nsh ) ;
+
+      // nentry = min( nid , nsh , gridDim.x )
+
+      // whole block reads global memory into shared memory:
+
+      if ( tid < nentry ) {
+
+        const int offset = tid * reducer.length();
+
+        reducer.copy( ((pointer_type)shmem) + offset
+                    , ((pointer_type)global_scratch_space) + offset );
+
+        // Fold the remaining per-block slots, 'nentry' apart, into this
+        // thread's shared entry.
+        for ( int i = nentry + tid ; i < gridDim.x ; i += nentry ) {
+          reducer.join( ((pointer_type)shmem) + offset
+                      , ((pointer_type)global_scratch_space)
+                        + i * reducer.length() );
+        }
+      }
+
+      __syncthreads(); // Wait for writes to be visible to block
+
+      if ( 0 == wy ) {
+
+        // Iterate to reduce shared memory to single warp fan-in size
+
+        const int nreduce = CudaTraits::WarpSize < nentry
+                          ? CudaTraits::WarpSize : nentry ;
+
+        // nreduce = min( CudaTraits::WarpSize , nentry )
+
+        if ( wx < nreduce && nreduce < nentry ) {
+          for ( int i = nreduce + wx ; i < nentry ; i += nreduce ) {
+            reducer.join( ((pointer_type)shmem) + wx
+                        , ((pointer_type)shmem) + i );
+          }
+          __threadfence_block(); // Wait for writes to be visible to warp
+        }
+
+        // Start fan-in at power of two covering nreduce
+
+        for ( int i = ( 1 << ( 32 - __clz(nreduce-1) ) ) ; ( i >>= 1 ) ; ) {
+          const int k = wx + i ;
+          if ( wx < i && k < nreduce ) {
+            reducer.join( ((pointer_type)shmem) + wx
+                        , ((pointer_type)shmem) + k );
+            __threadfence_block(); // Wait for writes to be visible to warp
+          }
+        }
+
+        // Only lane 0 of warp 0 of the last block reports the final value.
+        if ( 0 == wx ) {
+          reducer.copy( reducer.data() , (pointer_type)shmem );
+          return 1 ;
+        }
+      }
+      return 0 ;
+
+    #else
+      return 0 ;
+    #endif
+    }
+
+  //----------------------------------------
+  // Private for the driver
+  //
+  // Constructs a team member over one shared memory allocation:
+  //   [ shared , shared + shared_begin )  is kept as the team reduction
+  //     buffer ( m_team_reduce , m_team_reduce_size == shared_begin )
+  //   shared + shared_begin , shared_size and the level 1 scratch
+  //     pointer/size are handed to the scratch memory space m_team_shared
+  // arg_league_rank / arg_league_size are stored as given.
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTeamMember( void * shared
+                , const int shared_begin
+                , const int shared_size
+                , void*     scratch_level_1_ptr
+                , const int scratch_level_1_size
+                , const int arg_league_rank
+                , const int arg_league_size )
+    : m_team_reduce( shared )
+    , m_team_shared( ((char *)shared) + shared_begin , shared_size,  scratch_level_1_ptr, scratch_level_1_size)
+    , m_team_reduce_size( shared_begin )
+    , m_league_rank( arg_league_rank )
+    , m_league_size( arg_league_size )
+    {}
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Iteration bounds of a TeamThreadRange over a CudaTeamMember.
+ *
+ *  Keeps a reference to the team member together with the index
+ *  interval [start,end).
+ */
+template<typename iType>
+struct TeamThreadRangeBoundariesStruct<iType,CudaTeamMember> {
+  using index_type = iType;
+
+  const CudaTeamMember& member;
+  const iType start;
+  const iType end;
+
+  /** \brief  Range [0,count). */
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct (const CudaTeamMember& team, const iType& count)
+    : member( team ) , start( 0 ) , end( count )
+    {}
+
+  /** \brief  Range [first,last). */
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct (const CudaTeamMember& team,  const iType& first, const iType& last)
+    : member( team ) , start( first ) , end( last )
+    {}
+};
+
+
+
+/** \brief  Iteration bounds of a ThreadVectorRange over a CudaTeamMember.
+ *
+ *  Describes the index interval [start,end); both constructors set
+ *  start to 0.  The team member is accepted only for interface
+ *  symmetry with TeamThreadRange and is not stored.
+ */
+template<typename iType>
+struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
+  typedef iType index_type;
+  const iType start;
+  const iType end;
+
+  /** \brief  Range [0,count); the team member argument is unused.
+   *
+   *  Taken by const reference so that the (unused) team member is not
+   *  copied on every call.
+   */
+  KOKKOS_INLINE_FUNCTION
+  ThreadVectorRangeBoundariesStruct (const CudaTeamMember&, const iType& count)
+    : start( 0 ), end( count ) {}
+
+  /** \brief  Range [0,count). */
+  KOKKOS_INLINE_FUNCTION
+  ThreadVectorRangeBoundariesStruct (const iType& count)
+    : start( 0 ), end( count ) {}
+};
+
+} // namespace Impl
+
+// Team-thread range [0,count) for a Cuda team member.
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >
+TeamThreadRange( const Impl::CudaTeamMember & thread, const iType & count ) {
+  using boundaries_type =
+    Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember > ;
+  return boundaries_type( thread, count );
+}
+
+// Team-thread range [begin,end) for a Cuda team member.
+// Both bounds are converted to their std::common_type.
+template< typename iType1, typename iType2 >
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
+                                       Impl::CudaTeamMember >
+TeamThreadRange( const Impl::CudaTeamMember & thread, const iType1 & begin, const iType2 & end ) {
+  using iType = typename std::common_type< iType1, iType2 >::type ;
+  using boundaries_type =
+    Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember > ;
+  return boundaries_type( thread, iType(begin), iType(end) );
+}
+
+// Thread-vector range [0,count) for a Cuda team member.
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
+ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) {
+  using boundaries_type =
+    Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember > ;
+  return boundaries_type( thread , count );
+}
+
+// Tag object selecting the once-per-team overloads of Kokkos::single.
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::CudaTeamMember> PerTeam(const Impl::CudaTeamMember& thread) {
+  using single_type = Impl::ThreadSingleStruct<Impl::CudaTeamMember> ;
+  return single_type( thread );
+}
+
+// Tag object selecting the once-per-thread overloads of Kokkos::single.
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::CudaTeamMember> PerThread(const Impl::CudaTeamMember& thread) {
+  using single_type = Impl::VectorSingleStruct<Impl::CudaTeamMember> ;
+  return single_type( thread );
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Inter-thread parallel_for.
+ *
+ *  Calls closure(i) for every i in [start,end) of loop_boundaries.
+ *
+ *  The range is distributed over the threads of the calling team:
+ *  the thread with index threadIdx.y handles start + threadIdx.y,
+ *  start + threadIdx.y + blockDim.y, and so on.
+ *  Compiles to nothing on the host (no __CUDA_ARCH__).
+ */
+template<typename iType, class Closure >
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
+      loop_boundaries
+  , const Closure & closure
+  )
+{
+  #ifdef __CUDA_ARCH__
+  iType idx = loop_boundaries.start + threadIdx.y ;
+  while ( idx < loop_boundaries.end ) {
+    closure(idx);
+    idx += blockDim.y ;
+  }
+  #endif
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Inter-thread parallel_reduce with a reducer.
+ *
+ *  Calls closure(i, val) for every i in [start,end) of loop_boundaries,
+ *  where val is reducer.reference().
+ *
+ *  The range is distributed over the threads of the calling team.
+ *  Each thread starts from reducer.init() and the per-thread values are
+ *  combined by CudaTeamMember::team_reduce, which leaves the result in
+ *  reducer.reference().
+ *  Compiles to nothing on the host (no __CUDA_ARCH__).
+ */
+template< typename iType, class Closure, class ReducerType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
+parallel_reduce
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
+      loop_boundaries
+  , const Closure & closure
+  , const ReducerType & reducer
+  )
+{
+#ifdef __CUDA_ARCH__
+
+  // Reset this thread's contribution.
+  reducer.init( reducer.reference() );
+
+  // Thread-private accumulation over this thread's share of the range.
+  iType idx = loop_boundaries.start + threadIdx.y ;
+  while ( idx < loop_boundaries.end ) {
+    closure(idx,reducer.reference());
+    idx += blockDim.y ;
+  }
+
+  // Combine the contributions of all team threads.
+  loop_boundaries.member.team_reduce( reducer );
+
+#endif
+}
+
+
+/** \brief  Inter-thread parallel_reduce assuming summation.
+ *
+ *  Calls closure(i, result) for every i in [start,end) of loop_boundaries.
+ *
+ *  The range is distributed over the threads of the calling team.
+ *  result is wrapped in a Kokkos::Experimental::Sum reducer, reset with
+ *  that reducer's init(), and the per-thread values are then combined by
+ *  CudaTeamMember::team_reduce.
+ *  Compiles to nothing on the host (no __CUDA_ARCH__).
+ */
+template< typename iType, class Closure, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< ! Kokkos::is_reducer< ValueType >::value >::type
+parallel_reduce
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
+      loop_boundaries
+  , const Closure & closure
+  , ValueType & result
+  )
+{
+#ifdef __CUDA_ARCH__
+
+  // Sum reducer constructed over the caller's result variable.
+  Kokkos::Experimental::Sum<ValueType> sum_reducer(result);
+
+  sum_reducer.init( sum_reducer.reference() );
+
+  // Thread-private accumulation over this thread's share of the range.
+  iType idx = loop_boundaries.start + threadIdx.y ;
+  while ( idx < loop_boundaries.end ) {
+    closure(idx,result);
+    idx += blockDim.y ;
+  }
+
+  // Combine the contributions of all team threads.
+  loop_boundaries.member.team_reduce( sum_reducer );
+
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Intra-thread vector parallel_for.
+ *
+ *  Calls closure(i) for every i in [start,end) of loop_boundaries.
+ *
+ *  The range is distributed over the vector lanes of the calling thread:
+ *  the lane with index threadIdx.x handles start + threadIdx.x,
+ *  start + threadIdx.x + blockDim.x, and so on.
+ *  Compiles to nothing on the host (no __CUDA_ARCH__).
+ */
+template<typename iType, class Closure >
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
+    loop_boundaries
+  , const Closure & closure
+  )
+{
+#ifdef __CUDA_ARCH__
+  iType idx = loop_boundaries.start + threadIdx.x ;
+  while ( idx < loop_boundaries.end ) {
+    closure(idx);
+    idx += blockDim.x ;
+  }
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Intra-thread vector parallel_reduce with a reducer.
+ *
+ *  Calls closure(i, val) for every i in [start,end) of loop_boundaries,
+ *  where val is reducer.reference().
+ *
+ *  The range is distributed over the vector lanes of the calling thread.
+ *  Each lane starts from reducer.init() and the per-lane values are
+ *  combined by CudaTeamMember::vector_reduce, which leaves the result in
+ *  reducer.reference().
+ *  Compiles to nothing on the host (no __CUDA_ARCH__).
+ */
+template< typename iType, class Closure, class ReducerType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< is_reducer< ReducerType >::value >::type
+parallel_reduce
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>
+      const & loop_boundaries
+  , Closure const & closure
+  , ReducerType const & reducer )
+{
+#ifdef __CUDA_ARCH__
+
+  // Reset this lane's contribution.
+  reducer.init( reducer.reference() );
+
+  // Lane-private accumulation over this lane's share of the range.
+  iType idx = loop_boundaries.start + threadIdx.x ;
+  while ( idx < loop_boundaries.end ) {
+    closure(idx,reducer.reference());
+    idx += blockDim.x ;
+  }
+
+  // Combine the contributions of all vector lanes.
+  Impl::CudaTeamMember::vector_reduce( reducer );
+
+#endif
+}
+
+/** \brief  Intra-thread vector parallel_reduce assuming summation.
+ *
+ *  Calls closure(i, result) for every i in [start,end) of loop_boundaries.
+ *
+ *  The range is distributed over the vector lanes of the calling thread.
+ *  result is first set to a default constructed ValueType, which is
+ *  assumed to be the identity of the summation; the per-lane values are
+ *  then combined by CudaTeamMember::vector_reduce through a
+ *  Kokkos::Experimental::Sum reducer over result.
+ *  Compiles to nothing on the host (no __CUDA_ARCH__).
+ */
+template< typename iType, class Closure, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< ! is_reducer< ValueType >::value >::type
+parallel_reduce
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>
+      const & loop_boundaries
+  , Closure const & closure
+  , ValueType & result )
+{
+#ifdef __CUDA_ARCH__
+  // Start from the assumed identity value.
+  result = ValueType();
+
+  // Lane-private accumulation over this lane's share of the range.
+  iType idx = loop_boundaries.start + threadIdx.x ;
+  while ( idx < loop_boundaries.end ) {
+    closure(idx,result);
+    idx += blockDim.x ;
+  }
+
+  // Combine the contributions of all vector lanes.
+  Kokkos::Experimental::Sum<ValueType> sum_reducer(result );
+  Impl::CudaTeamMember::vector_reduce( sum_reducer );
+
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum.
+ *
+ *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all vector lanes in the
+ *  thread and a scan operation is performed.
+ *  The closure is called twice for each index: first with final == false
+ *  and val == 0 to collect the contribution of index i, then with
+ *  final == true and val holding the exclusive scan value for index i.
+ *  Compiles to nothing on the host (no __CUDA_ARCH__).
+ *
+ *  NOTE(review): loop_boundaries.start is not read (both constructors of
+ *  this boundaries struct set it to 0) and the loop index is an int, so a
+ *  range end that does not fit in an int would be truncated -- confirm
+ *  that callers never pass such ranges.
+ */
+template< typename iType, class Closure >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  ( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
+      loop_boundaries
+  , const Closure & closure
+  )
+{
+
+#ifdef __CUDA_ARCH__
+
+  // Extract value_type from closure
+
+  using value_type =
+    typename Kokkos::Impl::FunctorAnalysis
+      < Kokkos::Impl::FunctorPatternInterface::SCAN
+      , void
+      , Closure >::value_type ;
+
+  // Loop through boundaries by vector-length chunks
+  // must scan at each iteration
+
+  // Running total of all chunks processed so far.
+  value_type accum = 0 ;
+
+  // All thread "lanes" must loop the same number of times.
+  // Determine a loop end for all thread "lanes."
+  // Requires:
+  //   blockDim.x is power of two and thus
+  //     ( end % blockDim.x ) == ( end & ( blockDim.x - 1 ) )
+  //   1 <= blockDim.x <= CudaTraits::WarpSize
+
+  const int mask = blockDim.x - 1 ;
+  const int rem  = loop_boundaries.end & mask ; // == end % blockDim.x
+  const int end  = loop_boundaries.end + ( rem ? blockDim.x - rem : 0 );
+
+  for ( int i = threadIdx.x ; i < end ; i += blockDim.x ) {
+
+    // Lanes past the real end contribute 0 but still take part in the
+    // shuffles below.
+    value_type val = 0 ;
+
+    // First acquire per-lane contributions:
+    if ( i < loop_boundaries.end ) closure( i , val , false );
+
+    value_type sval = val ;
+
+    // Bottom up inclusive scan in triangular pattern
+    // where each CUDA thread is the root of a reduction tree
+    // from the zeroth "lane" to itself.
+    //  [t] += [t-1] if t >= 1
+    //  [t] += [t-2] if t >= 2
+    //  [t] += [t-4] if t >= 4
+    //  ...
+
+    for ( int j = 1 ; j < blockDim.x ; j <<= 1 ) {
+      value_type tmp = 0 ;
+      Impl::cuda_shfl_up( tmp , sval , j , blockDim.x );
+      if ( j <= threadIdx.x ) { sval += tmp ; }
+    }
+
+    // Include accumulation and remove value for exclusive scan:
+    val = accum + sval - val ;
+
+    // Provide exclusive scan value:
+    if ( i < loop_boundaries.end ) closure( i , val , true );
+
+    // Accumulate the last value in the inclusive scan:
+    // ( 'mask' is the index of the highest lane, blockDim.x - 1 )
+    Impl::cuda_shfl( sval , sval , mask , blockDim.x );
+
+    accum += sval ;
+  }
+
+#endif
+}
+
+}
+
+namespace Kokkos {
+
+// Once-per-thread single: only vector lane 0 ( threadIdx.x == 0 ) calls
+// lambda().  Compiles to nothing on the host (no __CUDA_ARCH__).
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
+#ifdef __CUDA_ARCH__
+  const bool is_root_lane = ( 0 == threadIdx.x );
+  if ( is_root_lane ) { lambda(); }
+#endif
+}
+
+// Once-per-team single: only vector lane 0 of team thread 0
+// ( threadIdx.x == 0 and threadIdx.y == 0 ) calls lambda().
+// Compiles to nothing on the host (no __CUDA_ARCH__).
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
+#ifdef __CUDA_ARCH__
+  const bool is_team_root = ( 0 == threadIdx.x ) && ( 0 == threadIdx.y );
+  if ( is_team_root ) { lambda(); }
+#endif
+}
+
+// Once-per-thread single with a result: vector lane 0 calls lambda(val),
+// then every lane's val is overwritten with the value shuffled from
+// lane 0 ( shfl over blockDim.x lanes ).  The shuffle is executed by all
+// lanes, so every lane of the thread must call this function.
+// Compiles to nothing on the host (no __CUDA_ARCH__).
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda, ValueType& val) {
+#ifdef __CUDA_ARCH__
+  const bool is_root_lane = ( 0 == threadIdx.x );
+  if ( is_root_lane ) { lambda(val); }
+
+  val = shfl(val,0,blockDim.x);
+#endif
+}
+
+// Once-per-team single with a result: vector lane 0 of team thread 0
+// calls lambda(val), then val is broadcast from team thread 0 to the
+// whole team with CudaTeamMember::team_broadcast.  The broadcast is a
+// team collective, so every thread of the team must call this function.
+// Compiles to nothing on the host (no __CUDA_ARCH__).
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+#ifdef __CUDA_ARCH__
+  const bool is_team_root = ( 0 == threadIdx.x ) && ( 0 == threadIdx.y );
+  if ( is_team_root ) { lambda(val); }
+
+  single_struct.team_member.team_broadcast(val,0);
+#endif
+}
+
+} // namespace Kokkos
+
+#endif /* defined( __CUDACC__ ) */
+
+#endif /* #ifndef KOKKOS_CUDA_TEAM_HPP */
+
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -44,8 +44,6 @@
 #define KOKKOS_CUDA_VECTORIZATION_HPP

 #include <Kokkos_Macros.hpp>
-
-/* only compile this file if CUDA is enabled for Kokkos */
 #ifdef KOKKOS_ENABLE_CUDA

 #include <Kokkos_Cuda.hpp>
@ -296,3 +294,4 @@ namespace Impl {

 #endif // KOKKOS_ENABLE_CUDA
 #endif
+
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -44,7 +44,7 @@
 #ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
 #define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP

-/* only compile this file if CUDA is enabled for Kokkos */
+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ENABLE_CUDA )

 //----------------------------------------------------------------------------
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -46,7 +46,7 @@

 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
-#include "Kokkos_Macros.hpp"
+#include <Kokkos_Macros.hpp>
 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )

 #include <cuda.h>
@ -82,6 +82,8 @@ void cuda_abort( const char * const message )

 } // namespace Impl
 } // namespace Kokkos
+#else
+// Placeholder symbol for builds that skip the CUDA branch above. Declared
+// inline because this is a header: a non-inline definition would be emitted
+// in every including translation unit and clash at link time.
+inline void KOKKOS_CORE_SRC_CUDA_ABORT_PREVENT_LINK_ERROR() {}
 #endif /* #if defined(__CUDACC__) && defined( KOKKOS_ENABLE_CUDA ) */
 #endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */

--- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
+++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@ -236,13 +236,13 @@ struct MDRangePolicy
    MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );

 #else
-    if(m_lower.size()!=rank || m_upper.size() != rank)
+    if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
      Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");

    for ( auto i = 0; i < rank; ++i ) {
      m_lower[i] = static_cast<array_index_type>(lower.begin()[i]);
      m_upper[i] = static_cast<array_index_type>(upper.begin()[i]);
-      if(tile.size()==rank)
+      if(static_cast<int>(tile.size())==rank)
        m_tile[i] = static_cast<array_index_type>(tile.begin()[i]);
      else
        m_tile[i] = 0;
--- a/lib/kokkos/core/src/Kokkos_Complex.hpp
+++ b/lib/kokkos/core/src/Kokkos_Complex.hpp
@ -44,6 +44,7 @@
 #define KOKKOS_COMPLEX_HPP

 #include <Kokkos_Atomic.hpp>
+#include <Kokkos_NumericTraits.hpp>
 #include <complex>
 #include <iostream>

@ -324,9 +325,30 @@ public:
    im_ /= src;
    return *this;
  }
+
+  /// Equality with another complex: true when both the real and the
+  /// imaginary parts compare equal. Const-qualified so that it can be
+  /// invoked on const objects.
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (const complex<RealType>& src) const {
+    return (re_ == src.re_) && (im_ == src.im_);
+  }
+
+  /// Equality with a real scalar: true when the real part equals \c src
+  /// and the imaginary part equals RealType(0).
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (const RealType src) const {
+    return (re_ == src) && (im_ == RealType(0));
+  }
+
+  /// Inequality with another complex: true when either part differs.
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (const complex<RealType>& src) const {
+    return (re_ != src.re_) || (im_ != src.im_);
+  }
+
+  /// Inequality with a real scalar: true when the real part differs from
+  /// \c src or the imaginary part differs from RealType(0).
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (const RealType src) const {
+    return (re_ != src) || (im_ != RealType(0));
+  }
+
 };

-//! Binary + operator for complex.
+//! Binary + operator for complex + complex.
 template<class RealType>
 KOKKOS_INLINE_FUNCTION
 complex<RealType>
@ -334,6 +356,22 @@ operator + (const complex<RealType>& x, const complex<RealType>& y) {
  return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
 }

+//! Binary + operator for complex + scalar: the scalar is added to the real part only.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType>
+operator + (const complex<RealType>& x, const RealType& y) {
+  const RealType sum_re = x.real () + y ;
+  return complex<RealType> (sum_re , x.imag ());
+}
+
+//! Binary + operator for scalar + complex: the scalar is added to the real part only.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType>
+operator + (const RealType& x, const complex<RealType>& y) {
+  const RealType sum_re = x + y.real ();
+  return complex<RealType> (sum_re, y.imag ());
+}
+
 //! Unary + operator for complex.
 template<class RealType>
 KOKKOS_INLINE_FUNCTION
@ -350,6 +388,22 @@ operator - (const complex<RealType>& x, const complex<RealType>& y) {
  return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
 }

+//! Binary - operator for complex - scalar: the scalar is subtracted from the real part only.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType>
+operator - (const complex<RealType>& x, const RealType& y) {
+  const RealType diff_re = x.real () - y ;
+  return complex<RealType> (diff_re , x.imag ());
+}
+
+//! Binary - operator for scalar - complex: real parts are subtracted and the
+//! imaginary part of the result is the negated imaginary part of \c y.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType>
+operator - (const RealType& x, const complex<RealType>& y) {
+  const RealType diff_re = x - y.real ();
+  const RealType neg_im  = - y.imag ();
+  return complex<RealType> (diff_re, neg_im);
+}
+
 //! Unary - operator for complex.
 template<class RealType>
 KOKKOS_INLINE_FUNCTION
@ -395,6 +449,16 @@ operator * (const RealType& x, const complex<RealType>& y) {
  return complex<RealType> (x * y.real (), x * y.imag ());
 }

+/// \brief Binary * operator for complex times RealType.
+///
+/// Counterpart of the (RealType, complex) overload so the scalar may appear
+/// on either side: both the real and imaginary parts of \c y are scaled by \c x.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType>
+operator * (const complex<RealType>& y, const RealType& x) {
+  const RealType scaled_re = x * y.real ();
+  const RealType scaled_im = x * y.imag ();
+  return complex<RealType> (scaled_re, scaled_im);
+}

 //! Imaginary part of a complex number.
 template<class RealType>
@ -415,7 +479,25 @@ template<class RealType>
 KOKKOS_INLINE_FUNCTION
 RealType abs (const complex<RealType>& x) {
  // FIXME (mfh 31 Oct 2014) Scale to avoid unwarranted overflow.
-  return ::sqrt (real (x) * real (x) + imag (x) * imag (x));
+  return std::sqrt (real (x) * real (x) + imag (x) * imag (x));
+}
+
+/// \brief Raise a complex number to a real power.
+///
+/// Computes |x|^e * (cos(e*phi) + i*sin(e*phi)), where phi is the argument
+/// of x. The argument is obtained with std::atan2 so that the quadrant of x
+/// is respected (negative real part) and a zero real part does not lead to
+/// a division by zero.
+///
+/// \param x  complex base
+/// \param e  real exponent
+/// \return   x raised to the power e
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+Kokkos::complex<RealType> pow (const complex<RealType>& x, const RealType& e) {
+  const RealType r = abs(x);
+  const RealType phi = std::atan2(x.imag(), x.real());
+  return std::pow(r,e) * Kokkos::complex<RealType>(std::cos(phi*e),std::sin(phi*e));
+}
+
+/// \brief Square root of a complex number.
+///
+/// Computes sqrt(|x|) * (cos(phi/2) + i*sin(phi/2)), where phi is the
+/// argument of x. The argument is obtained with std::atan2 so that inputs
+/// with a negative or zero real part are handled (e.g. sqrt(-1) yields i
+/// rather than 1, and a purely imaginary x does not divide by zero).
+///
+/// \param x  complex input
+/// \return   the square root of x with argument phi/2, phi in [-pi, pi]
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+Kokkos::complex<RealType> sqrt (const complex<RealType>& x) {
+  const RealType r = abs(x);
+  const RealType phi = std::atan2(x.imag(), x.real());
+  return std::sqrt(r) * Kokkos::complex<RealType>(std::cos(phi*0.5),std::sin(phi*0.5));
 }

 //! Conjugate of a complex number.
@ -425,6 +507,19 @@ complex<RealType> conj (const complex<RealType>& x) {
  return complex<RealType> (real (x), -imag (x));
 }

+/// \brief Exponential of a complex number: e^(re) * (cos(im) + i*sin(im)).
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType> exp (const complex<RealType>& x) {
+  const complex<RealType> unit_phasor (std::cos (x.imag()),  std::sin(x.imag()));
+  return std::exp(x.real()) * unit_phasor;
+}
+
+//! One-argument pow: returns exp(x.real()) * (cos(x.imag()), sin(x.imag())).
+//! NOTE(review): despite its name this evaluates exactly the same expression
+//! as exp(const complex<RealType>&); it looks like a copy-paste of exp() --
+//! confirm whether this overload is intended or should be removed.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType> pow (const complex<RealType>& x) {
+  return std::exp(x.real()) * complex<RealType> (std::cos (x.imag()),  std::sin(x.imag()));
+}

 //! Binary operator / for complex and real numbers
 template<class RealType1, class RealType2>
@ -461,6 +556,14 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
  }
 }

+//! Binary operator / for a real number divided by a complex number.
+//! The real numerator is first converted to complex<RealType1>, then the
+//! division is delegated to the operator / taking a complex numerator.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<RealType1>
+operator / (const RealType1& x, const complex<RealType2>& y) {
+  const complex<RealType1> numerator (x);
+  return numerator / y;
+}
+
 //! Equality operator for two complex numbers.
 template<class RealType>
 KOKKOS_INLINE_FUNCTION
@ -468,9 +571,13 @@ bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
  return real (x) == real (y) && imag (x) == imag (y);
 }

-//! Equality operator for std::complex and Kokkos::complex.
+/// \brief Equality operator for std::complex and Kokkos::complex.
+///
+/// This cannot be a device function, since std::real is not.
+/// Otherwise, CUDA builds will give compiler warnings ("warning:
+/// calling a constexpr __host__ function("real") from a __host__
+/// __device__ function("operator==") is not allowed").
 template<class RealType>
-KOKKOS_INLINE_FUNCTION
 bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
  return std::real (x) == real (y) && std::imag (x) == imag (y);
 }
@ -533,6 +640,15 @@ std::ostream& operator >> (std::ostream& os, complex<RealType>& x) {
 }


+/// Specialization of reduction_identity for Kokkos::complex<T>, built from
+/// the identities of the underlying scalar type T.
+template<class T>
+struct reduction_identity<Kokkos::complex<T> > {
+  typedef reduction_identity<T> t_red_ident;
+  // Sum identity: both real and imaginary parts are T's sum identity.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex<T> sum()
+      {return Kokkos::complex<T>(t_red_ident::sum(),t_red_ident::sum());}
+  // Product identity: real part is T's product identity, imaginary part is
+  // T's sum identity (presumably 1 + 0i -- confirm against reduction_identity<T>).
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex<T> prod()
+      {return Kokkos::complex<T>(t_red_ident::prod(),t_red_ident::sum());}
+};
+
 } // namespace Kokkos

 #endif // KOKKOS_COMPLEX_HPP
--- a/lib/kokkos/core/src/Kokkos_Concepts.hpp
+++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp
@ -200,7 +200,14 @@ public:
    , Kokkos::DefaultHostExecutionSpace , execution_space
    >::type  host_execution_space ;
 #else
-  typedef execution_space  host_execution_space ;
+  #if defined( KOKKOS_ENABLE_OPENMPTARGET )
+    typedef typename std::conditional
+      < std::is_same< execution_space , Kokkos::Experimental::OpenMPTarget >::value
+      , Kokkos::DefaultHostExecutionSpace , execution_space
+      >::type  host_execution_space ;
+  #else
+    typedef execution_space  host_execution_space ;
+  #endif
 #endif

  typedef typename std::conditional
--- a/lib/kokkos/core/src/Kokkos_Core.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core.hpp
@ -57,11 +57,16 @@
 #include <Kokkos_OpenMP.hpp>
 #endif

+//#if defined( KOKKOS_ENABLE_OPENMPTARGET )
+#include <Kokkos_OpenMPTarget.hpp>
+#include <Kokkos_OpenMPTargetSpace.hpp>
+//#endif
+
 #if defined( KOKKOS_ENABLE_QTHREADS )
 #include <Kokkos_Qthreads.hpp>
 #endif

-#if defined( KOKKOS_ENABLE_PTHREAD )
+#if defined( KOKKOS_ENABLE_THREADS )
 #include <Kokkos_Threads.hpp>
 #endif

@ -69,8 +74,8 @@
 #include <Kokkos_Cuda.hpp>
 #endif

-#include <Kokkos_MemoryPool.hpp>
 #include <Kokkos_Pair.hpp>
+#include <Kokkos_MemoryPool.hpp>
 #include <Kokkos_Array.hpp>
 #include <Kokkos_View.hpp>
 #include <Kokkos_Vectorization.hpp>
@ -167,3 +172,4 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
 //----------------------------------------------------------------------------

 #endif
+
--- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
@ -96,7 +96,7 @@ class Serial;    ///< Execution space main process on CPU.
 class Qthreads;  ///< Execution space with Qthreads back-end.
 #endif

-#if defined( KOKKOS_ENABLE_PTHREAD )
+#if defined( KOKKOS_ENABLE_THREADS )
 class Threads;   ///< Execution space with pthreads back-end.
 #endif

@ -104,6 +104,14 @@ class Threads;   ///< Execution space with pthreads back-end.
 class OpenMP;    ///< OpenMP execution space.
 #endif

+#if defined( KOKKOS_ENABLE_OPENMPTARGET )
+namespace Experimental {
+class OpenMPTarget;    ///< OpenMPTarget execution space.
+class OpenMPTargetSpace;
+}
+#endif
+
+
 #if defined( KOKKOS_ENABLE_CUDA )
 class CudaSpace;            ///< Memory space on Cuda GPU
 class CudaUVMSpace;         ///< Memory space on Cuda GPU with UVM
@ -121,12 +129,14 @@ struct Device;

 /// Define Kokkos::DefaultExecutionSpace as per configuration option
 /// or chosen from the enabled execution spaces in the following order:
-/// Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
+/// Kokkos::Cuda, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial

 namespace Kokkos {

 #if   defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
  typedef Cuda DefaultExecutionSpace;
+#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
+  typedef Experimental::OpenMPTarget DefaultExecutionSpace ;
 #elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
  typedef OpenMP DefaultExecutionSpace;
 #elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
@ -136,7 +146,7 @@ namespace Kokkos {
 #elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
  typedef Serial DefaultExecutionSpace;
 #else
-#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
+#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
 #endif

 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
@ -149,7 +159,7 @@ namespace Kokkos {
  typedef Serial DefaultHostExecutionSpace;
 #elif defined( KOKKOS_ENABLE_OPENMP )
  typedef OpenMP DefaultHostExecutionSpace;
-#elif defined( KOKKOS_ENABLE_PTHREAD )
+#elif defined( KOKKOS_ENABLE_THREADS )
  typedef Threads DefaultHostExecutionSpace;
 //#elif defined( KOKKOS_ENABLE_QTHREADS )
 //  typedef Qthreads DefaultHostExecutionSpace;
@ -254,6 +264,21 @@ template< class FunctorType, class ExecPolicy, class ExecutionSapce =

 } // namespace Impl

+// Forward declarations of class templates in Kokkos::Experimental
+// (presumably the built-in reducers -- confirm against their definitions).
+// In every template the Space parameter defaults to HostSpace; the *Loc
+// variants take an additional Index type parameter.
+namespace Experimental {
+template<class ScalarType , class Space = HostSpace> struct Sum;
+template<class ScalarType , class Space = HostSpace> struct Prod;
+template<class ScalarType , class Space = HostSpace> struct Min;
+template<class ScalarType , class Space = HostSpace> struct Max;
+template<class ScalarType , class Space = HostSpace> struct MinMax;
+template<class ScalarType , class Index, class Space = HostSpace> struct MinLoc;
+template<class ScalarType , class Index, class Space = HostSpace> struct MaxLoc;
+template<class ScalarType , class Index, class Space = HostSpace> struct MinMaxLoc;
+template<class ScalarType , class Space = HostSpace> struct BAnd;
+template<class ScalarType , class Space = HostSpace> struct BOr;
+template<class ScalarType , class Space = HostSpace> struct LAnd;
+template<class ScalarType , class Space = HostSpace> struct LOr;
+}
 } // namespace Kokkos

 #endif /* #ifndef KOKKOS_CORE_FWD_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp
@ -44,12 +44,11 @@
 #ifndef KOKKOS_CUDA_HPP
 #define KOKKOS_CUDA_HPP

-#include <Kokkos_Core_fwd.hpp>
-
-// If CUDA execution space is enabled then use this header file.
-
+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ENABLE_CUDA )

+#include <Kokkos_Core_fwd.hpp>
+
 #include <iosfwd>
 #include <vector>

@ -214,6 +213,8 @@ public:
  //@}
  //--------------------------------------------------------------------------

+  static const char* name();
+
 private:

  cudaStream_t m_stream ;
@ -291,6 +292,7 @@ struct VerifyExecutionCanAccessMemorySpace

 #include <Cuda/Kokkos_CudaExec.hpp>
 #include <Cuda/Kokkos_Cuda_View.hpp>
+#include <Cuda/Kokkos_Cuda_Team.hpp>
 #include <Cuda/Kokkos_Cuda_Parallel.hpp>
 #include <Cuda/Kokkos_Cuda_Task.hpp>

@ -300,5 +302,3 @@ struct VerifyExecutionCanAccessMemorySpace
 #endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
 #endif /* #ifndef KOKKOS_CUDA_HPP */

-
-
--- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -44,10 +44,11 @@
 #ifndef KOKKOS_CUDASPACE_HPP
 #define KOKKOS_CUDASPACE_HPP

-#include <Kokkos_Core_fwd.hpp>
-
+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ENABLE_CUDA )

+#include <Kokkos_Core_fwd.hpp>
+
 #include <iosfwd>
 #include <typeinfo>
 #include <string>
@ -717,7 +718,7 @@ private:
  static ::cudaTextureObject_t
  attach_texture_object( const unsigned sizeof_alias
                       , void * const   alloc_ptr
-                       , const size_t   alloc_size ); 
+                       , const size_t   alloc_size );

  static RecordBase s_root_record ;

--- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
@ -565,5 +565,3 @@ ThreadVectorRange( const TeamMemberType&, const iType& count );

 #endif /* #define KOKKOS_EXECPOLICY_HPP */

-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
--- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@ -44,12 +44,11 @@
 #ifndef KOKKOS_HBWSPACE_HPP
 #define KOKKOS_HBWSPACE_HPP

-#include <Kokkos_HostSpace.hpp>
-
-/*--------------------------------------------------------------------------*/
-
+#include <Kokkos_Macros.hpp>
 #ifdef KOKKOS_ENABLE_HBWSPACE

+#include <Kokkos_HostSpace.hpp>
+
 namespace Kokkos {

 namespace Experimental {
@ -114,7 +113,7 @@ public:
 //  typedef Kokkos::Qthreads  execution_space;
 #elif defined( KOKKOS_ENABLE_OPENMP )
  typedef Kokkos::OpenMP    execution_space;
-#elif defined( KOKKOS_ENABLE_PTHREAD )
+#elif defined( KOKKOS_ENABLE_THREADS )
  typedef Kokkos::Threads   execution_space;
 //#elif defined( KOKKOS_ENABLE_QTHREADS )
 //  typedef Kokkos::Qthreads  execution_space;
@ -348,5 +347,5 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace, Kokk
 } // namespace Kokkos

 #endif
-
 #endif // #define KOKKOS_HBWSPACE_HPP
+
--- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
@ -117,7 +117,7 @@ public:
 //  typedef Kokkos::Qthreads  execution_space;
 #elif defined( KOKKOS_ENABLE_OPENMP )
  typedef Kokkos::OpenMP    execution_space;
-#elif defined( KOKKOS_ENABLE_PTHREAD )
+#elif defined( KOKKOS_ENABLE_THREADS )
  typedef Kokkos::Threads   execution_space;
 //#elif defined( KOKKOS_ENABLE_QTHREADS )
 //  typedef Kokkos::Qthreads  execution_space;
@ -265,7 +265,7 @@ public:
    return (SharedAllocationRecord *) 0;
 #endif
  }
-   
+

  /**\brief  Allocate tracked memory in the space */
  static
@ -316,3 +316,4 @@ struct DeepCopy< HostSpace, HostSpace, ExecutionSpace > {
 } // namespace Kokkos

 #endif // #define KOKKOS_HOSTSPACE_HPP
+
--- a/lib/kokkos/core/src/Kokkos_Layout.hpp
+++ b/lib/kokkos/core/src/Kokkos_Layout.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -47,7 +47,7 @@
 #ifndef KOKKOS_LAYOUT_HPP
 #define KOKKOS_LAYOUT_HPP

-#include <stddef.h>
+#include <cstddef>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_Tags.hpp>

@ -62,7 +62,7 @@ enum { ARRAY_LAYOUT_MAX_RANK = 8 };
 ///
 /// This is an example of a \c MemoryLayout template parameter of
 /// View.  The memory layout describes how View maps from a
-/// multi-index (i0, i1, ..., ik) to a memory location.  
+/// multi-index (i0, i1, ..., ik) to a memory location.
 ///
 /// "Layout left" indicates a mapping where the leftmost index i0
 /// refers to contiguous access, and strides increase for dimensions
@ -95,7 +95,7 @@ struct LayoutLeft {
 ///
 /// This is an example of a \c MemoryLayout template parameter of
 /// View.  The memory layout describes how View maps from a
-/// multi-index (i0, i1, ..., ik) to a memory location.  
+/// multi-index (i0, i1, ..., ik) to a memory location.
 ///
 /// "Right layout" indicates a mapping where the rightmost index ik
 /// refers to contiguous access, and strides increase for dimensions
@ -130,7 +130,7 @@ struct LayoutStride {
  typedef LayoutStride array_layout ;

  size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
-  size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ; 
+  size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ;

  LayoutStride( LayoutStride const & ) = default ;
  LayoutStride( LayoutStride && ) = default ;
@ -192,7 +192,7 @@ struct LayoutStride {
 ///
 /// This is an example of a \c MemoryLayout template parameter of
 /// View.  The memory layout describes how View maps from a
-/// multi-index (i0, i1, ..., ik) to a memory location.  
+/// multi-index (i0, i1, ..., ik) to a memory location.
 ///
 /// "Tiled layout" indicates a mapping to contiguously stored
 /// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two
--- a/lib/kokkos/core/src/Kokkos_Macros.hpp
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@ -48,9 +48,10 @@
 /** Pick up configure / build options via #define macros:
 *
 *  KOKKOS_ENABLE_CUDA                Kokkos::Cuda execution and memory spaces
- *  KOKKOS_ENABLE_PTHREAD             Kokkos::Threads execution space
+ *  KOKKOS_ENABLE_THREADS             Kokkos::Threads execution space
 *  KOKKOS_ENABLE_QTHREADS            Kokkos::Qthreads execution space
 *  KOKKOS_ENABLE_OPENMP              Kokkos::OpenMP execution space
+ *  KOKKOS_ENABLE_OPENMPTARGET        Kokkos::Experimental::OpenMPTarget execution space
 *  KOKKOS_ENABLE_HWLOC               HWLOC library is available.
 *  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK  Insert array bounds checks, is expensive!
 *  KOKKOS_ENABLE_MPI                 Negotiate MPI/execution space interactions.
@ -414,6 +415,7 @@
 // There is zero or one default execution space specified.

 #if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET ) ? 1 : 0 ) + \
          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS ) ? 1 : 0 ) + \
@ -424,15 +426,18 @@
 // If default is not specified then chose from enabled execution spaces.
 // Priority: CUDA, OPENMP, THREADS, QTHREADS, SERIAL
 #if   defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
 #elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
 #elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
 //#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
 #elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
 #elif defined( KOKKOS_ENABLE_CUDA )
  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
+#elif defined( KOKKOS_ENABLE_OPENMPTARGET )
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET
 #elif defined( KOKKOS_ENABLE_OPENMP )
  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
-#elif defined( KOKKOS_ENABLE_PTHREAD )
+#elif defined( KOKKOS_ENABLE_THREADS )
  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
 //#elif defined( KOKKOS_ENABLE_QTHREADS )
 //  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
@ -459,10 +464,17 @@
 #endif

 //----------------------------------------------------------------------------
-// Enable Profiling by default
+// When compiling with CUDA, CUDA 8 or newer together with relocatable
+// device code is required to enable the task policy (KOKKOS_ENABLE_TASKDAG).
+// nvcc relocatable device code option: --relocatable-device-code=true

-#ifndef KOKKOS_ENABLE_PROFILING
-  #define KOKKOS_ENABLE_PROFILING 1
+#if ( defined( KOKKOS_ENABLE_CUDA ) )
+  #if ( 8000 <= CUDA_VERSION ) && defined( KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE )
+  #define KOKKOS_ENABLE_TASKDAG
+  #endif
+#else
+  #define KOKKOS_ENABLE_TASKDAG
 #endif

 #endif // #ifndef KOKKOS_MACROS_HPP
+
--- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
--- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
+++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
@ -0,0 +1,217 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_NUMERICTRAITS_HPP
+#define KOKKOS_NUMERICTRAITS_HPP
+
+#include<climits>
+#include<cfloat>
+
+namespace Kokkos {
+
+/// Identity elements for the built-in reduction operations on T.
+///
+/// Only declared here, never defined: a type used with these reductions needs
+/// an explicit specialization (the arithmetic types are specialized below).
+/// The commented-out body sketches the interface a specialization provides.
+template<class T>
+struct reduction_identity; /*{
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T sum() { return T(); }  // 0
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T prod()  // 1
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom prod reduction type"); return T(); }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T max()   // minimum value
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom max reduction type"); return T(); }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T min()   // maximum value
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom min reduction type"); return T(); }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T bor()   // 0, only for integer type
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom bor reduction type"); return T(); }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T band()  // !0, only for integer type
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom band reduction type"); return T(); }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T lor()   // 0, only for integer type
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom lor reduction type"); return T(); }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T land()  // !0, only for integer type
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom land reduction type"); return T(); }
+};*/
+
+// Reduction identities for signed char.
+template<>
+struct reduction_identity<signed char> {
+  // Arithmetic: 0 for sum, 1 for product.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char sum()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char prod() { return 1; }
+  // max() starts from the lowest value, min() from the highest.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char max()  { return SCHAR_MIN; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char min()  { return SCHAR_MAX; }
+  // Bitwise: all bits clear for OR, all bits set for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char bor()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char band() { return static_cast<signed char>(~0); }
+  // Logical: 0 (false) for OR, 1 (true) for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char lor()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char land() { return 1; }
+};
+
+// Reduction identities for short.
+template<>
+struct reduction_identity<short> {
+  // Arithmetic: 0 for sum, 1 for product.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short sum()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short prod() { return 1; }
+  // max() starts from the lowest value, min() from the highest.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short max()  { return SHRT_MIN; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short min()  { return SHRT_MAX; }
+  // Bitwise: all bits clear for OR, all bits set for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short bor()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short band() { return static_cast<short>(~0); }
+  // Logical: 0 (false) for OR, 1 (true) for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short lor()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short land() { return 1; }
+};
+
+// Reduction identities for int.
+template<>
+struct reduction_identity<int> {
+  // Arithmetic: 0 for sum, 1 for product.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int sum()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int prod() { return 1; }
+  // max() starts from the lowest value, min() from the highest.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int max()  { return INT_MIN; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int min()  { return INT_MAX; }
+  // Bitwise: all bits clear for OR, all bits set for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int bor()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int band() { return ~0; }
+  // Logical: 0 (false) for OR, 1 (true) for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int lor()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int land() { return 1; }
+};
+
+// Reduction identities for long.
+template<>
+struct reduction_identity<long> {
+  // Arithmetic: 0 for sum, 1 for product.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long sum()  { return 0L; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long prod() { return 1L; }
+  // max() starts from the lowest value, min() from the highest.
+  // Use the limits of long itself: the long long limits only coincide with
+  // them where long is 64 bits wide and would otherwise be converted to an
+  // unrelated value.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long max()  { return LONG_MIN; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long min()  { return LONG_MAX; }
+  // Bitwise: all bits clear for OR, all bits set for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long bor()  { return 0L; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long band() { return ~0L; }
+  // Logical: 0 (false) for OR, 1 (true) for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long lor()  { return 0L; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long land() { return 1L; }
+};
+
+// Reduction identities for long long.
+template<>
+struct reduction_identity<long long> {
+  // Arithmetic: 0 for sum, 1 for product.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long sum()  { return 0LL; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long prod() { return 1LL; }
+  // max() starts from the lowest value, min() from the highest.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long max()  { return LLONG_MIN; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long min()  { return LLONG_MAX; }
+  // Bitwise: all bits clear for OR, all bits set for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long bor()  { return 0LL; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long band() { return ~0LL; }
+  // Logical: 0 (false) for OR, 1 (true) for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long lor()  { return 0LL; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long land() { return 1LL; }
+};
+
+// Reduction identities for unsigned char.
+template<>
+struct reduction_identity<unsigned char> {
+  // Arithmetic: 0 for sum, 1 for product.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char sum()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char prod() { return 1; }
+  // max() starts from the lowest value (0), min() from the highest.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char max()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char min()  { return UCHAR_MAX; }
+  // Bitwise: all bits clear for OR, all bits set for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char bor()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char band() { return static_cast<unsigned char>(~0); }
+  // Logical: 0 (false) for OR, 1 (true) for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char lor()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char land() { return 1; }
+};
+
+// Reduction identities for unsigned short.
+template<>
+struct reduction_identity<unsigned short> {
+  // Arithmetic: 0 for sum, 1 for product.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short sum()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short prod() { return 1; }
+  // max() starts from the lowest value (0), min() from the highest.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short max()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short min()  { return USHRT_MAX; }
+  // Bitwise: all bits clear for OR, all bits set for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short bor()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short band() { return static_cast<unsigned short>(~0); }
+  // Logical: 0 (false) for OR, 1 (true) for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short lor()  { return 0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short land() { return 1; }
+};
+
+// Reduction identities for unsigned int.
+template<>
+struct reduction_identity<unsigned int> {
+  // Arithmetic: 0 for sum, 1 for product.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int sum()  { return 0u; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int prod() { return 1u; }
+  // max() starts from the lowest value (0), min() from the highest.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int max()  { return 0u; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int min()  { return UINT_MAX; }
+  // Bitwise: all bits clear for OR, all bits set for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int bor()  { return 0u; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int band() { return ~0u; }
+  // Logical: 0 (false) for OR, 1 (true) for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int lor()  { return 0u; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int land() { return 1u; }
+};
+
+// Reduction identities for unsigned long.
+template<>
+struct reduction_identity<unsigned long> {
+  // Arithmetic: 0 for sum, 1 for product.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long sum()  { return 0ul; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long prod() { return 1ul; }
+  // max() starts from the lowest value (0), min() from the highest.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long max()  { return 0ul; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long min()  { return ULONG_MAX; }
+  // Bitwise: all bits clear for OR, all bits set for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long bor()  { return 0ul; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long band() { return ~0ul; }
+  // Logical: 0 (false) for OR, 1 (true) for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long lor()  { return 0ul; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long land() { return 1ul; }
+};
+
+// Reduction identities for unsigned long long.
+template<>
+struct reduction_identity<unsigned long long> {
+  // Arithmetic: 0 for sum, 1 for product.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long sum()  { return 0ull; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long prod() { return 1ull; }
+  // max() starts from the lowest value (0), min() from the highest.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long max()  { return 0ull; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long min()  { return ULLONG_MAX; }
+  // Bitwise: all bits clear for OR, all bits set for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long bor()  { return 0ull; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long band() { return ~0ull; }
+  // Logical: 0 (false) for OR, 1 (true) for AND.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long lor()  { return 0ull; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long land() { return 1ull; }
+};
+
+// Reduction identities for float. Only the arithmetic and min/max
+// identities are provided; there are no bitwise or logical ones.
+template<>
+struct reduction_identity<float> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum()  { return 0.0f; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() { return 1.0f; }
+  // The max identity must not exceed any input. FLT_MIN is the smallest
+  // *positive* normalized float, so the lowest finite value -FLT_MAX is
+  // returned instead; otherwise a max over negative data would be wrong.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max()  { return -FLT_MAX; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float min()  { return FLT_MAX; }
+};
+
+// Reduction identities for double. Only the arithmetic and min/max
+// identities are provided; there are no bitwise or logical ones.
+template<>
+struct reduction_identity<double> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum()  { return 0.0; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() { return 1.0; }
+  // The max identity must not exceed any input. DBL_MIN is the smallest
+  // *positive* normalized double, so the lowest finite value -DBL_MAX is
+  // returned instead; otherwise a max over negative data would be wrong.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max()  { return -DBL_MAX; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double min()  { return DBL_MAX; }
+};
+
+// Reduction identities for long double. Only the arithmetic and min/max
+// identities are provided; there are no bitwise or logical ones.
+template<>
+struct reduction_identity<long double> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum()  { return 0.0L; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() { return 1.0L; }
+  // The max identity must not exceed any input. LDBL_MIN is the smallest
+  // *positive* normalized long double, so the lowest finite value -LDBL_MAX
+  // is returned instead; otherwise a max over negative data would be wrong.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max()  { return -LDBL_MAX; }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min()  { return LDBL_MAX; }
+};
+
+}
+
+#endif
--- a/lib/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
@ -44,22 +44,23 @@
 #ifndef KOKKOS_OPENMP_HPP
 #define KOKKOS_OPENMP_HPP

-#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_OPENMP)

-#if defined( KOKKOS_ENABLE_OPENMP) && !defined(_OPENMP)
+#if !defined(_OPENMP)
 #error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
 #endif

-#if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP )
-
-#include <omp.h>
+#include <Kokkos_Core_fwd.hpp>

 #include <cstddef>
 #include <iosfwd>
 #include <Kokkos_HostSpace.hpp>
+
 #ifdef KOKKOS_ENABLE_HBWSPACE
 #include <Kokkos_HBWSpace.hpp>
 #endif
+
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
@ -79,26 +80,26 @@ public:
  //@{

  //! Tag this class as a kokkos execution space
-  typedef OpenMP                execution_space ;
+  using execution_space = OpenMP;
  #ifdef KOKKOS_ENABLE_HBWSPACE
-  typedef Experimental::HBWSpace memory_space ;
+  using memory_space = Experimental::HBWSpace;
  #else
-  typedef HostSpace             memory_space ;
+  using memory_space = HostSpace;
  #endif
  //! This execution space preferred device_type
-  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  using device_type = Kokkos::Device<execution_space,memory_space>;

-  typedef LayoutRight           array_layout ;
-  typedef memory_space::size_type  size_type ;
+  using array_layout = LayoutRight;
+  using size_type = memory_space::size_type;

-  typedef ScratchMemorySpace< OpenMP > scratch_memory_space ;
+  using scratch_memory_space = ScratchMemorySpace< OpenMP >;

  //@}
  //------------------------------------
  //! \name Functions that all Kokkos execution spaces must implement.
  //@{

-  inline static bool in_parallel() { return omp_in_parallel(); }
+  inline static bool in_parallel();

  /** \brief  Set the device in a "sleep" state. A noop for OpenMP.  */
  static bool sleep();
@ -153,6 +154,8 @@ public:

  KOKKOS_INLINE_FUNCTION static
  unsigned hardware_thread_id() { return thread_pool_rank(); }
+
+  static const char* name();
 };

 } // namespace Kokkos
@ -164,7 +167,7 @@ namespace Kokkos {
 namespace Impl {

 template<>
-struct MemorySpaceAccess 
+struct MemorySpaceAccess
  < Kokkos::OpenMP::memory_space
  , Kokkos::OpenMP::scratch_memory_space
  >
@ -191,7 +194,7 @@ struct VerifyExecutionCanAccessMemorySpace
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/

-#include <OpenMP/Kokkos_OpenMPexec.hpp>
+#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
 #include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
 #include <OpenMP/Kokkos_OpenMP_Task.hpp>

@ -201,4 +204,3 @@ struct VerifyExecutionCanAccessMemorySpace
 #endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP ) */
 #endif /* #ifndef KOKKOS_OPENMP_HPP */

-
--- a/lib/kokkos/core/src/Kokkos_OpenMPTarget.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMPTarget.hpp
@ -0,0 +1,186 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPTARGET_HPP
+#define KOKKOS_OPENMPTARGET_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( _OPENMP )
+
+#include <omp.h>
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_OpenMPTargetSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+/// \class OpenMPTarget
+/// \brief Kokkos execution space whose memory space is OpenMPTargetSpace.
+///
+/// Views created for this space default to LayoutLeft.
+class OpenMPTarget {
+public:
+  //------------------------------------
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! Tag this class as a kokkos execution space
+  using execution_space = OpenMPTarget;
+  using memory_space    = OpenMPTargetSpace;
+  //! Preferred device_type of this execution space
+  using device_type = Kokkos::Device<execution_space,memory_space>;
+
+  using array_layout = LayoutLeft;
+  using size_type    = memory_space::size_type;
+
+  using scratch_memory_space = ScratchMemorySpace< OpenMPTarget >;
+
+  //@}
+  //------------------------------------
+  //! \name Functions that all Kokkos execution spaces must implement.
+  //@{
+
+  /** \brief  True when omp_in_parallel() reports an active parallel region. */
+  static bool in_parallel() { return omp_in_parallel() != 0; }
+
+  /** \brief  Set the device in a "sleep" state. Described as a noop for
+   *          OpenMPTarget; the definition is not part of this header. */
+  static bool sleep();
+
+  /** \brief  Wake the device from the 'sleep' state. Described as a noop for
+   *          OpenMPTarget; the definition is not part of this header. */
+  static bool wake();
+
+  /** \brief  Wait until all dispatched functors complete.
+   *          The body is empty, so this is a noop for OpenMPTarget. */
+  static void fence() {}
+
+  /// \brief Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  /// \brief Free any resources being consumed by the device.
+  static void finalize();
+
+  /** \brief  Initialize the device.
+   *
+   *  1) If the hardware locality library is enabled and OpenMPTarget has not
+   *     already bound threads then bind OpenMPTarget threads to maximize
+   *     core utilization and group for memory hierarchy locality.
+   *
+   *  2) Allocate a HostThread for each OpenMPTarget thread to hold its
+   *     topology and fan in/out data.
+   *
+   *  NOTE(review): the two steps above read like a description of a host
+   *  thread pool; confirm they match the OpenMPTarget implementation.
+   */
+  static void initialize( unsigned thread_count = 0 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 );
+
+  static int is_initialized();
+
+  /** \brief  Return the maximum amount of concurrency.  */
+  static int concurrency();
+
+  //@}
+  //------------------------------------
+  /** \brief  This execution space has a topological thread pool which can be queried.
+   *
+   *  All threads within a pool have a common memory space for which they are cache coherent.
+   *    depth = 0  gives the number of threads in the whole pool.
+   *    depth = 1  gives the number of threads in a NUMA region, typically sharing L3 cache.
+   *    depth = 2  gives the number of threads at the finest granularity, typically sharing L1 cache.
+   */
+  static inline int thread_pool_size( int depth = 0 );
+
+  /** \brief  The rank of the executing thread in this thread pool */
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
+
+  //------------------------------------
+
+  /** \brief  Size of the whole pool, i.e. thread_pool_size(0). */
+  static unsigned max_hardware_threads()
+    { return static_cast<unsigned>( thread_pool_size( 0 ) ); }
+
+  /** \brief  Same value as thread_pool_rank(), returned as unsigned. */
+  KOKKOS_INLINE_FUNCTION static
+  unsigned hardware_thread_id()
+    { return static_cast<unsigned>( thread_pool_rank() ); }
+
+  static const char* name();
+private:
+  static bool m_is_initialized;
+};
+} // namespace Experimental
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+/// Reports value = true for the pair (OpenMPTarget::memory_space,
+/// OpenMPTarget::scratch_memory_space).
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::Experimental::OpenMPTarget::memory_space
+  , Kokkos::Experimental::OpenMPTarget::scratch_memory_space
+  >
+{
+  enum { value = true };
+
+  // Both hooks are empty: nothing is checked.
+  static void verify() {}
+  static void verify( const void * ) {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( _OPENMP ) */
+#endif /* #ifndef KOKKOS_OPENMPTARGET_HPP */
+
+
--- a/lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
@ -0,0 +1,265 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPTARGETSPACE_HPP
+#define KOKKOS_OPENMPTARGETSPACE_HPP
+
+#include <cstring>
+#include <string>
+#include <iosfwd>
+#include <typeinfo>
+
+#include <Kokkos_Core_fwd.hpp>
+
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+
+#include <Kokkos_HostSpace.hpp>
+#include <omp.h>
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+/// \brief Initialize lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function initializes the locks to zero (unset).
+//void init_lock_array_host_space();
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+//bool lock_address_host_space(void* ptr);
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address.
+//void unlock_address_host_space(void* ptr);
+
+} // namespace Impl
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+
+/// \class OpenMPTargetSpace
+/// \brief Memory space paired with the OpenMPTarget execution space.
+///
+/// The DeepCopy specializations in this header address memory of this space
+/// through omp_get_default_device(), while HostSpace memory is addressed
+/// through omp_get_initial_device().
+class OpenMPTargetSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  using memory_space = OpenMPTargetSpace;
+  using size_type    = size_t;
+
+  /// \typedef execution_space
+  /// \brief Default execution space for this memory space.
+  ///
+  /// Every memory space has a default execution space.  This is
+  /// useful for things like initializing a View (which happens in
+  /// parallel using the View's default execution space).
+  using execution_space = Kokkos::Experimental::OpenMPTarget;
+
+  //! Preferred device_type of this memory space
+  using device_type = Kokkos::Device<execution_space,memory_space>;
+
+  /*--------------------------------*/
+
+  /**\brief  Default memory space instance */
+  OpenMPTargetSpace();
+
+  // Copy/move construction, assignment and destruction are compiler-generated.
+  OpenMPTargetSpace( OpenMPTargetSpace && rhs ) = default ;
+  OpenMPTargetSpace( const OpenMPTargetSpace & rhs ) = default ;
+  OpenMPTargetSpace & operator = ( OpenMPTargetSpace && ) = default ;
+  OpenMPTargetSpace & operator = ( const OpenMPTargetSpace & ) = default ;
+  ~OpenMPTargetSpace() = default ;
+
+  /**\brief  Allocate untracked memory in the space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate untracked memory in the space */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const ;
+
+private:
+
+  // The allocation record specialization for this space is a friend.
+  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > ;
+};
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/// Allocation record specialization for OpenMPTargetSpace.
+/// Only declarations live here; the definitions are not part of this header.
+template<>
+class SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  friend Kokkos::Experimental::OpenMPTargetSpace ;
+
+  using RecordBase = SharedAllocationRecord< void , void >;
+
+  // Records cannot be copied.
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  // Default value of the constructor's arg_dealloc parameter.
+  static void deallocate( RecordBase * );
+
+  /**\brief  Root record for tracked allocations from this OpenMPTargetSpace instance */
+  static RecordBase s_root_record ;
+
+  const Kokkos::Experimental::OpenMPTargetSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() = default ;
+
+  SharedAllocationRecord( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
+                        , const std::string                             & arg_label
+                        , const size_t                                    arg_alloc_size
+                        , const RecordBase::function_type                 arg_dealloc = &deallocate
+                        );
+
+public:
+
+  std::string get_label() const;
+
+  KOKKOS_INLINE_FUNCTION static
+  SharedAllocationRecord * allocate( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
+                                   , const std::string                             & arg_label
+                                   , const size_t                                    arg_alloc_size
+                                   );
+
+  /**\brief  Allocate tracked memory in the space */
+  static void * allocate_tracked( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
+                                , const std::string & arg_label
+                                , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space */
+  static void * reallocate_tracked( void * const arg_alloc_ptr
+                                  , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static void deallocate_tracked( void * const arg_alloc_ptr );
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream & , const Kokkos::Experimental::OpenMPTargetSpace & , bool detail = false );
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//TODO: implement all possible deep_copies
+/// Copies n bytes from src to dst with omp_target_memcpy; both pointers are
+/// addressed on omp_get_default_device().
+// NOTE(review): the status returned by omp_target_memcpy is not checked.
+template<class ExecutionSpace>
+struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,Kokkos::Experimental::OpenMPTargetSpace,ExecutionSpace> {
+  DeepCopy( void * dst , const void * src , size_t n ) {
+    const int dst_device = omp_get_default_device();
+    const int src_device = omp_get_default_device();
+    omp_target_memcpy( dst , const_cast<void*>( src ) , n , 0 , 0 , dst_device , src_device );
+  }
+
+  // Same copy, issued after exec.fence() has returned.
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+    exec.fence();
+    const int dst_device = omp_get_default_device();
+    const int src_device = omp_get_default_device();
+    omp_target_memcpy( dst , const_cast<void*>( src ) , n , 0 , 0 , dst_device , src_device );
+  }
+};
+
+
+/// Copies n bytes from a HostSpace source to an OpenMPTargetSpace destination
+/// with omp_target_memcpy: dst is addressed on omp_get_default_device(), src
+/// on omp_get_initial_device().
+// NOTE(review): the status returned by omp_target_memcpy is not checked.
+template<class ExecutionSpace>
+struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,HostSpace,ExecutionSpace> {
+  DeepCopy( void * dst , const void * src , size_t n ) {
+    const int dst_device = omp_get_default_device();
+    const int src_device = omp_get_initial_device();
+    omp_target_memcpy( dst , const_cast<void*>( src ) , n , 0 , 0 , dst_device , src_device );
+  }
+
+  // Same copy, issued after exec.fence() has returned.
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+    exec.fence();
+    const int dst_device = omp_get_default_device();
+    const int src_device = omp_get_initial_device();
+    omp_target_memcpy( dst , const_cast<void*>( src ) , n , 0 , 0 , dst_device , src_device );
+  }
+};
+
+/// Copies n bytes from an OpenMPTargetSpace source to a HostSpace destination
+/// with omp_target_memcpy: dst is addressed on omp_get_initial_device(), src
+/// on omp_get_default_device().
+// NOTE(review): the status returned by omp_target_memcpy is not checked.
+template<class ExecutionSpace>
+struct DeepCopy<HostSpace,Kokkos::Experimental::OpenMPTargetSpace,ExecutionSpace> {
+  DeepCopy( void * dst , const void * src , size_t n ) {
+    const int dst_device = omp_get_initial_device();
+    const int src_device = omp_get_default_device();
+    omp_target_memcpy( dst , const_cast<void*>( src ) , n , 0 , 0 , dst_device , src_device );
+  }
+
+  // Same copy, issued after exec.fence() has returned.
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+    exec.fence();
+    const int dst_device = omp_get_initial_device();
+    const int src_device = omp_get_default_device();
+    omp_target_memcpy( dst , const_cast<void*>( src ) , n , 0 , 0 , dst_device , src_device );
+  }
+};
+
+
+/// Reports value = false for the pair (HostSpace, OpenMPTargetSpace).
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::OpenMPTargetSpace >
+{
+  enum { value = false };
+
+  // Both hooks are empty: nothing is checked at run time.
+  static void verify() {}
+  static void verify( const void * ) {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif
+#endif /* #define KOKKOS_OPENMPTARGETSPACE_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
--- a/lib/kokkos/core/src/Kokkos_Qthreads.hpp
+++ b/lib/kokkos/core/src/Kokkos_Qthreads.hpp
@ -44,9 +44,10 @@
 #ifndef KOKKOS_QTHREADS_HPP
 #define KOKKOS_QTHREADS_HPP

-#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS )

-#ifdef KOKKOS_ENABLE_QTHREADS
+#include <Kokkos_Core_fwd.hpp>

 // Defines to enable experimental Qthreads functionality.
 #define QTHREAD_LOCAL_PRIORITY
@ -150,6 +151,8 @@ public:

  int shepherd_size() const;
  int shepherd_worker_size() const;
+
+  static const char* name();
 };

 } // namespace Kokkos
@ -194,5 +197,5 @@ struct VerifyExecutionCanAccessMemorySpace
 //#include <Qthreads/Kokkos_Qthreads_TaskQueue.hpp> // Uncomment when Tasking working.

 #endif // #define KOKKOS_ENABLE_QTHREADS
-
 #endif // #define KOKKOS_QTHREADS_HPP
+
--- a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -44,9 +44,9 @@
 #ifndef KOKKOS_SCRATCHSPACE_HPP
 #define KOKKOS_SCRATCHSPACE_HPP

-#include <stdio.h>
+#include <cstdio>
 #include <Kokkos_Core_fwd.hpp>
-#include <impl/Kokkos_Tags.hpp>
+#include <Kokkos_Concepts.hpp>

 /*--------------------------------------------------------------------------*/

@ -57,7 +57,7 @@ namespace Kokkos {
 */
 template< class ExecSpace >
 class ScratchMemorySpace {
-  static_assert (Impl::is_execution_space<ExecSpace>::value,"Instantiating ScratchMemorySpace on non-execution-space type.");
+  static_assert (is_execution_space<ExecSpace>::value,"Instantiating ScratchMemorySpace on non-execution-space type.");
 public:

  // Alignment of memory chunks returned by 'get'
@ -161,6 +161,3 @@ public:

 #endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */

-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
--- a/lib/kokkos/core/src/Kokkos_Serial.hpp
+++ b/lib/kokkos/core/src/Kokkos_Serial.hpp
@ -47,6 +47,9 @@
 #ifndef KOKKOS_SERIAL_HPP
 #define KOKKOS_SERIAL_HPP

+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_SERIAL )
+
 #include <cstddef>
 #include <iosfwd>
 #include <Kokkos_Parallel.hpp>
@ -63,7 +66,6 @@

 #include <KokkosExp_MDRangePolicy.hpp>

-#if defined( KOKKOS_ENABLE_SERIAL )

 namespace Kokkos {

@ -163,6 +165,7 @@ public:
  KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }

+  static const char* name();
  //--------------------------------------------------------------------------
 };

@ -515,7 +518,7 @@ public:
    : m_functor( arg_functor )
    , m_policy(  arg_policy )
    , m_reducer( reducer )
-    , m_result_ptr(  reducer.result_view().data() )
+    , m_result_ptr(  reducer.view().data() )
    {
      /*static_assert( std::is_same< typename ViewType::memory_space
                                      , Kokkos::HostSpace >::value
@ -798,7 +801,7 @@ public:
    : m_functor( arg_functor )
    , m_league(  arg_policy.league_size() )
    , m_reducer( reducer )
-    , m_result_ptr(  reducer.result_view().data() )
+    , m_result_ptr(  reducer.view().data() )
    , m_shared( arg_policy.scratch_size(0) +
                arg_policy.scratch_size(1) +
                FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
@ -821,5 +824,3 @@ public:
 #endif // defined( KOKKOS_ENABLE_SERIAL )
 #endif /* #define KOKKOS_SERIAL_HPP */

-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
--- a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
+++ b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
@ -46,25 +46,10 @@

 //----------------------------------------------------------------------------

-#include <Kokkos_Core_fwd.hpp>
-
-// If compiling with CUDA then must be using CUDA 8 or better
-// and use relocateable device code to enable the task policy.
-// nvcc relocatable device code option: --relocatable-device-code=true
-
-#if ( defined( KOKKOS_ENABLE_CUDA ) )
-  #if ( 8000 <= CUDA_VERSION ) && \
-      defined( KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE )
-
-  #define KOKKOS_ENABLE_TASKDAG
-
-  #endif
-#else
-  #define KOKKOS_ENABLE_TASKDAG
-#endif
-
+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ENABLE_TASKDAG )

+#include <Kokkos_Core_fwd.hpp>
 //----------------------------------------------------------------------------

 #include <Kokkos_MemoryPool.hpp>
@ -371,7 +356,7 @@ struct TaskPolicyData
  TaskPolicyData & operator = ( TaskPolicyData const & ) = default ;

  KOKKOS_INLINE_FUNCTION
-  TaskPolicyData( DepFutureType             && arg_future
+  TaskPolicyData( DepFutureType        const & arg_future
                , Kokkos::TaskPriority const & arg_priority )
    : m_scheduler( 0 )
    , m_dependence( arg_future )
@ -385,6 +370,15 @@ struct TaskPolicyData
    , m_dependence()
    , m_priority( static_cast<int>( arg_priority ) )
    {}
+
+  KOKKOS_INLINE_FUNCTION  // Constructor taking an explicit scheduler together with a dependence future and a priority.
+  TaskPolicyData( scheduler_type       const & arg_scheduler
+                , DepFutureType        const & arg_future
+                , Kokkos::TaskPriority const & arg_priority )
+    : m_scheduler( & arg_scheduler )  // Stores the address of arg_scheduler, not a copy of it.
+    , m_dependence( arg_future )  // Copy-initializes the dependence from arg_future.
+    , m_priority( static_cast<int>( arg_priority ) )  // The priority enum is stored as its int value.
+    {}
 };

 } // namespace Impl
@ -413,6 +407,7 @@ public:

  using execution_space  = ExecSpace ;
  using memory_space     = typename queue_type::memory_space ;
+  using memory_pool      = typename queue_type::memory_pool ;
  using member_type      =
    typename Kokkos::Impl::TaskQueueSpecialization< ExecSpace >::member_type ;

@ -431,9 +426,7 @@ public:
  KOKKOS_INLINE_FUNCTION
  TaskScheduler & operator = ( TaskScheduler const & rhs ) = default ;

-  TaskScheduler( memory_space const & arg_memory_space
-               , unsigned const arg_memory_pool_capacity
-               , unsigned const arg_memory_pool_log2_superblock = 12 )
+  TaskScheduler( memory_pool const & arg_memory_pool )
    : m_track()
    , m_queue(0)
    {
@ -442,21 +435,37 @@ public:
          record_type ;

      record_type * record =
-        record_type::allocate( arg_memory_space
+        record_type::allocate( memory_space()
                             , "TaskQueue"
                             , sizeof(queue_type)
                             );

-      m_queue = new( record->data() )
-        queue_type( arg_memory_space
-                  , arg_memory_pool_capacity
-                  , arg_memory_pool_log2_superblock );
+      m_queue = new( record->data() ) queue_type( arg_memory_pool );

      record->m_destroy.m_queue = m_queue ;

      m_track.assign_allocated_record_to_uninitialized( record );
    }

+  TaskScheduler( memory_space const & arg_memory_space  // Convenience constructor: builds a memory_pool from these arguments and delegates.
+               , size_t const mempool_capacity
+               , unsigned const mempool_min_block_size  // = 1u << 6
+               , unsigned const mempool_max_block_size  // = 1u << 10
+               , unsigned const mempool_superblock_size // = 1u << 12
+               )  // NOTE(review): the "= 1u << k" notes above look like disabled default values; all five arguments are required as written.
+    : TaskScheduler( memory_pool( arg_memory_space  // Delegates to the TaskScheduler( memory_pool const & ) constructor with a temporary pool.
+                                , mempool_capacity
+                                , mempool_min_block_size
+                                , mempool_max_block_size
+                                , mempool_superblock_size ) )
+    {}
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION  // Returns a pointer to the queue's memory pool, or a null pointer when this scheduler has no queue.
+  memory_pool * memory() const noexcept
+    { return m_queue ? &( m_queue->m_memory ) : (memory_pool*) 0 ; }  // m_memory is accessed as an object elsewhere in this class, so its address is taken to match the pointer return type.
+
  //----------------------------------------
  /**\brief  Allocation size for a spawned task */
  template< typename FunctorType >
@ -502,7 +511,12 @@ public:
          : (queue_type*) 0 );

      if ( 0 == queue ) {
-        Kokkos::abort("Kokkos spawn given null Future" );
+        Kokkos::abort("Kokkos spawn requires scheduler or non-null Future");
+      }
+
+      if ( arg_policy.m_dependence.m_task != 0 &&
+           arg_policy.m_dependence.m_task->m_queue != queue ) {
+        Kokkos::abort("Kokkos spawn given incompatible scheduler and Future");
      }

      //----------------------------------------
@ -641,7 +655,7 @@ public:

  KOKKOS_INLINE_FUNCTION
  int allocation_capacity() const noexcept
-    { return m_queue->m_memory.get_mem_size(); }
+    { return m_queue->m_memory.capacity(); }

  KOKKOS_INLINE_FUNCTION
  int allocated_task_count() const noexcept
@ -696,6 +710,22 @@ TaskTeam( T            const & arg
      >( arg , arg_priority );
 }

+template< typename E , typename F >  // Builds a TaskTeam execution policy from an explicit scheduler plus a dependence future.
+Kokkos::Impl::
+  TaskPolicyData< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam , F >
+KOKKOS_INLINE_FUNCTION
+TaskTeam( TaskScheduler<E> const & arg_scheduler
+        , F                const & arg_future
+        , typename std::enable_if< Kokkos::is_future<F>::value ,  // Overload participates only when Kokkos::is_future<F>::value is true.
+            TaskPriority >::type const & arg_priority = TaskPriority::Regular  // Priority defaults to TaskPriority::Regular.
+        )
+{
+  return  // Forwards all three arguments to the matching TaskPolicyData constructor.
+    Kokkos::Impl::TaskPolicyData
+      < Kokkos::Impl::TaskBase<void,void,void>::TaskTeam , F >
+        ( arg_scheduler , arg_future , arg_priority );
+}
+
 // Construct a TaskSingle execution policy

 template< typename T >
@ -721,6 +751,22 @@ TaskSingle( T            const & arg
      >( arg , arg_priority );
 }

+template< typename E , typename F >  // Builds a TaskSingle execution policy from an explicit scheduler plus a dependence future.
+Kokkos::Impl::
+  TaskPolicyData< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle , F >
+KOKKOS_INLINE_FUNCTION
+TaskSingle( TaskScheduler<E> const & arg_scheduler
+          , F                const & arg_future
+          , typename std::enable_if< Kokkos::is_future<F>::value ,  // Overload participates only when Kokkos::is_future<F>::value is true.
+              TaskPriority >::type const & arg_priority = TaskPriority::Regular  // Priority defaults to TaskPriority::Regular.
+          )
+{
+  return  // Forwards all three arguments to the matching TaskPolicyData constructor.
+    Kokkos::Impl::TaskPolicyData
+      < Kokkos::Impl::TaskBase<void,void,void>::TaskSingle , F >
+        ( arg_scheduler , arg_future , arg_priority );
+}
+
 //----------------------------------------------------------------------------

 /**\brief  A host control thread spawns a task with options
@ -849,3 +895,4 @@ void wait( TaskScheduler< ExecSpace > const & scheduler )

 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
 #endif /* #ifndef KOKKOS_TASKSCHEDULER_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_Threads.hpp
+++ b/lib/kokkos/core/src/Kokkos_Threads.hpp
@ -44,9 +44,10 @@
 #ifndef KOKKOS_THREADS_HPP
 #define KOKKOS_THREADS_HPP

-#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_THREADS )

-#if defined( KOKKOS_ENABLE_PTHREAD )
+#include <Kokkos_Core_fwd.hpp>

 #include <cstddef>
 #include <iosfwd>
@ -178,6 +179,7 @@ public:
  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
  KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }

+  static const char* name();
  //@}
  //----------------------------------------
 };
@ -190,7 +192,7 @@ namespace Kokkos {
 namespace Impl {

 template<>
-struct MemorySpaceAccess 
+struct MemorySpaceAccess
  < Kokkos::Threads::memory_space
  , Kokkos::Threads::scratch_memory_space
  >
@ -227,6 +229,6 @@ struct VerifyExecutionCanAccessMemorySpace
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------

-#endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) */
+#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */
 #endif /* #define KOKKOS_THREADS_HPP */

--- a/lib/kokkos/core/src/Kokkos_Timer.hpp
+++ b/lib/kokkos/core/src/Kokkos_Timer.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -44,7 +44,7 @@
 #ifndef KOKKOS_TIMER_HPP
 #define KOKKOS_TIMER_HPP

-#include <stddef.h>
+#include <cstddef>

 #ifdef _MSC_VER
 #undef KOKKOS_ENABLE_LIBRT
@ -110,3 +110,4 @@ public:
 } // namespace Kokkos

 #endif /* #ifndef KOKKOS_TIMER_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_Vectorization.hpp
+++ b/lib/kokkos/core/src/Kokkos_Vectorization.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -51,3 +51,4 @@
 #endif

 #endif
+
--- a/lib/kokkos/core/src/Kokkos_View.hpp
+++ b/lib/kokkos/core/src/Kokkos_View.hpp
@ -625,7 +625,7 @@ private:

 #define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( ARG ) \
  View::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); \
-  Kokkos::Impl::view_verify_operator_bounds ARG ;
+  Kokkos::Impl::view_verify_operator_bounds< typename traits::memory_space > ARG ;

 #else

@ -646,12 +646,7 @@ public:
                          ), reference_type >::type
  operator()( Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,args...) )
      return m_map.reference();
    }

@ -669,12 +664,7 @@ public:
  operator()( const I0 & i0
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
      return m_map.reference(i0);
    }

@ -690,13 +680,7 @@ public:
  operator()( const I0 & i0
            , Args ... args ) const
    {
-
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
      return m_map.m_handle[ i0 ];
    }

@ -712,12 +696,7 @@ public:
  operator()( const I0 & i0
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
      return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
    }

@ -733,12 +712,7 @@ public:
    ), reference_type >::type
  operator[]( const I0 & i0 ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
      return m_map.reference(i0);
    }

@ -752,12 +726,7 @@ public:
    ), reference_type >::type
  operator[]( const I0 & i0 ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
      return m_map.m_handle[ i0 ];
    }

@ -771,12 +740,7 @@ public:
    ), reference_type >::type
  operator[]( const I0 & i0 ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
      return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
    }

@ -794,12 +758,7 @@ public:
  operator()( const I0 & i0 , const I1 & i1
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
      return m_map.reference(i0,i1);
    }

@ -815,12 +774,7 @@ public:
  operator()( const I0 & i0 , const I1 & i1
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
      return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ];
    }

@ -836,12 +790,7 @@ public:
  operator()( const I0 & i0 , const I1 & i1
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
      return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ];
    }

@ -857,12 +806,7 @@ public:
  operator()( const I0 & i0 , const I1 & i1
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
      return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ];
    }

@ -878,12 +822,7 @@ public:
  operator()( const I0 & i0 , const I1 & i1
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
      return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ];
    }

@ -899,12 +838,7 @@ public:
  operator()( const I0 & i0 , const I1 & i1
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
      return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 +
                             i1 * m_map.m_offset.m_stride.S1 ];
    }
@ -923,12 +857,7 @@ public:
  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,args...) )
      return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ];
    }

@ -943,12 +872,7 @@ public:
  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,args...) )
      return m_map.reference(i0,i1,i2);
    }

@ -966,12 +890,7 @@ public:
  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,args...) )
      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ];
    }

@ -986,12 +905,7 @@ public:
  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,args...) )
      return m_map.reference(i0,i1,i2,i3);
    }

@ -1011,12 +925,7 @@ public:
            , const I4 & i4
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,args...) )
      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ];
    }

@ -1033,12 +942,7 @@ public:
            , const I4 & i4
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,args...) )
      return m_map.reference(i0,i1,i2,i3,i4);
    }

@ -1058,12 +962,7 @@ public:
            , const I4 & i4 , const I5 & i5
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,args...) )
      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ];
    }

@ -1080,12 +979,7 @@ public:
            , const I4 & i4 , const I5 & i5
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,args...) )
      return m_map.reference(i0,i1,i2,i3,i4,i5);
    }

@ -1105,12 +999,7 @@ public:
            , const I4 & i4 , const I5 & i5 , const I6 & i6
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ];
    }

@ -1127,12 +1016,7 @@ public:
            , const I4 & i4 , const I5 & i5 , const I6 & i6
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
      return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
    }

@ -1152,12 +1036,7 @@ public:
            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ];
    }

@ -1174,12 +1053,7 @@ public:
            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
            , Args ... args ) const
    {
-      #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
-      #else
-        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
-      #endif
-
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
      return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7);
    }

--- a/lib/kokkos/core/src/Makefile
+++ b/lib/kokkos/core/src/Makefile
@ -9,14 +9,14 @@ default: messages build-lib
 	echo "End Build"

 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-  CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
 else
  CXX = g++
 endif

 CXXFLAGS = -O3
 LINK ?= $(CXX)
-LDFLAGS ?= 
+LDFLAGS ?=

 include $(KOKKOS_PATH)/Makefile.kokkos

@ -66,7 +66,7 @@ else
  KOKKOS_DEBUG_CMAKE = ON
 endif

-messages: 
+messages:
 	echo "Start Build"

 build-makefile-kokkos:
@ -80,13 +80,13 @@ build-makefile-kokkos:
 	echo "KOKKOS_CXX_STANDARD = $(KOKKOS_CXX_STANDARD)" >> Makefile.kokkos
 	echo "KOKKOS_OPTIONS = $(KOKKOS_OPTIONS)" >> Makefile.kokkos
 	echo "KOKKOS_CUDA_OPTIONS = $(KOKKOS_CUDA_OPTIONS)" >> Makefile.kokkos
-	echo "CXX ?= $(CXX)" >> Makefile.kokkos 
+	echo "CXX ?= $(CXX)" >> Makefile.kokkos
 	echo "NVCC_WRAPPER ?= $(PREFIX)/bin/nvcc_wrapper" >> Makefile.kokkos
-	echo "" >> Makefile.kokkos  
+	echo "" >> Makefile.kokkos
 	echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> Makefile.kokkos
 	echo "KOKKOS_HEADERS = $(KOKKOS_HEADERS)" >> Makefile.kokkos
 	echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
-	echo "" >> Makefile.kokkos  
+	echo "" >> Makefile.kokkos
 	echo "#Variables used in application Makefiles" >> Makefile.kokkos
 	echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
 	echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
@ -133,11 +133,11 @@ build-cmake-kokkos:
 	echo "else()" >> kokkos.cmake
 	echo '  set(NVCC_WRAPPER $$ENV{NVCC_WRAPPER} CACHE FILEPATH "Path to command nvcc_wrapper")' >> kokkos.cmake
 	echo "endif()" >> kokkos.cmake
-	echo "" >> kokkos.cmake  
+	echo "" >> kokkos.cmake
 	echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> kokkos.cmake
 	echo "set(KOKKOS_HEADERS \"$(KOKKOS_HEADERS)\" CACHE STRING \"Kokkos headers list\")" >> kokkos.cmake
 	echo "set(KOKKOS_SRC \"$(KOKKOS_SRC)\" CACHE STRING \"Kokkos source list\")" >> kokkos.cmake
-	echo "" >> kokkos.cmake  
+	echo "" >> kokkos.cmake
 	echo "#Variables used in application Makefiles" >> kokkos.cmake
 	echo "set(KOKKOS_CPP_DEPENDS \"$(KOKKOS_CPP_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
 	echo "set(KOKKOS_CXXFLAGS \"$(KOKKOS_CXXFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
@ -164,7 +164,7 @@ build-cmake-kokkos:

 build-lib: build-makefile-kokkos build-cmake-kokkos $(KOKKOS_LINK_DEPENDS)

-mkdir: 
+mkdir:
 	mkdir -p $(PREFIX)
 	mkdir -p $(PREFIX)/bin
 	mkdir -p $(PREFIX)/include
@ -187,7 +187,7 @@ copy-openmp: mkdir
 	mkdir -p $(PREFIX)/include/OpenMP
 	cp $(COPY_FLAG) $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP

-install: mkdir $(CONDITIONAL_COPIES) build-lib 
+install: mkdir $(CONDITIONAL_COPIES) build-lib
 	cp $(COPY_FLAG) $(NVCC_WRAPPER) $(PREFIX)/bin
 	cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
 	cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
@ -41,7 +41,10 @@
 //@HEADER
 */

-#include <stdio.h>
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_OPENMP )
+
+#include <cstdio>
 #include <limits>
 #include <iostream>
 #include <vector>
@ -51,7 +54,6 @@
 #include <impl/Kokkos_CPUDiscovery.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>

-#ifdef KOKKOS_ENABLE_OPENMP

 namespace Kokkos {
 namespace Impl {
@ -82,13 +84,13 @@ bool s_using_hwloc = false;
 namespace Kokkos {
 namespace Impl {

-int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
+int OpenMPExec::m_map_rank[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };

-int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
+int OpenMPExec::m_pool_topo[ 4 ] = { 0 };

-HostThreadTeamData * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
+HostThreadTeamData * OpenMPExec::m_pool[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };

-void OpenMPexec::verify_is_process( const char * const label )
+void OpenMPExec::verify_is_process( const char * const label )
 {
  if ( omp_in_parallel() ) {
    std::string msg( label );
@ -97,7 +99,7 @@ void OpenMPexec::verify_is_process( const char * const label )
  }
 }

-void OpenMPexec::verify_initialized( const char * const label )
+void OpenMPExec::verify_initialized( const char * const label )
 {
  if ( 0 == m_pool[0] ) {
    std::string msg( label );
@ -122,7 +124,7 @@ void OpenMPexec::verify_initialized( const char * const label )
 namespace Kokkos {
 namespace Impl {

-void OpenMPexec::clear_thread_data()
+void OpenMPExec::clear_thread_data()
 {
  const size_t member_bytes =
    sizeof(int64_t) *
@ -149,7 +151,7 @@ void OpenMPexec::clear_thread_data()
 /* END #pragma omp parallel */
 }

-void OpenMPexec::resize_thread_data( size_t pool_reduce_bytes
+void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
                                   , size_t team_reduce_bytes
                                   , size_t team_shared_bytes
                                   , size_t thread_local_bytes )
@ -231,7 +233,7 @@ namespace Kokkos {
 //----------------------------------------------------------------------------

 int OpenMP::is_initialized()
-{ return 0 != Impl::OpenMPexec::m_pool[0]; }
+{ return 0 != Impl::OpenMPExec::m_pool[0]; }

 void OpenMP::initialize( unsigned thread_count ,
                         unsigned use_numa_count ,
@ -257,7 +259,7 @@ void OpenMP::initialize( unsigned thread_count ,

  static int omp_max_threads = nthreads;

-  const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
+  const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;

  bool thread_spawn_failed = false ;

@ -270,7 +272,7 @@ void OpenMP::initialize( unsigned thread_count ,
                            ( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
                            ( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );

-    std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ];
+    std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPExec::MAX_THREAD_COUNT ];

    // If hwloc available then use it's maximum value.

@ -316,16 +318,16 @@ void OpenMP::initialize( unsigned thread_count ,
                                   ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
                                   : omp_rank ;

-        Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ;
+        Impl::OpenMPExec::m_map_rank[ omp_rank ] = thread_r ;
      }
 /* END #pragma omp critical */
    }
 /* END #pragma omp parallel */

    if ( ! thread_spawn_failed ) {
-      Impl::OpenMPexec::m_pool_topo[0] = thread_count ;
-      Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
-      Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
+      Impl::OpenMPExec::m_pool_topo[0] = thread_count ;
+      Impl::OpenMPExec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
+      Impl::OpenMPExec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;

      // New, unified host thread team data:
      {
@ -334,7 +336,7 @@ void OpenMP::initialize( unsigned thread_count ,
        size_t team_shared_bytes  = 1024 * thread_count ;
        size_t thread_local_bytes = 1024 ;

-        Impl::OpenMPexec::resize_thread_data( pool_reduce_bytes
+        Impl::OpenMPExec::resize_thread_data( pool_reduce_bytes
                                            , team_reduce_bytes
                                            , team_shared_bytes
                                            , thread_local_bytes
@ -371,15 +373,15 @@ void OpenMP::initialize( unsigned thread_count ,

 void OpenMP::finalize()
 {
-  Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
-  Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
+  Impl::OpenMPExec::verify_initialized( "OpenMP::finalize" );
+  Impl::OpenMPExec::verify_is_process( "OpenMP::finalize" );

  // New, unified host thread team data:
-  Impl::OpenMPexec::clear_thread_data();
+  Impl::OpenMPExec::clear_thread_data();

-  Impl::OpenMPexec::m_pool_topo[0] = 0 ;
-  Impl::OpenMPexec::m_pool_topo[1] = 0 ;
-  Impl::OpenMPexec::m_pool_topo[2] = 0 ;
+  Impl::OpenMPExec::m_pool_topo[0] = 0 ;
+  Impl::OpenMPExec::m_pool_topo[1] = 0 ;
+  Impl::OpenMPExec::m_pool_topo[2] = 0 ;

  omp_set_num_threads(1);

@ -396,7 +398,7 @@ void OpenMP::finalize()

 void OpenMP::print_configuration( std::ostream & s , const bool detail )
 {
-  Impl::OpenMPexec::verify_is_process( "OpenMP::print_configuration" );
+  Impl::OpenMPExec::verify_is_process( "OpenMP::print_configuration" );

  s << "Kokkos::OpenMP" ;

@ -414,12 +416,12 @@ void OpenMP::print_configuration( std::ostream & s , const bool detail )
    ;
 #endif

-  const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
+  const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;

  if ( is_initialized ) {
-    const int numa_count      = Kokkos::Impl::OpenMPexec::m_pool_topo[0] / Kokkos::Impl::OpenMPexec::m_pool_topo[1] ;
-    const int core_per_numa   = Kokkos::Impl::OpenMPexec::m_pool_topo[1] / Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
-    const int thread_per_core = Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
+    const int numa_count      = Kokkos::Impl::OpenMPExec::m_pool_topo[0] / Kokkos::Impl::OpenMPExec::m_pool_topo[1] ;
+    const int core_per_numa   = Kokkos::Impl::OpenMPExec::m_pool_topo[1] / Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
+    const int thread_per_core = Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;

    s << " thread_pool_topology[ " << numa_count
      << " x " << core_per_numa
@ -428,7 +430,7 @@ void OpenMP::print_configuration( std::ostream & s , const bool detail )
      << std::endl ;

    if ( detail ) {
-      std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPexec::m_pool_topo[0] );
+      std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPExec::m_pool_topo[0] );

 #pragma omp parallel
      {
@ -442,7 +444,7 @@ void OpenMP::print_configuration( std::ostream & s , const bool detail )

      for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
        s << "  thread omp_rank[" << i << "]"
-          << " kokkos_rank[" << Impl::OpenMPexec::m_map_rank[ i ] << "]"
+          << " kokkos_rank[" << Impl::OpenMPExec::m_map_rank[ i ] << "]"
          << " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
          << std::endl ;
      }
@ -457,6 +459,11 @@ int OpenMP::concurrency() {
  return thread_pool_size(0);
 }

+const char* OpenMP::name() { return "OpenMP"; } // string literal: constant, never freed by the caller
+
 } // namespace Kokkos

+#else
+void KOKKOS_CORE_SRC_OPENMP_EXEC_PREVENT_LINK_ERROR() {}
 #endif //KOKKOS_ENABLE_OPENMP
+
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
@ -44,6 +44,9 @@
 #ifndef KOKKOS_OPENMPEXEC_HPP
 #define KOKKOS_OPENMPEXEC_HPP

+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_OPENMP )
+
 #include <Kokkos_OpenMP.hpp>

 #include <impl/Kokkos_Traits.hpp>
@ -66,7 +69,7 @@ namespace Impl {
 //----------------------------------------------------------------------------
 /** \brief  Data for OpenMP thread execution */

-class OpenMPexec {
+class OpenMPExec {
 public:

  friend class Kokkos::OpenMP ;
@ -324,17 +327,21 @@ public:

 namespace Kokkos {

+// Report whether the caller is inside an active OpenMP parallel region.
+inline bool OpenMP::in_parallel()
+{ return 0 != omp_in_parallel(); }
+
 inline
 int OpenMP::thread_pool_size( int depth )
 {
-  return Impl::OpenMPexec::pool_size(depth);
+  return Impl::OpenMPExec::pool_size(depth);
 }

 KOKKOS_INLINE_FUNCTION
 int OpenMP::thread_pool_rank()
 {
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-  return Impl::OpenMPexec::m_map_rank[ omp_get_thread_num() ];
+  return Impl::OpenMPExec::m_map_rank[ omp_get_thread_num() ];
 #else
  return -1 ;
 #endif
@ -342,4 +349,6 @@ int OpenMP::thread_pool_rank()

 } // namespace Kokkos

+#endif
 #endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
+
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -44,9 +44,12 @@
 #ifndef KOKKOS_OPENMP_PARALLEL_HPP
 #define KOKKOS_OPENMP_PARALLEL_HPP

+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_OPENMP )
+
 #include <omp.h>
 #include <iostream>
-#include <OpenMP/Kokkos_OpenMPexec.hpp>
+#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>

 //----------------------------------------------------------------------------
@ -58,7 +61,7 @@ namespace Impl {
 template< class FunctorType , class ... Traits >
 class ParallelFor< FunctorType
                 , Kokkos::RangePolicy< Traits ... >
-                 , Kokkos::OpenMP 
+                 , Kokkos::OpenMP
                 >
 {
 private:
@ -77,7 +80,7 @@ private:
  exec_range( const FunctorType & functor
            , const Member ibeg , const Member iend )
    {
-      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
      #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
      #pragma ivdep
      #endif
@ -94,7 +97,7 @@ private:
            , const Member ibeg , const Member iend )
    {
      const TagType t{} ;
-      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
      #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
      #pragma ivdep
      #endif
@ -111,12 +114,12 @@ public:
      enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
                                      , Kokkos::Dynamic >::value };

-      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
-      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
+      OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
+      OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");

 #pragma omp parallel
      {
-        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
+        HostThreadTeamData & data = *OpenMPExec::get_thread_data();

        data.set_work_partition( m_policy.end() - m_policy.begin()
                               , m_policy.chunk_size() );
@ -200,11 +203,6 @@ private:
            , const Member ibeg , const Member iend
            , reference_type update )
    {
-      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
-      #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-      #pragma ivdep
-      #endif
-      #endif
      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
        functor( iwork , update );
      }
@ -218,11 +216,6 @@ private:
            , reference_type update )
    {
      const TagType t{} ;
-      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
-      #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-      #pragma ivdep
-      #endif
-      #endif
      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
        functor( t , iwork , update );
      }
@ -235,13 +228,13 @@ public:
      enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
                                      , Kokkos::Dynamic >::value };

-      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
-      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
+      OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+      OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");

      const size_t pool_reduce_bytes =
        Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));

-      OpenMPexec::resize_thread_data( pool_reduce_bytes
+      OpenMPExec::resize_thread_data( pool_reduce_bytes
                                    , 0 // team_reduce_bytes
                                    , 0 // team_shared_bytes
                                    , 0 // thread_local_bytes
@ -249,7 +242,7 @@ public:

 #pragma omp parallel
      {
-        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
+        HostThreadTeamData & data = *OpenMPExec::get_thread_data();

        data.set_work_partition( m_policy.end() - m_policy.begin()
                               , m_policy.chunk_size() );
@ -282,12 +275,12 @@ public:

      // Reduction:

-      const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
+      const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );

-      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
+      for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
                       , ptr
-                       , OpenMPexec::get_thread_data(i)->pool_reduce_local() );
+                       , OpenMPExec::get_thread_data(i)->pool_reduce_local() );
      }

      Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
@ -305,7 +298,7 @@ public:
  inline
  ParallelReduce( const FunctorType & arg_functor
                , Policy       arg_policy
-                , const ViewType    & arg_result_view
+                , const ViewType    & arg_view
                , typename std::enable_if<
                           Kokkos::is_view< ViewType >::value &&
                           !Kokkos::is_reducer_type<ReducerType>::value
@ -313,7 +306,7 @@ public:
    : m_functor( arg_functor )
    , m_policy(  arg_policy )
    , m_reducer( InvalidType() )
-    , m_result_ptr(  arg_result_view.data() )
+    , m_result_ptr(  arg_view.data() )
    {
      /*static_assert( std::is_same< typename ViewType::memory_space
                                      , Kokkos::HostSpace >::value
@ -327,7 +320,7 @@ public:
    : m_functor( arg_functor )
    , m_policy(  arg_policy )
    , m_reducer( reducer )
-    , m_result_ptr(  reducer.result_view().data() )
+    , m_result_ptr(  reducer.view().data() )
    {
      /*static_assert( std::is_same< typename ViewType::memory_space
                                      , Kokkos::HostSpace >::value
@ -378,11 +371,6 @@ private:
            , const Member ibeg , const Member iend
            , reference_type update , const bool final )
    {
-      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
-      #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-      #pragma ivdep
-      #endif
-      #endif
      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
        functor( iwork , update , final );
      }
@ -396,11 +384,6 @@ private:
            , reference_type update , const bool final )
    {
      const TagType t{} ;
-      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
-      #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-      #pragma ivdep
-      #endif
-      #endif
      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
        functor( t , iwork , update , final );
      }
@ -411,13 +394,13 @@ public:
  inline
  void execute() const
    {
-      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
-      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
+      OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_scan");
+      OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_scan");

      const int    value_count       = Analysis::value_count( m_functor );
      const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );

-      OpenMPexec::resize_thread_data( pool_reduce_bytes
+      OpenMPExec::resize_thread_data( pool_reduce_bytes
                                    , 0 // team_reduce_bytes
                                    , 0 // team_shared_bytes
                                    , 0 // thread_local_bytes
@ -425,7 +408,7 @@ public:

 #pragma omp parallel
      {
-        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
+        HostThreadTeamData & data = *OpenMPExec::get_thread_data();

        const WorkRange range( m_policy, data.pool_rank(), data.pool_size() );

@ -565,22 +548,22 @@ public:
    {
      enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };

-      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
-      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
+      OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
+      OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");

      const size_t pool_reduce_size = 0 ; // Never shrinks
      const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
      const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
      const size_t thread_local_size = 0 ; // Never shrinks

-      OpenMPexec::resize_thread_data( pool_reduce_size
+      OpenMPExec::resize_thread_data( pool_reduce_size
                                    , team_reduce_size
                                    , team_shared_size
                                    , thread_local_size );

 #pragma omp parallel
      {
-        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
+        HostThreadTeamData & data = *OpenMPExec::get_thread_data();

        const int active = data.organize_team( m_policy.team_size() );

@ -723,8 +706,8 @@ public:
    {
      enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };

-      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
-      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+      OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+      OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");

      const size_t pool_reduce_size =
        Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
@ -733,14 +716,14 @@ public:
      const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
      const size_t thread_local_size = 0 ; // Never shrinks

-      OpenMPexec::resize_thread_data( pool_reduce_size
+      OpenMPExec::resize_thread_data( pool_reduce_size
                                    , team_reduce_size
                                    , team_shared_size
                                    , thread_local_size );

 #pragma omp parallel
      {
-        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
+        HostThreadTeamData & data = *OpenMPExec::get_thread_data();

        const int active = data.organize_team( m_policy.team_size() );

@ -785,12 +768,12 @@ public:

      // Reduction:

-      const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
+      const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );

-      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
+      for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
                       , ptr
-                       , OpenMPexec::get_thread_data(i)->pool_reduce_local() );
+                       , OpenMPExec::get_thread_data(i)->pool_reduce_local() );
      }

      Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
@ -830,7 +813,7 @@ public:
  : m_functor( arg_functor )
  , m_policy(  arg_policy )
  , m_reducer( reducer )
-  , m_result_ptr(  reducer.result_view().data() )
+  , m_result_ptr(  reducer.view().data() )
  , m_shmem_size( arg_policy.scratch_size(0) +
                  arg_policy.scratch_size(1) +
                  FunctorTeamShmemSize< FunctorType >
@ -849,5 +832,6 @@ public:
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------

+#endif
 #endif /* KOKKOS_OPENMP_PARALLEL_HPP */

--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,15 +36,16 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */

-#include <Kokkos_Core.hpp>
-
+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )

+#include <Kokkos_Core.hpp>
+
 #include <impl/Kokkos_TaskQueue_impl.hpp>
 #include <impl/Kokkos_HostThreadTeam.hpp>

@ -110,21 +111,27 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
  static task_root_type * const end =
    (task_root_type *) task_root_type::EndTag ;

+
  HostThreadTeamData & team_data_single =
    HostThreadTeamDataSingleton::singleton();

-  const int team_size = Impl::OpenMPexec::pool_size(2); // Threads per core
-  // const int team_size = Impl::OpenMPexec::pool_size(1); // Threads per NUMA
+  const int team_size = Impl::OpenMPExec::pool_size(2); // Threads per core
+  // const int team_size = Impl::OpenMPExec::pool_size(1); // Threads per NUMA

 #if 0
 fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size );
 fflush(stdout);
 #endif

+  OpenMPExec::resize_thread_data( 0 /* global reduce buffer */
+                                , 512 * team_size /* team reduce buffer */
+                                , 0 /* team shared buffer */
+                                , 0 /* thread local buffer */
+                                );

 #pragma omp parallel
  {
-    Impl::HostThreadTeamData & self = *Impl::OpenMPexec::get_thread_data();
+    Impl::HostThreadTeamData & self = *Impl::OpenMPExec::get_thread_data();

    // Organizing threads into a team performs a barrier across the
    // entire pool to insure proper initialization of the team
@ -164,7 +171,7 @@ fflush(stdout);
            if ( 0 != task && end != task ) {
              // team member #0 completes the previously executed task,
              // completion may delete the task
-              queue->complete( task ); 
+              queue->complete( task );
            }

            // If 0 == m_ready_count then set task = 0
@ -301,7 +308,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::

      (*task->m_apply)( task , & single_exec );

-      queue->complete( task ); 
+      queue->complete( task );

    } while(1);
  }
@ -310,7 +317,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
 }} /* namespace Kokkos::Impl */

 //----------------------------------------------------------------------------
-
+#else
+void KOKKOS_CORE_SRC_OPENMP_KOKKOS_OPENMP_TASK_PREVENT_LINK_ERROR() {}
 #endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG ) */

-
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -44,6 +44,7 @@
 #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
 #define KOKKOS_IMPL_OPENMP_TASK_HPP

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ENABLE_TASKDAG )

 //----------------------------------------------------------------------------
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
@ -0,0 +1,306 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <algorithm>
+#include <omp.h>
+#include <Kokkos_Macros.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include <iostream>
+#include <sstream>
+#include <cstring>
+
+#include <Kokkos_OpenMPTargetSpace.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Kokkos_Atomic.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+
+namespace Kokkos {
+namespace Experimental {
+/* Default constructor: empty body, no member initializers. */
+OpenMPTargetSpace::OpenMPTargetSpace()
+{}
+
+void * OpenMPTargetSpace::allocate( const size_t arg_alloc_size ) const
+{
+  // Compile-time check that void* and uintptr_t have the same size.
+  static_assert( sizeof(void*) == sizeof(uintptr_t)
+               , "Error sizeof(void*) != sizeof(uintptr_t)" );
+
+  // Ask the OpenMP runtime for arg_alloc_size bytes on the current default
+  // device and return its pointer unchanged (no null check is done here).
+  const int device = omp_get_default_device();
+  return omp_target_alloc( arg_alloc_size , device );
+}
+
+
+void OpenMPTargetSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
+{
+  // A null pointer is ignored.  arg_alloc_size is not consulted: the
+  // block is released by pointer on the current default device.
+  if ( ! arg_alloc_ptr ) { return ; }
+
+  omp_target_free( arg_alloc_ptr , omp_get_default_device() );
+}
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+// Static root record of this space; its address is passed to the base class by the constructor below.
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::s_root_record ;
+
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+~SharedAllocationRecord()
+{
+  // Hand the pointer and byte count stored in the base record back to the space.
+  typedef SharedAllocationRecord< void , void > base_type ;
+  m_space.deallocate( base_type::m_alloc_ptr , base_type::m_alloc_size );
+}
+
+// TODO: Implement deep copy back (see CudaSpace); until then every record reports this one fixed placeholder label.
+std::string
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_label() const {
+  return std::string("OpenMPTargetAllocation");
+}
+
+// Create (with new) a record for arg_alloc_size bytes in arg_space; deallocate() deletes it.
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > *
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+allocate( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
+        , const std::string & arg_label
+        , const size_t arg_alloc_size )
+{ return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size ); }
+
+void SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  // Cast back to this specialization so delete destroys the type that allocate() created.
+  delete static_cast< SharedAllocationRecord * >( arg_rec );
+}
+
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+SharedAllocationRecord( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
+                      , const std::string       & arg_label
+                      , const size_t              arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_space( arg_space )
+{
+  // The header is staged in a local object only; nothing below writes it
+  // into the block allocated above until the DeepCopy TODO is implemented.
+  SharedAllocationHeader header;
+  header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
+
+  strncpy( header.m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+  // strncpy writes no terminator when the label fills the buffer; force one.
+  header.m_label[ SharedAllocationHeader::maximum_label_length - 1 ] = (char) 0 ;
+  // TODO DeepCopy: copy `header` into the allocated block.
+}
+
+//----------------------------------------------------------------------------
+
+void * SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+allocate_tracked( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  // A zero-byte request creates no record and yields a null pointer.
+  if ( 0 == arg_alloc_size ) { return (void *) 0 ; }
+
+  SharedAllocationRecord * const rec =
+    allocate( arg_space , arg_alloc_label , arg_alloc_size );
+  // Paired with the decrement performed by deallocate_tracked().
+  RecordBase::increment( rec );
+  return rec->data();
+}
+
+// Release one reference on the record that owns 'arg_alloc_ptr'.
+// A null pointer is accepted and ignored.
+void SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  if ( arg_alloc_ptr == 0 ) return ;
+
+  SharedAllocationRecord * const rec = get_record( arg_alloc_ptr );
+
+  RecordBase::decrement( rec );
+}
+
+// Replace a tracked allocation by a new one of 'arg_alloc_size' bytes that
+// reuses the old record's space and label, and return the new data pointer.
+//
+// NOTE: the copy of the old contents is currently commented out below, so
+// the returned memory does NOT contain the data of the old allocation.
+void * SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  // get_record() throws for a null or unrecognized pointer.
+  SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
+  SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
+
+  // Disabled: data transfer from the old to the new allocation.
+  //Kokkos::Impl::DeepCopy<OpenMPTargetSpace,OpenMPTargetSpace>( r_new->data() , r_old->data()
+  //                                           , std::min( r_old->size() , r_new->size() ) );
+
+  // Take a reference on the new record before dropping the one on the old record.
+  RecordBase::increment( r_new );
+  RecordBase::decrement( r_old );
+
+  return r_new->data();
+}
+
+// Map a user-data pointer back to the record that tracks it.
+//
+// The header located via SharedAllocationHeader::get_header() is expected to
+// point at its owning record, and that record's m_alloc_ptr is expected to
+// point back at the header. Throws a runtime exception if 'alloc_ptr' is
+// null, if the header holds no record, or if the two do not reference each
+// other.
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > *
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_record( void * alloc_ptr )
+{
+  typedef SharedAllocationHeader  Header ;
+  typedef SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >  RecordHost ;
+
+  SharedAllocationHeader const * const head   = alloc_ptr ? Header::get_header( alloc_ptr ) : (SharedAllocationHeader *)0 ;
+  RecordHost                   * const record = head ? static_cast< RecordHost * >( head->m_record ) : (RecordHost *) 0 ;
+
+  // 'record' can be null even for a non-null 'alloc_ptr' (header without a
+  // record), so test it before dereferencing.
+  if ( ! alloc_ptr || ! record || record->m_alloc_ptr != head ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_record ERROR" ) );
+  }
+
+  return record ;
+}
+
+// Print the records reachable from s_root_record to 's' (e.g. to report
+// orphaned memory). The 'space' argument is not consulted.
+void SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+print_records( std::ostream & s , const Kokkos::Experimental::OpenMPTargetSpace & space , bool detail )
+{
+  SharedAllocationRecord< void , void >::print_host_accessible_records
+    ( s
+    , "OpenMPTargetSpace"
+    , & s_root_record
+    , detail
+    );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+// Primary template: declared only, specialized per memory space.
+template< class >
+struct ViewOperatorBoundsErrorAbort ;
+
+// Specialization for OpenMPTargetSpace. apply() receives the view rank,
+// the eight extents (n0..n7) and the eight indices (i0..i7) of the
+// offending access; it is defined out of line below.
+template<>
+struct ViewOperatorBoundsErrorAbort< Kokkos::Experimental::OpenMPTargetSpace > {
+ static void apply( const size_t rank
+                  , const size_t n0 , const size_t n1
+                  , const size_t n2 , const size_t n3
+                  , const size_t n4 , const size_t n5
+                  , const size_t n6 , const size_t n7
+                  , const size_t i0 , const size_t i1
+                  , const size_t i2 , const size_t i3
+                  , const size_t i4 , const size_t i5
+                  , const size_t i6 , const size_t i7 );
+};
+
+// Report an out-of-bounds view access: prints the rank, the eight extents
+// and the eight indices to stdout. The exception that would abort the
+// access is currently commented out, so execution continues after the
+// message.
+void ViewOperatorBoundsErrorAbort< Kokkos::Experimental::OpenMPTargetSpace >::
+apply( const size_t rank
+     , const size_t n0 , const size_t n1
+     , const size_t n2 , const size_t n3
+     , const size_t n4 , const size_t n5
+     , const size_t n6 , const size_t n7
+     , const size_t i0 , const size_t i1
+     , const size_t i2 , const size_t i3
+     , const size_t i4 , const size_t i5
+     , const size_t i6 , const size_t i7 )
+{
+  // %zu is the conversion for size_t; %lu is only correct where size_t
+  // happens to be unsigned long.
+  printf( "View operator bounds error : rank(%zu) dim(%zu,%zu,%zu,%zu,%zu,%zu,%zu,%zu) index(%zu,%zu,%zu,%zu,%zu,%zu,%zu,%zu)"
+          , rank , n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7
+                 , i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
+  //Kokkos::Impl::throw_runtime_exception( buffer );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/*
+namespace Kokkos {
+namespace {
+  const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF;
+  const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
+  static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1];
+}
+
+namespace Impl {
+void init_lock_array_host_space() {
+  static int is_initialized = 0;
+  if(! is_initialized)
+    for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++)
+      HOST_SPACE_ATOMIC_LOCKS[i] = 0;
+}
+
+bool lock_address_host_space(void* ptr) {
+  return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[
+      (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
+                                  0 , 1);
+}
+
+void unlock_address_host_space(void* ptr) {
+   atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[
+      (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
+                    0);
+}
+
+}
+}*/
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
@ -0,0 +1,273 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+#include <limits>
+#include <iostream>
+#include <vector>
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <iostream>
+#include <impl/Kokkos_CPUDiscovery.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+// Forward declaration so the function can be registered by the initializer below.
+KOKKOS_INLINE_FUNCTION
+int kokkos_omp_in_parallel();
+
+// The comma expression runs HostSpace::register_in_parallel() with
+// kokkos_omp_in_parallel during initialization of this variable and then
+// initializes the flag itself to 0.
+int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
+
+// Returns 1 when called inside an OpenMP parallel region while the
+// kokkos_omp_in_critical_region flag is clear, otherwise 0.
+// When __CUDA_ARCH__ is defined the answer is always 0.
+KOKKOS_INLINE_FUNCTION
+int kokkos_omp_in_parallel()
+{
+#ifdef __CUDA_ARCH__
+  return 0;
+#else
+  const bool inside_parallel   = ( omp_in_parallel() != 0 );
+  const bool critical_flag_set = ( kokkos_omp_in_critical_region != 0 );
+  return ( inside_parallel && ! critical_flag_set ) ? 1 : 0 ;
+#endif
+}
+
+// Read by OpenMPTarget::finalize() to decide whether to unbind the calling
+// thread through hwloc. NOTE(review): no code visible in this file sets it
+// to true.
+bool s_using_hwloc = false;
+
+} // namespace
+} // namespace Impl
+} // namespace Kokkos
+
+
+namespace Kokkos {
+namespace Experimental {
+// Set to true by OpenMPTarget::initialize() and back to false by finalize().
+bool OpenMPTarget::m_is_initialized = false;
+}
+}
+
+namespace Kokkos {
+namespace Impl {
+
+
+//int OpenMPTargetExec::m_map_rank[ OpenMPTargetExec::MAX_THREAD_COUNT ] = { 0 };
+
+//int OpenMPTargetExec::m_pool_topo[ 4 ] = { 0 };
+
+//OpenMPTargetExec * OpenMPTargetExec::m_pool[ OpenMPTargetExec::MAX_THREAD_COUNT ] = { 0 };
+
+// Throw a runtime exception, prefixed with 'label', when called from inside
+// an OpenMP parallel region; otherwise do nothing.
+void OpenMPTargetExec::verify_is_process( const char * const label )
+{
+  if ( ! omp_in_parallel() ) return ;
+
+  std::string msg( label );
+  msg += " ERROR: in parallel" ;
+  Kokkos::Impl::throw_runtime_exception( msg );
+}
+
+// Throw a runtime exception, prefixed with 'label', when the OpenMPTarget
+// backend is not initialized or when omp_get_max_threads() no longer matches
+// OpenMPTarget::thread_pool_size(0).
+void OpenMPTargetExec::verify_initialized( const char * const label )
+{
+  typedef Kokkos::Experimental::OpenMPTarget  backend_type ;
+
+  const bool initialized = ( 0 != backend_type::is_initialized() );
+
+  if ( ! initialized ) {
+    std::string msg( label );
+    msg += " ERROR: not initialized" ;
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+
+  const bool thread_count_intact =
+    ( omp_get_max_threads() == backend_type::thread_pool_size(0) );
+
+  if ( ! thread_count_intact ) {
+    std::string msg( label );
+    msg += " ERROR: Initialized but threads modified inappropriately" ;
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+// Shared scratch buffer and its size in bytes; managed by resize_scratch()
+// and clear_scratch(). Starts out empty.
+void*    OpenMPTargetExec::m_scratch_ptr  = NULL;
+int64_t OpenMPTargetExec::m_scratch_size = 0;
+
+// Release the scratch buffer through OpenMPTargetSpace and reset the
+// bookkeeping to the empty state (null pointer, size 0).
+void OpenMPTargetExec::clear_scratch()
+{
+  Kokkos::Experimental::OpenMPTargetSpace space;
+  space.deallocate(m_scratch_ptr,m_scratch_size);
+  m_scratch_ptr = NULL;
+  // m_scratch_size is an integer byte count, so reset it with 0 rather than
+  // the null-pointer constant.
+  m_scratch_size = 0;
+}
+
+// Current scratch buffer pointer (null while no buffer is allocated).
+void* OpenMPTargetExec::get_scratch_ptr()
+{
+  return m_scratch_ptr;
+}
+
+// Grow the scratch buffer so that it can hold, for the maximum number of
+// active teams/threads, the requested per-team and per-thread byte counts.
+// The buffer is only ever enlarged; the previous contents are not preserved.
+//
+//   reduce_bytes        per-team bytes for the inter-team reduction
+//   team_reduce_bytes   per-team bytes for the intra-team reduction
+//   team_shared_bytes   per-team bytes of team-local scratch
+//   thread_local_bytes  per-thread bytes of thread-private scratch
+void OpenMPTargetExec::resize_scratch( int64_t reduce_bytes ,
+                                       int64_t team_reduce_bytes,
+                                       int64_t team_shared_bytes, int64_t thread_local_bytes)
+{
+  Kokkos::Experimental::OpenMPTargetSpace space;
+
+  // Keep the arithmetic in the same signed type as m_scratch_size so the
+  // comparison below does not mix signed and unsigned operands.
+  const int64_t total_size = MAX_ACTIVE_TEAMS * reduce_bytes +            // Inter Team Reduction
+                             MAX_ACTIVE_TEAMS * team_reduce_bytes  +      // Intra Team Reduction
+                             MAX_ACTIVE_TEAMS * team_shared_bytes +       // Team Local Scratch
+                             MAX_ACTIVE_THREADS * thread_local_bytes;     // Thread Private Scratch
+
+  if( total_size > m_scratch_size ) {
+    // Nothing to free on the first resize.
+    if ( m_scratch_ptr != NULL ) {
+      space.deallocate(m_scratch_ptr,m_scratch_size);
+    }
+
+    // Leave the bookkeeping in the empty state while allocating so that a
+    // failing allocation cannot leave a stale pointer or size behind.
+    m_scratch_ptr  = NULL;
+    m_scratch_size = 0;
+
+    m_scratch_ptr  = space.allocate(total_size);
+    m_scratch_size = total_size;
+  }
+
+}
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+//----------------------------------------------------------------------------
+
+// Non-zero once initialize() has run and until finalize() is called.
+int OpenMPTarget::is_initialized()
+{
+  return m_is_initialized;
+}
+
+// Initialize the OpenMPTarget backend.
+// None of the three arguments is consulted by this implementation.
+void OpenMPTarget::initialize( unsigned thread_count ,
+                         unsigned use_numa_count ,
+                         unsigned use_cores_per_numa )
+{
+  // Set up the host lock array used for arbitrarily sized atomics.
+  Kokkos::Impl::init_lock_array_host_space();
+
+#if defined( KOKKOS_ENABLE_PROFILING )
+  Kokkos::Profiling::initialize();
+#endif
+
+  m_is_initialized = true;
+}
+
+//----------------------------------------------------------------------------
+
+// Shut the backend down. Must be called while initialized and from outside
+// any OpenMP parallel region; both conditions are verified first and a
+// violation raises a runtime exception.
+void OpenMPTarget::finalize()
+{
+  typedef Kokkos::Impl::OpenMPTargetExec  exec_type ;
+
+  exec_type::verify_initialized( "OpenMPTarget::finalize" );
+  exec_type::verify_is_process( "OpenMPTarget::finalize" );
+
+  m_is_initialized = false;
+
+  omp_set_num_threads(1);
+
+  // Undo thread binding only when hwloc was in use and binding is possible.
+  if ( Kokkos::Impl::s_using_hwloc ) {
+    if ( Kokkos::hwloc::can_bind_threads() ) {
+      hwloc::unbind_this_thread();
+    }
+  }
+
+#if defined( KOKKOS_ENABLE_PROFILING )
+  Kokkos::Profiling::finalize();
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+// Print the backend configuration to 's'.
+//
+// Currently this only verifies that the caller is outside an OpenMP parallel
+// region (throwing otherwise); nothing is written to 's' and 'detail' is
+// ignored because the reporting code below is commented out. The disabled
+// code refers to OpenMPTargetExec members (m_pool, m_pool_topo, m_map_rank)
+// whose definitions are also commented out in this file.
+void OpenMPTarget::print_configuration( std::ostream & s , const bool detail )
+{
+  Kokkos::Impl::OpenMPTargetExec::verify_is_process( "OpenMPTarget::print_configuration" );
+/*
+  s << "Kokkos::Experimental::OpenMPTarget" ;
+
+#if defined( KOKKOS_ENABLE_OPENMPTARGET )
+  s << " KOKKOS_ENABLE_OPENMPTARGET" ;
+#endif
+#if defined( KOKKOS_HAVE_HWLOC )
+
+  const unsigned numa_count_       = Kokkos::hwloc::get_available_numa_count();
+  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+  s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]"
+    << " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
+    ;
+#endif
+
+  const bool is_initialized = 0 != Impl::OpenMPTargetExec::m_pool[0] ;
+
+  if ( is_initialized ) {
+    const int numa_count      = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[0] / Kokkos::Impl::OpenMPTargetExec::m_pool_topo[1] ;
+    const int core_per_numa   = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[1] / Kokkos::Impl::OpenMPTargetExec::m_pool_topo[2] ;
+    const int thread_per_core = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[2] ;
+
+    s << " thread_pool_topology[ " << numa_count
+      << " x " << core_per_numa
+      << " x " << thread_per_core
+      << " ]"
+      << std::endl ;
+
+    if ( detail ) {
+      std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPTargetExec::m_pool_topo[0] );
+
+#pragma omp parallel
+      {
+#pragma omp critical
+        {
+          coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
+        }
+// END #pragma omp critical 
+      }
+// END #pragma omp parallel 
+
+      for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
+        s << "  thread omp_rank[" << i << "]"
+          << " kokkos_rank[" << Impl::OpenMPTargetExec::m_map_rank[ i ] << "]"
+          << " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
+          << std::endl ;
+      }
+    }
+  }
+  else {
+    s << " not initialized" << std::endl ;
+  }
+*/
+}
+
+// Concurrency of the backend, reported as thread_pool_size(0).
+int OpenMPTarget::concurrency()
+{
+  const int pool_size = thread_pool_size(0);
+  return pool_size;
+}
+
+// Name of this execution space.
+const char* OpenMPTarget::name()
+{
+  return "OpenMPTarget";
+}
+} // namespace Experimental
+} // namespace Kokkos
+
+#endif //KOKKOS_ENABLE_OPENMPTARGET
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
@ -0,0 +1,727 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPTARGETEXEC_HPP
+#define KOKKOS_OPENMPTARGETEXEC_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_spinwait.hpp>
+
+#include <Kokkos_Atomic.hpp>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  Data for OpenMPTarget thread execution */
+
+
+// Static-only helper holding the backend's state checks and the shared
+// scratch buffer used by OpenMPTarget.
+class OpenMPTargetExec {
+public: 
+  // Upper bounds used by resize_scratch() to size the scratch buffer:
+  // per-thread requests are multiplied by MAX_ACTIVE_THREADS and per-team
+  // requests by MAX_ACTIVE_TEAMS.
+  enum { MAX_ACTIVE_THREADS = 256*8*56*4 };
+  enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS/32 };
+  
+private:
+  // NOTE(review): declared but not referenced by the member functions of
+  // this class; the buffer in use is m_scratch_ptr below.
+  static void* scratch_ptr;
+
+public:
+  // Throw a runtime exception (message prefixed by the given label) when
+  // called inside a parallel region / when the backend is not initialized.
+  static void verify_is_process( const char * const );
+  static void verify_initialized( const char * const );
+
+  // Scratch buffer management: query the pointer, free the buffer, or grow
+  // it to fit the given per-team / per-thread byte counts.
+  static void* get_scratch_ptr();  
+  static void clear_scratch();
+  static void resize_scratch( int64_t reduce_bytes , int64_t team_reduce_bytes, int64_t team_shared_bytes, int64_t thread_local_bytes );
+
+  // Current scratch buffer and its size in bytes.
+  static void* m_scratch_ptr;
+  static int64_t m_scratch_size;
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+class OpenMPTargetExecTeamMember {
+public:
+
+  // Bytes of global scratch reserved per team for team_reduce().
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  /** \brief  Thread states for team synchronization */
+  enum { Active = 0 , Rendezvous = 1 };
+
+  typedef Kokkos::Experimental::OpenMPTarget                         execution_space ;
+  typedef execution_space::scratch_memory_space  scratch_memory_space ;
+
+  // Team scratch handle returned by team_shmem()/team_scratch()/thread_scratch().
+  scratch_memory_space  m_team_shared ;
+  // Scratch sizes passed to the constructor as shmem_size_L1 / shmem_size_L2.
+  int                   m_team_scratch_size[2] ;
+  // Set by the constructor to omp_get_thread_num() / m_vector_length.
+  int                   m_team_rank ;
+  int                   m_team_size ;
+  int                   m_league_rank ;
+  int                   m_league_size ;
+  int                   m_vector_length ;
+  // Set by the constructor to omp_get_thread_num() % m_vector_length.
+  int                   m_vector_lane ;
+  // Base of the global scratch buffer; team_reduce() indexes it by team number.
+  void* 		m_glb_scratch ;
+
+  /*
+  // Fan-in team threads, root of the fan-in which does not block returns true
+  inline
+  bool team_fan_in() const
+    {
+      memory_fence();
+      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
+
+        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
+      }
+
+      if ( m_team_rank_rev ) {
+        m_exec.state_set( Rendezvous );
+        memory_fence();
+        m_exec.state_wait( Rendezvous );
+      }
+
+      return 0 == m_team_rank_rev ;
+    }
+
+  inline
+  void team_fan_out() const
+    {
+      memory_fence();
+      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
+        m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
+        memory_fence();
+      }
+    }
+  */
+public:
+
+  // Team-shared scratch handle, configured with thread mode (0,1,0).
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& team_shmem() const
+    {
+      return m_team_shared.set_team_thread_mode(0,1,0) ;
+    }
+
+  // Same handle as team_shmem(); the level argument is ignored.
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& team_scratch(int) const
+    {
+      return m_team_shared.set_team_thread_mode(0,1,0) ;
+    }
+
+  // Scratch handle configured with this thread's team size and team rank;
+  // the level argument is ignored.
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& thread_scratch(int) const
+    {
+      return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ;
+    }
+
+  // Rank and size queries for the league and the team, as stored in this member.
+  KOKKOS_INLINE_FUNCTION
+  int league_rank() const
+    { return m_league_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int league_size() const
+    { return m_league_size ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int team_rank() const
+    { return m_team_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int team_size() const
+    { return m_team_size ; }
+
+  // Team barrier, implemented with an OpenMP barrier directive.
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+    {
+      #pragma omp barrier  
+    }
+
+  // Broadcast 'value' from team thread 'thread_id' to the whole team.
+  //
+  // NOTE: not implemented for this backend yet. The body is commented out,
+  // so the call returns immediately and 'value' is left unchanged on every
+  // thread.
+  template<class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast(ValueType& value, const int& thread_id) const
+  {
+/*#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { }
+#else
+    // Make sure there is enough scratch space:
+    typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
+                         , ValueType , void >::type type ;
+
+    type * const local_value = ((type*) m_exec.scratch_thread());
+    if(team_rank() == thread_id)
+      *local_value = value;
+    memory_fence();
+    team_barrier();
+    value = *local_value;
+#endif*/
+  }
+
+  // Reduce 'value' across the team and return the result held in slot 0 of
+  // this team's section of the global scratch buffer.
+  //
+  // NOTE(review): the adapter 'op' built from 'op_in' is never applied; all
+  // accumulation below uses operator+= on value_type, so the result is a sum
+  // whatever join operation was supplied.
+  template< class ValueType, class JoinOp >
+  KOKKOS_INLINE_FUNCTION ValueType
+    team_reduce( const ValueType & value
+               , const JoinOp & op_in ) const {
+
+      #pragma omp barrier
+
+      typedef ValueType value_type;
+      const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
+      
+      // Make sure there is enough scratch space:
+      // 'type' is value_type only while sizeof(value_type) < TEAM_REDUCE_SIZE,
+      // otherwise void.
+      typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
+                           , value_type , void >::type type ;
+
+      // Number of value slots that fit into this team's TEAM_REDUCE_SIZE bytes.
+      const int n_values = TEAM_REDUCE_SIZE/sizeof(value_type);
+      // This team's section of the global scratch, selected by omp_get_team_num().
+      type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num()); 
+      // Reset every slot to a value-initialized value_type, strided over the team.
+      for(int i = m_team_rank; i < n_values; i+= m_team_size) {
+        team_scratch[i] = value_type();
+      }
+
+      #pragma omp barrier
+
+      // Accumulate in rounds of n_values threads: in each round only the
+      // threads with rank in [k, k+n_values) add their value into slot
+      // rank % n_values, with a barrier closing the round.
+      for(int k=0; k<m_team_size; k+=n_values) {
+        if((k <= m_team_rank) && (k+n_values > m_team_rank))
+          team_scratch[m_team_rank%n_values]+=value;
+        #pragma omp barrier
+      }
+      
+      // Pairwise combination of the slots with doubling stride d: a thread
+      // whose rank is a multiple of 2*d adds slot rank+d into slot rank.
+      for(int d = 1; d<n_values;d*=2) {
+        if((m_team_rank+d<n_values) && (m_team_rank%(2*d)==0)) {
+          team_scratch[m_team_rank] += team_scratch[m_team_rank+d];
+        }
+        #pragma omp barrier
+      }
+      return team_scratch[0];
+    }
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  NOTE: not implemented for this backend yet. The implementation below
+   *  is commented out; the function ignores both arguments and returns a
+   *  value-initialized ArgType.
+   *
+   *  Intended behavior of the disabled implementation:
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template< typename ArgType >
+  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
+    {
+    /*  // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
+
+      volatile type * const work_value  = ((type*) m_exec.scratch_thread());
+
+      *work_value = value ;
+
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        // m_team_base[0]                 == highest ranking team member
+        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
+        //
+        // 1) copy from lower to higher rank, initialize lowest rank to zero
+        // 2) prefix sum from lowest to highest rank, skipping lowest rank
+
+        type accum = 0 ;
+
+        if ( global_accum ) {
+          for ( int i = m_team_size ; i-- ; ) {
+            type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
+            accum += val ;
+          }
+          accum = atomic_fetch_add( global_accum , accum );
+        }
+
+        for ( int i = m_team_size ; i-- ; ) {
+          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
+          const type offset = accum ;
+          accum += val ;
+          val = offset ;
+        }
+
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return *work_value ;*/
+      // Placeholder result while the scan is not implemented.
+      return ArgType();
+    }
+
+  /** \brief  Intra-team scan without a global accumulator.
+   *
+   *  Forwards to the two-argument team_scan with a null accumulator
+   *  pointer and returns whatever that overload returns.
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
+    {
+      return this-> template team_scan<Type>( value , 0 );
+    }
+
+  //----------------------------------------
+  // Private for the driver
+
+private:
+
+  typedef execution_space::scratch_memory_space space ;
+
+public:
+
+  // Build the team-member handle for the calling OpenMP thread.
+  //
+  //   league_rank, league_size  rank of this team in the league / number of teams
+  //   team_size                 number of threads per team
+  //   vector_length             number of OpenMP threads forming one team thread
+  //   glb_scratch               base of the global scratch buffer
+  //   shmem_size_L1/L2          team scratch sizes stored in m_team_scratch_size
+  //
+  // The team rank and vector lane are derived from omp_get_thread_num().
+  // The initializers are listed in member declaration order, which is the
+  // order in which they actually run.
+  inline
+  OpenMPTargetExecTeamMember( const int league_rank, const int league_size, const int team_size, const int vector_length //const TeamPolicyInternal< OpenMPTarget, Properties ...> & team
+                      , void* const glb_scratch
+                      , const int shmem_size_L1
+                      , const int shmem_size_L2
+                      )
+    : m_team_shared(0,0)
+    , m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
+    , m_team_rank(0)
+    , m_team_size( team_size )
+    , m_league_rank( league_rank )
+    , m_league_size( league_size )
+    , m_vector_length( vector_length )
+    , m_vector_lane(0)
+    , m_glb_scratch( glb_scratch )
+    {
+      // Consecutive groups of m_vector_length OpenMP threads form one team thread.
+      const int omp_tid = omp_get_thread_num();
+      m_team_rank = omp_tid/m_vector_length;
+      m_vector_lane = omp_tid%m_vector_length;
+    }
+
+  // Bytes of scratch reserved per team for team_reduce().
+  static inline int team_reduce_size()
+    {
+      return TEAM_REDUCE_SIZE ;
+    }
+};
+
+
+
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... >: public PolicyTraits<Properties ...>
+{
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicyInternal      execution_policy ;
+
+  typedef PolicyTraits<Properties ... > traits;
+
+  // Member-wise copy assignment of all policy settings.
+  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
+    m_league_size   = p.m_league_size;
+    m_team_size     = p.m_team_size;
+    m_vector_length = p.m_vector_length;
+    m_team_alloc    = p.m_team_alloc;
+    m_team_iter     = p.m_team_iter;
+    m_chunk_size    = p.m_chunk_size;
+
+    // Both scratch levels.
+    for ( int level = 0 ; level < 2 ; ++level ) {
+      m_team_scratch_size[level]   = p.m_team_scratch_size[level];
+      m_thread_scratch_size[level] = p.m_thread_scratch_size[level];
+    }
+
+    return *this;
+  }
+
+  //----------------------------------------
+
+  // Largest team size offered by this policy; the functor is not inspected.
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+    {
+      return 1024;
+    }
+
+  // Recommended team size; the functor is not inspected.
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType & )
+    {
+      return 256;
+    }
+
+  // Recommended team size for a given vector length: 256 divided by that length.
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType &, const int& vector_length)
+    {
+      return 256/vector_length;
+    }
+
+  //----------------------------------------
+
+private:
+
+  // Requested league size, team size and vector length (set by init()).
+  int m_league_size ;
+  int m_team_size ;
+  int m_vector_length;
+  // Returned by team_alloc() / team_iter(). set_auto_chunk_size() divides by
+  // m_team_alloc, so it has to hold a positive value before that call.
+  int m_team_alloc ;
+  int m_team_iter ;
+
+  // Per-team and per-thread scratch bytes, indexed by scratch level.
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
+
+  // Chunk size chosen by set_auto_chunk_size() or set via set_chunk_size().
+  int m_chunk_size;
+
+  // Store the requested league size, team size and vector length, initialize
+  // the derived members, and pick the chunk size.
+  inline void init( const int league_size_request
+                  , const int team_size_request
+                  , const int vector_length_request )
+    {
+      m_league_size = league_size_request ;
+
+      m_team_size = team_size_request;
+
+      m_vector_length = vector_length_request;
+
+      // m_team_alloc and m_team_iter were previously left uninitialized even
+      // though set_auto_chunk_size() divides by m_team_alloc and both values
+      // are exposed through team_alloc()/team_iter() and copied by operator=.
+      // One team occupies team_size * vector_length OpenMP threads (the team
+      // member derives its team rank as omp thread id / vector length), so
+      // that product, with a floor of 1 to keep the division well defined,
+      // is used as the per-team allocation.
+      // NOTE(review): confirm these are the intended values for this backend.
+      const int threads_per_team = m_team_size * m_vector_length ;
+      m_team_alloc = ( threads_per_team > 0 ) ? threads_per_team : 1 ;
+      m_team_iter  = 1 ;
+
+      set_auto_chunk_size();
+    }
+
+public:
+
+  // Plain accessors for the stored policy parameters.
+  inline int vector_length() const
+    { return m_vector_length ; }
+
+  inline int team_size() const
+    { return m_team_size ; }
+
+  inline int league_size() const
+    { return m_league_size ; }
+
+  // Total scratch bytes for one team at 'level': the per-team amount plus
+  // the per-thread amount for every thread. A negative 'team_size_'
+  // (the default) selects the policy's own team size.
+  inline size_t scratch_size(const int& level, int team_size_ = -1) const {
+    const int thread_count = ( team_size_ < 0 ) ? m_team_size : team_size_ ;
+    return m_team_scratch_size[level] + thread_count*m_thread_scratch_size[level] ;
+  }
+
+  /** \brief  Specify league size, request team size
+   *
+   *  All four constructors zero the scratch sizes and the chunk size and
+   *  then delegate to init(). The execution-space argument, where present,
+   *  is not used.
+   */
+  TeamPolicyInternal( typename traits::execution_space &
+            , int league_size_request
+            , int team_size_request
+            , int vector_length_request = 1 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , team_size_request , vector_length_request); }
+
+  // AUTO team size: 256 / vector_length_request (integer division, so the
+  // vector length must be non-zero).
+  TeamPolicyInternal( typename traits::execution_space &
+            , int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int vector_length_request = 1)
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , 256/vector_length_request , vector_length_request ); }
+
+  // Same as the first constructor, without an execution-space argument.
+  TeamPolicyInternal( int league_size_request
+            , int team_size_request
+            , int vector_length_request = 1 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , team_size_request , vector_length_request); }
+
+  // AUTO team size without an execution-space argument:
+  // 256 / vector_length_request (the vector length must be non-zero).
+  TeamPolicyInternal( int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int vector_length_request = 1 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , 256/vector_length_request , vector_length_request ); }
+
+  // Accessors for the stored per-team allocation, team iteration count and
+  // chunk size.
+  inline int team_alloc() const
+    { return m_team_alloc ; }
+
+  inline int team_iter() const
+    { return m_team_iter ; }
+
+  inline int chunk_size() const
+    { return m_chunk_size ; }
+
+  /** \brief Return a copy of this policy whose chunk size is 'chunk_size_'.
+   *         The policy itself is not modified. */
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal updated( *this );
+    updated.m_chunk_size = chunk_size_;
+    return updated;
+  }
+
+  // Return a copy of this policy with the per-team scratch size of 'level' replaced.
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
+    TeamPolicyInternal updated( *this );
+    updated.m_team_scratch_size[level] = per_team.value;
+    return updated;
+  }
+
+  // Return a copy of this policy with the per-thread scratch size of 'level' replaced.
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal updated( *this );
+    updated.m_thread_scratch_size[level] = per_thread.value;
+    return updated;
+  }
+
+  // Return a copy of this policy with both scratch sizes of 'level' replaced.
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal updated( *this );
+    updated.m_team_scratch_size[level]   = per_team.value;
+    updated.m_thread_scratch_size[level] = per_thread.value;
+    return updated;
+  }
+
+private:
+  /** \brief Choose m_chunk_size from the league size and the number of
+   *         teams that fit into the thread pool (pool size / m_team_alloc).
+   *
+   *  A positive m_chunk_size that is not a power of two aborts. The chunk
+   *  size is then overwritten with a power of two computed below. */
+  inline void set_auto_chunk_size() {
+
+    int teams_in_flight = traits::execution_space::thread_pool_size(0)/m_team_alloc;
+    if( teams_in_flight==0 ) teams_in_flight=1;
+
+    if( (m_chunk_size > 0) && !Impl::is_integral_power_of_two( m_chunk_size ) )
+      Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
+
+    // First attempt: at most about 100 chunks per concurrently running team.
+    int chunk = 1;
+    for( ; chunk*100*teams_in_flight < m_league_size ; chunk *= 2 ) {}
+
+    // Small result: retry with about 40 chunks per team, capped at 128.
+    if(chunk < 128) {
+      for( chunk = 1 ; (chunk*40*teams_in_flight < m_league_size ) && (chunk<128) ; chunk *= 2 ) {}
+    }
+
+    m_chunk_size = chunk;
+  }
+
+public:
+  typedef Impl::OpenMPTargetExecTeamMember member_type ;
+};
+} // namespace Impl
+
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+// Size of the thread pool, reported as omp_get_max_threads().
+// The 'depth' argument is not consulted.
+inline
+int OpenMPTarget::thread_pool_size( int depth )
+{
+  //return Impl::OpenMPTargetExec::pool_size(depth);
+  const int max_threads = omp_get_max_threads();
+  return max_threads;
+}
+
+// Rank of the calling thread, reported as omp_get_thread_num().
+KOKKOS_INLINE_FUNCTION
+int OpenMPTarget::thread_pool_rank()
+{
+  const int omp_rank = omp_get_thread_num();
+  return omp_rank;
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+
+namespace Kokkos {
+
+/** \brief Build the TeamThreadRange boundaries object from a team member handle and a count. */
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
+  TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> boundaries_type ;
+  return boundaries_type(thread,count);
+}
+
+/** \brief Build the TeamThreadRange boundaries object from a team member handle and an
+ *         explicit (begin, end) pair. */
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
+  TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> boundaries_type ;
+  return boundaries_type(thread,begin,end);
+}
+
+/** \brief Build the ThreadVectorRange boundaries object from a team member handle and a count. */
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >
+  ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
+  typedef Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember > boundaries_type ;
+  return boundaries_type(thread,count);
+}
+
+/** \brief Wrap a team member handle in a ThreadSingleStruct, the tag type consumed by
+ *         the ThreadSingleStruct overloads of single(). */
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> PerTeam(const Impl::OpenMPTargetExecTeamMember& thread) {
+  typedef Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> single_type ;
+  return single_type(thread);
+}
+
+/** \brief Wrap a team member handle in a VectorSingleStruct, the tag type consumed by
+ *         the VectorSingleStruct overloads of single(). */
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> PerThread(const Impl::OpenMPTargetExecTeamMember& thread) {
+  typedef Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> single_type ;
+  return single_type(thread);
+}
+} // namespace Kokkos
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each index visited.
+   *
+   * The calling thread walks from loop_boundaries.start up to (excluding) loop_boundaries.end
+   * in steps of loop_boundaries.increment and invokes the lambda once per index.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries, const Lambda& lambda) {
+  iType idx = loop_boundaries.start;
+  while( idx < loop_boundaries.end ) {
+    lambda(idx);
+    idx += loop_boundaries.increment;
+  }
+}
+
+/** \brief  Inter-thread parallel_reduce (summation). Executes lambda(iType i, ValueType & val)
+ *          for each index visited.
+ *
+ * result is reset to ValueType(); every index from loop_boundaries.start up to (excluding)
+ * loop_boundaries.end, in steps of loop_boundaries.increment, contributes a value-initialized
+ * temporary that the lambda fills and that is then added to result with operator+=.
+ *
+ * NOTE(review): the cross-thread team_reduce step is disabled, so result only holds the sum
+ * of the indices visited by the calling thread.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+  result = ValueType();
+
+  iType idx = loop_boundaries.start;
+  while( idx < loop_boundaries.end ) {
+    ValueType contribution = ValueType();
+    lambda(idx,contribution);
+    result+=contribution;
+    idx += loop_boundaries.increment;
+  }
+
+  //result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
+}
+
+/** \brief  Inter-thread parallel_reduce with a user-provided join. Executes
+ *          lambda(iType i, ValueType & val) for each index visited.
+ *
+ * Every index from loop_boundaries.start up to (excluding) loop_boundaries.end, in steps of
+ * loop_boundaries.increment, contributes a value-initialized temporary that the lambda fills
+ * and that is folded into a running value with join(ValueType& val, const ValueType& update).
+ * The running value is seeded from the input value of init_result, which should therefore be
+ * the neutral element with respect to the join operation (e.g. '0 for +-' or '1 for *'), and
+ * is written back to init_result before returning.
+ *
+ * NOTE(review): the cross-thread team_reduce step is disabled, so init_result only reflects
+ * the indices visited by the calling thread.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+
+  // Publish the accumulated value. Without this store the reduction was computed and then
+  // discarded, leaving the caller's init_result untouched.
+  init_result = result;
+
+  //init_result = loop_boundaries.thread.team_reduce(result,join);
+}
+
+} //namespace Kokkos
+
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each index visited.
+ *
+ * The calling thread walks from loop_boundaries.start up to (excluding) loop_boundaries.end
+ * in steps of loop_boundaries.increment and invokes the lambda once per index.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
+    loop_boundaries, const Lambda& lambda) {
+  iType idx = loop_boundaries.start;
+  while( idx < loop_boundaries.end ) {
+    lambda(idx);
+    idx += loop_boundaries.increment;
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce (summation). Executes
+ *          lambda(iType i, ValueType & val) for each index visited.
+ *
+ * result is reset to ValueType(); every index from loop_boundaries.start up to (excluding)
+ * loop_boundaries.end, in steps of loop_boundaries.increment, contributes a value-initialized
+ * temporary that the lambda fills and that is then added to result with operator+=.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+  result = ValueType();
+  iType idx = loop_boundaries.start;
+  while( idx < loop_boundaries.end ) {
+    ValueType contribution = ValueType();
+    lambda(idx,contribution);
+    result+=contribution;
+    idx += loop_boundaries.increment;
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce with a user-provided join. Executes
+ *          lambda(iType i, ValueType & val) for each index visited.
+ *
+ * Every index from loop_boundaries.start up to (excluding) loop_boundaries.end, in steps of
+ * loop_boundaries.increment, contributes a value-initialized temporary that the lambda fills
+ * and that is folded into a running value with join(ValueType& val, const ValueType& update).
+ * The running value starts from the input value of init_result, so that input should be the
+ * neutral element with respect to the join operation (e.g. '0 for +-' or '1 for *'). The
+ * final running value is stored back into init_result.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType accumulated = init_result;
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType idx = loop_boundaries.start; idx < loop_boundaries.end; idx+=loop_boundaries.increment) {
+    ValueType contribution = ValueType();
+    lambda(idx,contribution);
+    join(accumulated,contribution);
+  }
+  init_result = accumulated;
+}
+
+/** \brief  Intra-thread vector parallel scan. Executes lambda(iType i, ValueType & val, bool final)
+ *          for each index visited.
+ *
+ * A single pass is made from loop_boundaries.start up to (excluding) loop_boundaries.end in
+ * steps of loop_boundaries.increment. The lambda is always called with final==true and receives
+ * a running value that starts value-initialized; the lambda is expected to add the contribution
+ * of index i to that value. The value type is taken from the functor through
+ * FunctorValueTraits. The running value is local to this function and is not returned.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+  typedef typename ValueTraits::value_type value_type ;
+
+  value_type running_total = value_type();
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType idx = loop_boundaries.start; idx < loop_boundaries.end; idx+=loop_boundaries.increment) {
+    lambda(idx,running_total,true);
+  }
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+/** \brief  single() for a VectorSingleStruct request (see PerThread).
+ *
+ * The lambda is invoked unconditionally by the caller; single_struct is not consulted. */
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda) {
+  (void) single_struct;
+  lambda();
+}
+
+/** \brief  single() for a ThreadSingleStruct request (see PerTeam).
+ *
+ * The lambda is invoked only by the caller whose team_rank() is 0. */
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda) {
+  const bool is_rank_zero = ( single_struct.team_member.team_rank()==0 );
+  if( is_rank_zero ) {
+    lambda();
+  }
+}
+
+/** \brief  single() with a value for a VectorSingleStruct request (see PerThread).
+ *
+ * The lambda is invoked unconditionally as lambda(val); single_struct is not consulted. */
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  (void) single_struct;
+  lambda(val);
+}
+
+/** \brief  single() with a value for a ThreadSingleStruct request (see PerTeam).
+ *
+ * Only the caller whose team_rank() is 0 runs lambda(val). Every caller then passes val
+ * to team_member.team_broadcast(val,0). */
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  const bool is_rank_zero = ( single_struct.team_member.team_rank()==0 );
+  if( is_rank_zero ) lambda(val);
+  single_struct.team_member.team_broadcast(val,0);
+}
+}
+
+#endif /* #ifndef KOKKOS_OPENMPTARGETEXEC_HPP */
+
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
@ -0,0 +1,767 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLEL_HPP
+#define KOKKOS_OPENMPTARGET_PARALLEL_HPP
+
+#include <omp.h>
+#include <iostream>
+#include <Kokkos_Parallel.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  ParallelFor specialization for a RangePolicy on the OpenMPTarget execution space.
+ *
+ * Holds a copy of the functor and of the policy. execute() runs one functor call per index
+ * in [m_policy.begin(), m_policy.end()) inside an
+ * 'omp target teams distribute parallel for' region, with the functor mapped to the device.
+ */
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Traits ... >
+                 , Kokkos::Experimental::OpenMPTarget
+                 >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ...  > Policy ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  const FunctorType m_functor ;
+  const Policy      m_policy ;
+
+
+public:
+
+  /** \brief  Dispatch to the tagged or untagged implementation based on the policy's work tag. */
+  inline void execute() const {
+    execute_impl<WorkTag>();
+  }
+
+  /** \brief  Untagged variant (work tag is void): calls m_functor(i). */
+  template< class TagType >
+  inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  execute_impl() const
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+      const Member begin = m_policy.begin();
+      const Member end = m_policy.end();
+
+      // The loop index uses the policy's member_type rather than int so that ranges wider
+      // than int are neither narrowed on initialization nor compared against a mismatched type.
+      #pragma omp target teams distribute parallel for map(to:this->m_functor)
+      for(Member i=begin; i<end; i++)
+        m_functor(i);
+    }
+
+
+  /** \brief  Tagged variant: calls m_functor(TagType(), i).
+   *
+   *  NOTE(review): unlike the untagged variant this one requests num_threads(128);
+   *  confirm whether the difference is intentional. */
+  template< class TagType >
+  inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  execute_impl() const
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+      const Member begin = m_policy.begin();
+      const Member end = m_policy.end();
+
+      // Same index-type reasoning as in the untagged variant.
+      #pragma omp target teams distribute parallel for num_threads(128) map(to:this->m_functor)
+      for(Member i=begin; i<end; i++)
+        m_functor(TagType(),i);
+    }
+
+  /** \brief  Store copies of the functor and the policy for a later execute(). */
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , Policy arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Primary template, selected when no dedicated ParallelReduceSpecialize
+ *          specialization matches.
+ *
+ * It performs no reduction: execute() only prints the FunctorHasJoin / UseReducerType
+ * combination that selected it and leaves the result pointer untouched. */
+template<class FunctorType, class PolicyType, class ReducerType, class PointerType, class ValueType, int FunctorHasJoin, int UseReducerType>
+struct ParallelReduceSpecialize {
+  static inline
+  void execute(const FunctorType& f, const PolicyType& p , PointerType result_ptr) {
+    const int has_join    = FunctorHasJoin;
+    const int use_reducer = UseReducerType;
+    printf("Error: Invalid Specialization %i %i\n",has_join,use_reducer);
+  }
+};
+
+/** \brief  Reduction over a RangePolicy for functors without a join and without a reducer
+ *          (FunctorHasJoin == 0, UseReducerType == 0).
+ *
+ * The reduction is expressed with the OpenMP 'reduction(+: result)' clause on a
+ * value-initialized ValueType, so ValueType must be usable with that clause. The final value
+ * is stored through result_ptr.
+ */
+template<class FunctorType, class ReducerType, class PointerType, class ValueType, class ... PolicyArgs>
+struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, ReducerType, PointerType, ValueType, 0,0> {
+  typedef Kokkos::RangePolicy<PolicyArgs...> PolicyType;
+
+  /** \brief  Untagged variant (work tag is void): calls f(i, result). */
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
+    {
+      // The label names the operation actually being run (it previously said parallel_for).
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      const typename PolicyType::member_type begin = p.begin();
+      const typename PolicyType::member_type end = p.end();
+
+      ValueType result = ValueType();
+      // The loop index uses the policy's member_type rather than int so that ranges wider
+      // than int are neither narrowed on initialization nor compared against a mismatched type.
+      #pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(+: result)
+      for(typename PolicyType::member_type i=begin; i<end; i++)
+        f(i,result);
+
+      *result_ptr=result;
+    }
+
+
+  /** \brief  Tagged variant: calls f(TagType(), i, result). */
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      const typename PolicyType::member_type begin = p.begin();
+      const typename PolicyType::member_type end = p.end();
+
+      ValueType result = ValueType();
+      // Same index-type reasoning as in the untagged variant.
+      #pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(+: result)
+      for(typename PolicyType::member_type i=begin; i<end; i++)
+        f(TagType(),i,result);
+
+      *result_ptr=result;
+    }
+
+
+    /** \brief  Dispatch to the tagged or untagged implementation based on the policy's work tag. */
+    inline static
+    void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
+      execute_impl<typename PolicyType::work_tag>(f,p,ptr);
+    }
+};
+/*
+template<class FunctorType, class PolicyType, class ReducerType, class PointerType, class ValueType>
+struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType, PointerType, ValueType, 0,1> {
+
+  #pragma omp declare reduction(custom: ValueType : ReducerType::join(omp_out, omp_in)) initializer ( ReducerType::init(omp_priv) )
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+      const typename PolicyType::member_type begin = p.begin();
+      const typename PolicyType::member_type end = p.end();
+
+      ValueType result = ValueType();
+      #pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(custom: result)
+      for(int i=begin; i<end; i++)
+        f(i,result);
+
+      *result_ptr=result;
+    }
+
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+      const typename PolicyType::member_type begin = p.begin();
+      const typename PolicyType::member_type end = p.end();
+
+      ValueType result = ValueType();
+      #pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(custom: result)
+      for(int i=begin; i<end; i++)
+        f(TagType(),i,result);
+
+      *result_ptr=result;
+    }
+
+
+    inline static
+    void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
+      execute_impl<typename PolicyType::work_tag>(f,p,ptr);
+    }
+};
+*/
+
+/** \brief  ParallelReduce specialization for a RangePolicy on the OpenMPTarget execution space.
+ *
+ * Stores copies of the functor, policy and reducer together with a pointer to the result
+ * storage, and forwards execute() to the ParallelReduceSpecialize variant selected by
+ * HasJoin / UseReducer.
+ */
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Traits ...>
+                    , ReducerType
+                    , Kokkos::Experimental::OpenMPTarget
+                    >
+{
+private:
+
+  using Policy    = Kokkos::RangePolicy< Traits ... > ;
+
+  using WorkTag   = typename Policy::work_tag ;
+  using WorkRange = typename Policy::WorkRange ;
+  using Member    = typename Policy::member_type ;
+
+  // The functor acts as its own reducer when no ReducerType was supplied (InvalidType).
+  using ReducerConditional = Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ;
+  using ReducerTypeFwd     = typename ReducerConditional::type ;
+
+  // Static Assert WorkTag void if ReducerType not InvalidType
+
+  using ValueTraits = Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ;
+  using ValueInit   = Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ;
+  using ValueJoin   = Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ;
+
+  enum {HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
+  enum {UseReducer = is_reducer_type<ReducerType>::value };
+
+  using pointer_type   = typename ValueTraits::pointer_type ;
+  using reference_type = typename ValueTraits::reference_type ;
+
+  using ParForSpecialize = ParallelReduceSpecialize<FunctorType,Policy,ReducerType,pointer_type,typename ValueTraits::value_type,HasJoin,UseReducer> ;
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+
+public:
+  /** \brief  Run the reduction through the selected ParallelReduceSpecialize. */
+  inline void execute() const {
+    ParForSpecialize::execute(m_functor,m_policy,m_result_ptr);
+  }
+
+  /** \brief  Construct from a functor and a result view (enabled only when ViewType is a
+   *          Kokkos view and ReducerType is not a reducer). The result pointer is the
+   *          view's data(). */
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ViewType    & arg_result_view
+                , typename std::enable_if<
+                           Kokkos::is_view< ViewType >::value &&
+                           !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = nullptr)
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr(  arg_result_view.data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
+    }
+
+  /** \brief  Construct from a functor and a reducer. The result pointer is the data() of
+   *          the reducer's result_view(). */
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
+    }
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  ParallelScan specialization for a RangePolicy on the OpenMPTarget execution space.
+ *
+ * Stores copies of the functor and the policy.
+ *
+ * NOTE(review): the implementation is entirely commented out, so execute() has an empty body
+ * and the functor is never invoked. The disabled code is kept below for reference.
+ */
+template< class FunctorType , class ... Traits >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Traits ... >
+                  , Kokkos::Experimental::OpenMPTarget
+                  >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  // Value handling helpers derived from the functor and the policy's work tag.
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType, WorkTag > ValueJoin ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType, WorkTag > ValueOps ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+// Disabled: per-range helpers that call the functor (untagged and tagged variants).
+/*
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update , const bool final )
+    {
+      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( iwork , update , final );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update , const bool final )
+    {
+      const TagType t{} ;
+      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( t , iwork , update , final );
+      }
+    }
+*/
+public:
+
+  /** \brief  Currently a no-op: the whole body is commented out (see class note). */
+  inline
+  void execute() const
+    {
+/*      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_scan");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_scan");
+
+      OpenMPTargetExec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
+
+#pragma omp parallel
+      {
+        OpenMPTargetExec & exec = * OpenMPTargetExec::get_thread_omp();
+        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+        const pointer_type ptr =
+          pointer_type( exec.scratch_reduce() ) +
+          ValueTraits::value_count( m_functor );
+        ParallelScan::template exec_range< WorkTag >
+          ( m_functor , range.begin() , range.end()
+          , ValueInit::init( m_functor , ptr ) , false );
+      }
+
+      {
+        const unsigned thread_count = OpenMPTargetExec::pool_size();
+        const unsigned value_count  = ValueTraits::value_count( m_functor );
+
+        pointer_type ptr_prev = 0 ;
+
+        for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
+
+          pointer_type ptr = pointer_type( OpenMPTargetExec::pool_rev(rank_rev)->scratch_reduce() );
+
+          if ( ptr_prev ) {
+            for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
+            ValueJoin::join( m_functor , ptr + value_count , ptr );
+          }
+          else {
+            ValueInit::init( m_functor , ptr );
+          }
+
+          ptr_prev = ptr ;
+        }
+      }
+
+#pragma omp parallel
+      {
+        OpenMPTargetExec & exec = * OpenMPTargetExec::get_thread_omp();
+        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+        const pointer_type ptr = pointer_type( exec.scratch_reduce() );
+        ParallelScan::template exec_range< WorkTag >
+          ( m_functor , range.begin() , range.end()
+          , ValueOps::reference( ptr ) , true );
+      }
+*/
+    }
+
+  //----------------------------------------
+
+  /** \brief  Store copies of the functor and the policy for a later execute(). */
+  inline
+  ParallelScan( const FunctorType & arg_functor
+              , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+  {}
+
+  //----------------------------------------
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  ParallelFor specialization for a TeamPolicy on the OpenMPTarget execution space.
+ *
+ * Stores copies of the functor and the policy. execute() offloads a flat loop of
+ * league_size*team_size*vector_length iterations; each iteration builds a team member handle
+ * whose league rank is i/(team_size*vector_length) and passes it to the functor.
+ */
+template< class FunctorType , class ... Properties >
+class ParallelFor< FunctorType
+                 , Kokkos::TeamPolicy< Properties ... >
+                 , Kokkos::Experimental::OpenMPTarget
+                 >
+{
+private:
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... > Policy ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+  // Sum of the policy's level 0 and level 1 scratch sizes plus the functor's team shmem size.
+  const int          m_shmem_size ;
+
+public:
+
+  /** \brief  Verify the execution space state, then dispatch on the policy's work tag. */
+  inline void execute() const {
+    OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+    OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+    execute_impl<WorkTag>();
+  }
+
+private:
+  /** \brief  Untagged variant (work tag is void): calls m_functor(team). */
+  template< class TagType >
+  inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  execute_impl() const
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+      const int league_size = m_policy.league_size();
+      const int team_size = m_policy.team_size();
+      const int vector_length = m_policy.vector_length();
+      // NOTE(review): nteams is computed but the pragma below requests num_teams(league_size);
+      // the team reductions in this file use num_teams(nteams). Confirm which is intended.
+      const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
+
+      OpenMPTargetExec::resize_scratch(0,Policy::member_type::TEAM_REDUCE_SIZE,0,0);
+      void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+      #pragma omp target teams distribute parallel for num_teams(league_size) num_threads(team_size*vector_length) schedule(static,1) \
+          map(to:this->m_functor,scratch_ptr)
+      for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
+        typename Policy::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
+        m_functor(team);
+      }
+    }
+
+
+  /** \brief  Tagged variant: calls m_functor(TagType(), team). */
+  template< class TagType >
+  inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  execute_impl() const
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+      const int league_size = m_policy.league_size();
+      const int team_size = m_policy.team_size();
+      const int vector_length = m_policy.vector_length();
+      // NOTE(review): see the untagged variant regarding nteams versus num_teams(league_size).
+      const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
+
+      OpenMPTargetExec::resize_scratch(0,Policy::member_type::TEAM_REDUCE_SIZE,0,0);
+      void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+      // The iteration space must match the untagged variant: the league rank is derived as
+      // i/(team_size*vector_length), so looping only to league_size would visit just the
+      // first league_size/(team_size*vector_length) league ranks.
+      #pragma omp target teams distribute parallel for num_teams(league_size) num_threads(team_size*vector_length) schedule(static,1) \
+         map(to:this->m_functor,scratch_ptr)
+      for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
+        typename Policy::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
+        m_functor(TagType(), team);
+      }
+    }
+
+public:
+
+  /** \brief  Store copies of the functor and the policy and precompute m_shmem_size. */
+  inline
+  ParallelFor( const FunctorType & arg_functor ,
+               const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    {}
+};
+
+/** \brief  Reduction over a TeamPolicyInternal for functors without a join and without a
+ *          reducer (FunctorHasJoin == 0, UseReducerType == 0).
+ *
+ * A flat loop of league_size*team_size*vector_length iterations is offloaded; each iteration
+ * builds a team member handle whose league rank is i/(team_size*vector_length) and calls the
+ * functor with it. The reduction uses the OpenMP 'reduction(+: result)' clause, and the
+ * iteration's result is reset to 0 whenever the handle's m_vector_lane is not 0. The final
+ * value is stored through result_ptr.
+ */
+template<class FunctorType, class ReducerType, class PointerType, class ValueType, class ... PolicyArgs>
+struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, ReducerType, PointerType, ValueType, 0,0> {
+  typedef TeamPolicyInternal<PolicyArgs...> PolicyType;
+
+  /** \brief  Untagged variant (work tag is void): calls f(team, result). */
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
+    {
+      // The label names the operation actually being run (it previously said parallel_for).
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+
+      const int league_size = p.league_size();
+      const int team_size = p.team_size();
+      const int vector_length = p.vector_length();
+      // Request at most MAX_ACTIVE_TEAMS teams.
+      const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
+
+      OpenMPTargetExec::resize_scratch(0,PolicyType::member_type::TEAM_REDUCE_SIZE,0,0);
+      void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+      ValueType result = ValueType();
+      #pragma omp target teams distribute parallel for num_teams(nteams) num_threads(team_size*vector_length) \
+         map(to:f,scratch_ptr) map(tofrom:result) reduction(+: result) schedule(static,1)
+      for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
+        typename PolicyType::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
+        f(team,result);
+        if(team.m_vector_lane!=0) result = 0;
+      }
+
+      *result_ptr=result;
+    }
+
+
+  /** \brief  Tagged variant: calls f(TagType(), team, result). */
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+
+      const int league_size = p.league_size();
+      const int team_size = p.team_size();
+      const int vector_length = p.vector_length();
+      // Request at most MAX_ACTIVE_TEAMS teams.
+      const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
+
+      OpenMPTargetExec::resize_scratch(0,PolicyType::member_type::TEAM_REDUCE_SIZE,0,0);
+      void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+      ValueType result = ValueType();
+      #pragma omp target teams distribute parallel for num_teams(nteams) num_threads(team_size*vector_length) \
+         map(to:f,scratch_ptr) map(tofrom:result) reduction(+: result) schedule(static,1)
+      for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
+        typename PolicyType::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
+        f(TagType(),team,result);
+        // Use the same member as the untagged variant; this overload previously read
+        // team.vector_lane, which disagrees with the m_vector_lane name used there.
+        if(team.m_vector_lane!=0) result = 0;
+      }
+      *result_ptr=result;
+    }
+
+
+    /** \brief  Dispatch to the tagged or untagged implementation based on the policy's work tag. */
+    inline static
+    void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
+      execute_impl<typename PolicyType::work_tag>(f,p,ptr);
+    }
+};
+
+
+/// ParallelReduce specialization for a Kokkos::TeamPolicy executed on the
+/// OpenMPTarget backend. It keeps copies of the functor, policy and reducer
+/// together with the result pointer, and forwards execute() to the matching
+/// ParallelReduceSpecialize.
+template< class FunctorType , class ReducerType, class ... Properties >
+class ParallelReduce< FunctorType
+                    , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
+                    , Kokkos::Experimental::OpenMPTarget
+                    >
+{
+private:
+
+  using Policy  = Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... > ;
+  using WorkTag = typename Policy::work_tag ;
+  using Member  = typename Policy::member_type ;
+
+  // Type the value traits are derived from; chosen by if_c depending on
+  // whether ReducerType is InvalidType.
+  using ReducerConditional = Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ;
+  using ReducerTypeFwd     = typename ReducerConditional::type ;
+
+  using ValueTraits = Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ;
+  using ValueInit   = Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag > ;
+  using ValueJoin   = Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd , WorkTag > ;
+
+  using pointer_type   = typename ValueTraits::pointer_type ;
+  using reference_type = typename ValueTraits::reference_type ;
+  using value_type     = typename ValueTraits::value_type ;
+
+  // Compile-time switches that select the ParallelReduceSpecialize variant.
+  enum {HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
+  enum {UseReducer = is_reducer_type<ReducerType>::value };
+
+  using ParForSpecialize = ParallelReduceSpecialize< FunctorType
+                                                   , Policy
+                                                   , ReducerType
+                                                   , pointer_type
+                                                   , typename ValueTraits::value_type
+                                                   , HasJoin
+                                                   , UseReducer > ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+  const ReducerType  m_reducer ;
+  const pointer_type m_result_ptr ;
+  const int          m_shmem_size ;
+
+public:
+
+  /// Run the reduction through the selected specialization.
+  inline
+  void execute() const {
+    ParForSpecialize::execute(m_functor,m_policy,m_result_ptr);
+  }
+
+  /// Construct from a functor and a result view (enabled only when
+  /// ViewType is a Kokkos view and ReducerType is not a reducer).
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const Policy       & arg_policy ,
+                  const ViewType     & arg_result ,
+                  typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                    ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result.ptr_on_device() )
+    , m_shmem_size( arg_policy.scratch_size(0)
+                  + arg_policy.scratch_size(1)
+                  + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    {}
+
+  /// Construct from a functor and a reducer; the result pointer is taken
+  /// from the reducer's result view.
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+    , Policy       arg_policy
+    , const ReducerType& reducer )
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr(  reducer.result_view().data() )
+  , m_shmem_size( arg_policy.scratch_size(0)
+                + arg_policy.scratch_size(1)
+                + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+  {
+  /*static_assert( std::is_same< typename ViewType::memory_space
+                          , Kokkos::HostSpace >::value
+  , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
+  }
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+
+namespace Kokkos {
+namespace Impl {
+
+  /// Loop bounds for a team-thread range on OpenMPTarget: iteration begins at
+  /// the member's team rank (plus an optional begin offset), stops before
+  /// \c end, and advances by the team size.
+  template<typename iType>
+  struct TeamThreadRangeBoundariesStruct<iType,OpenMPTargetExecTeamMember> {
+    using index_type = iType;
+
+    const iType start;
+    const iType end;
+    const iType increment;
+
+    /// Range [0,count) strided across the team.
+    inline
+    TeamThreadRangeBoundariesStruct (const OpenMPTargetExecTeamMember& member, const iType& count)
+      : start( member.team_rank() )
+      , end( count )
+      , increment( member.team_size() )
+    {}
+
+    /// Range [begin_,end_) strided across the team.
+    inline
+    TeamThreadRangeBoundariesStruct (const OpenMPTargetExecTeamMember& member, const iType& begin_, const iType& end_)
+      : start( begin_+member.team_rank() )
+      , end( end_ )
+      , increment( member.team_size() )
+    {}
+  };
+
+  /// Loop bounds for a thread-vector range on OpenMPTarget: iteration begins
+  /// at the member's vector lane (plus an optional begin offset), stops before
+  /// \c end, and advances by the vector length.
+  template<typename iType>
+  struct ThreadVectorRangeBoundariesStruct<iType,OpenMPTargetExecTeamMember> {
+    using index_type = iType;
+
+    const iType start;
+    const iType end;
+    const iType increment;
+
+    /// Range [0,count) strided across the vector lanes.
+    inline
+    ThreadVectorRangeBoundariesStruct (const OpenMPTargetExecTeamMember& member, const iType& count)
+      : start( member.m_vector_lane )
+      , end( count )
+      , increment( member.m_vector_length )
+    {}
+
+    /// Range [begin_,end_) strided across the vector lanes.
+    inline
+    ThreadVectorRangeBoundariesStruct (const OpenMPTargetExecTeamMember& member, const iType& begin_, const iType& end_)
+      : start( begin_+member.m_vector_lane )
+      , end( end_ )
+      , increment( member.m_vector_length )
+    {}
+  };
+
+  /// Build team-thread loop bounds from a team member and an iteration count.
+  template<typename iType>
+  KOKKOS_INLINE_FUNCTION
+  Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
+    TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
+    using range_type = Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>;
+    return range_type(thread,count);
+  }
+  
+  /// Build team-thread loop bounds from a team member and a [begin,end) pair.
+  template<typename iType>
+  KOKKOS_INLINE_FUNCTION
+  Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
+    TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
+    using range_type = Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>;
+    return range_type(thread,begin,end);
+  }
+
+  /// Build thread-vector loop bounds from a team member and an iteration count.
+  template<typename iType>
+  KOKKOS_INLINE_FUNCTION
+  Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >
+    ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
+    using range_type = Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >;
+    return range_type(thread,count);
+  }
+
+  /// Build thread-vector loop bounds from a team member and a [begin,end) pair.
+  template<typename iType>
+  KOKKOS_INLINE_FUNCTION
+  Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
+    ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
+    using range_type = Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>;
+    return range_type(thread,begin,end);
+  }
+
+}
+
+}
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */
+
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
@ -0,0 +1,329 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
+
+//----------------------------------------------------------------------------
+
+// Default construction: a team of size one that is not bound to any
+// thread-pool entry (both exec pointers are null, all sync state is zero).
+TaskExec< Kokkos::Experimental::OpenMPTarget >::
+TaskExec()
+  : m_self_exec( nullptr ) , m_team_exec( nullptr )
+  , m_sync_mask( 0 ) , m_sync_value( 0 ) , m_sync_step( 0 )
+  , m_group_rank( 0 ) , m_team_rank( 0 ) , m_team_size( 1 )
+{}
+
+// Construct the task-execution handle of one pool thread that takes part in a
+// team of arg_team_size threads.
+//   arg_exec       this thread's OpenMPTargetExec pool entry
+//   arg_team_size  threads per team; group rank and team rank are the quotient
+//                  and remainder of arg_exec.pool_rank_rev() by this value
+//                  (it is used as a divisor, so it must be non-zero)
+TaskExec< Kokkos::Experimental::OpenMPTarget >::
+TaskExec( Kokkos::Impl::OpenMPTargetExec & arg_exec , int const arg_team_size )
+  : m_self_exec( & arg_exec )
+  , m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
+  , m_sync_mask( 0 )
+  , m_sync_value( 0 )
+  , m_sync_step( 0 )
+  , m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
+  , m_team_rank(  arg_exec.pool_rank_rev() % arg_team_size )
+  , m_team_size(  arg_team_size )
+{
+  // This team spans
+  //    m_self_exec->pool_rev( team_size * group_rank )
+  //    m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
+
+  // Clear the two 64-bit synchronization words kept in this thread's
+  // scratch_reduce area.
+  int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
+
+  sync[0] = int64_t(0) ;
+  sync[1] = int64_t(0) ;
+
+  // One byte per team member: the expected value starts at 0x01 in each
+  // member's byte, and the mask covers the low two bits (0x03) of that byte.
+  for ( int i = 0 ; i < m_team_size ; ++i ) {
+    m_sync_value |= int64_t(1) << (8*i);
+    m_sync_mask  |= int64_t(3) << (8*i);
+  }
+
+  Kokkos::memory_fence();
+}
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+// Spin-wait barrier across the members of this team.
+// Each member stores the low two bits of the expected value into its own byte
+// of a 64-bit word in the team's scratch_reduce area, then spins until the
+// whole word equals m_sync_value. Two words are used alternately (selected by
+// the parity of m_sync_step). Aborts if the scratch_reduce area is smaller
+// than two int64_t.
+void TaskExec< Kokkos::Experimental::OpenMPTarget >::team_barrier_impl() const
+{
+  if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
+    Kokkos::abort("TaskQueue<OpenMPTarget> scratch_reduce memory too small");
+  }
+
+  // Use team shared memory to synchronize.
+  // Alternate memory locations between barriers to avoid a sequence
+  // of barriers overtaking one another.
+
+  int64_t volatile * const sync =
+    ((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
+
+  // This team member sets one byte within the sync variable
+  int8_t volatile * const sync_self =
+   ((int8_t *) sync) + m_team_rank ;
+
+#if 0
+fprintf( stdout
+       , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
+       , m_group_rank
+       , m_team_rank
+       , m_sync_step
+       , m_sync_value
+       , *sync
+       );
+fflush(stdout);
+#endif
+
+  *sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
+
+  while ( m_sync_value != *sync ); // wait for team to arrive
+
+#if 0
+fprintf( stdout
+       , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
+       , m_group_rank
+       , m_team_rank
+       , m_sync_step
+       , m_sync_value
+       , *sync
+       );
+fflush(stdout);
+#endif
+
+  ++m_sync_step ;
+
+  // After both words have been used once, flip the expected value
+  // (XOR with the 0x03-per-byte mask turns 0x01 into 0x02 and back) so the
+  // next use of each word waits for a different pattern than the one it
+  // currently holds.
+  if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
+    m_sync_value ^= m_sync_mask ;
+    // Keep the step counter bounded; the reset only happens on an even step,
+    // so the word-selection parity is preserved.
+    if ( 1000 < m_sync_step ) m_sync_step = 0 ;
+  }
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+// Execute tasks from the given queue inside an OpenMP parallel region.
+// The pool threads are grouped into teams of PoolExec::pool_size(2) threads
+// (at most 8, otherwise the call aborts). In each team, rank 0 pops a task
+// from the ready queues and shares it with the other members through the
+// team's scratch_thread area. Team tasks are applied by every member;
+// single-thread tasks are applied by rank 0 only. A thread leaves the loop
+// when the task it obtains is null.
+void TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >::execute
+  ( TaskQueue< Kokkos::Experimental::OpenMPTarget > * const queue )
+{
+  using execution_space = Kokkos::Experimental::OpenMPTarget ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using PoolExec        = Kokkos::Impl::OpenMPTargetExec ;
+  using Member          = TaskExec< execution_space > ;
+
+  // Sentinel pointer value; used below to mean "no task acquired yet".
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  // Required:  team_size <= 8
+
+  const int team_size = PoolExec::pool_size(2); // Threads per core
+  // const int team_size = PoolExec::pool_size(1); // Threads per NUMA
+
+  if ( 8 < team_size ) {
+    Kokkos::abort("TaskQueue<OpenMPTarget> unsupported team size");
+  }
+
+#pragma omp parallel
+  {
+    PoolExec & self = *PoolExec::get_thread_omp();
+
+    // single_exec is handed to single-thread tasks, team_exec to team tasks.
+    Member single_exec ;
+    Member team_exec( self , team_size );
+
+    // Team shared memory
+    task_root_type * volatile * const task_shared =
+      (task_root_type **) team_exec.m_team_exec->scratch_thread();
+
+// Barrier across entire OpenMPTarget thread pool to ensure initialization
+#pragma omp barrier
+
+    // Loop until all queues are empty and no tasks in flight
+
+    do {
+
+      task_root_type * task = 0 ;
+
+      // Each team lead attempts to acquire either a thread team task
+      // or a single thread task for the team.
+
+      if ( 0 == team_exec.team_rank() ) {
+
+        // Start from 'end' while the queue reports a positive ready count,
+        // otherwise from null (which makes the team leave the loop below).
+        task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+
+        // Loop by priority and then type
+        for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+          for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+            task = queue_type::pop_task( & queue->m_ready[i][j] );
+          }
+        }
+      }
+
+      // Team lead broadcast acquired task to team members:
+
+      if ( 1 < team_exec.team_size() ) {
+
+        if ( 0 == team_exec.team_rank() ) *task_shared = task ;
+
+        // Fence to be sure task_shared is stored before the barrier
+        Kokkos::memory_fence();
+
+        // Whole team waits for every team member to reach this statement
+        team_exec.team_barrier();
+
+        // Fence to be sure task_shared is stored
+        Kokkos::memory_fence();
+
+        task = *task_shared ;
+      }
+
+#if 0
+fprintf( stdout
+       , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
+       , team_exec.m_group_rank
+       , team_exec.m_team_rank
+       , uintptr_t(task_shared)
+       , uintptr_t(task)
+       );
+fflush(stdout);
+#endif
+
+      if ( 0 == task ) break ; // 0 == m_ready_count
+
+      if ( end == task ) {
+        // All team members wait for whole team to reach this statement.
+        // Is necessary to prevent task_shared from being updated
+        // before it is read by all threads.
+        team_exec.team_barrier();
+      }
+      else if ( task_root_type::TaskTeam == task->m_task_type ) {
+        // Thread Team Task
+        (*task->m_apply)( task , & team_exec );
+
+        // The m_apply function performs a barrier
+
+        if ( 0 == team_exec.team_rank() ) {
+          // team member #0 completes the task, which may delete the task
+          queue->complete( task ); 
+        }
+      }
+      else {
+        // Single Thread Task
+
+        if ( 0 == team_exec.team_rank() ) {
+
+          (*task->m_apply)( task , & single_exec );
+
+          queue->complete( task ); 
+        }
+
+        // All team members wait for whole team to reach this statement.
+        // Not necessary to complete the task.
+        // Is necessary to prevent task_shared from being updated
+        // before it is read by all threads.
+        team_exec.team_barrier();
+      }
+    } while(1);
+  }
+// END #pragma omp parallel
+
+}
+
+// Drain the ready queues on the calling thread, but only when
+// omp_get_num_threads() reports exactly one thread; otherwise do nothing.
+void TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >::
+  iff_single_thread_recursive_execute
+    ( TaskQueue< Kokkos::Experimental::OpenMPTarget > * const queue )
+{
+  using execution_space = Kokkos::Experimental::OpenMPTarget ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using Member          = TaskExec< execution_space > ;
+
+  // Guard clause: only act when running single threaded.
+  if ( 1 != omp_get_num_threads() ) return ;
+
+  // Sentinel meaning "no task popped".
+  task_root_type * const sentinel = (task_root_type *) task_root_type::EndTag ;
+
+  Member serial_exec ;
+
+  for (;;) {
+
+    task_root_type * next = sentinel ;
+
+    // Visit the ready queues by priority and then by type, stopping at the
+    // first pop that yields something other than the sentinel.
+    for ( int prio = 0 ; prio < queue_type::NumQueue && sentinel == next ; ++prio ) {
+      for ( int kind = 0 ; kind < 2 && sentinel == next ; ++kind ) {
+        next = queue_type::pop_task( & queue->m_ready[prio][kind] );
+      }
+    }
+
+    // Nothing was popped from any queue: done.
+    if ( sentinel == next ) break ;
+
+    (*next->m_apply)( next , & serial_exec );
+
+    queue->complete( next );
+  }
+}
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
+
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp
@ -0,0 +1,356 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
+#define KOKKOS_IMPL_OPENMP_TASK_HPP
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/// Task-queue hooks for the OpenMPTarget execution space: type aliases, the
+/// two queue execution entry points, and the helper that stores a task's
+/// apply function pointer.
+template<>
+class TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >
+{
+public:
+
+  using execution_space = Kokkos::Experimental::OpenMPTarget ;
+  using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
+  using task_base_type  = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+
+  // A memory space must be specified; HostSpace is used here.
+  using memory_space = Kokkos::HostSpace ;
+
+  // Required task queue execution function.
+  static void execute( queue_type * const );
+
+  static void iff_single_thread_recursive_execute( queue_type * const );
+
+  // Required mechanism to set the function pointer in the execution space
+  // from the host process: stores the apply function of the TaskBase
+  // instantiated for FunctorType into *ptr.
+  template< typename FunctorType >
+  static
+  void proc_set_apply( task_base_type::function_type * ptr )
+    {
+      *ptr = TaskBase< execution_space
+                     , typename FunctorType::value_type
+                     , FunctorType
+                     >::apply ;
+    }
+};
+
+extern template class TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
+
+//----------------------------------------------------------------------------
+
+/// Per-thread handle passed to tasks running on the OpenMPTarget backend.
+/// It exposes the team rank and size, the team shared scratch area and a
+/// team barrier. Instances are neither copyable nor movable, and are only
+/// constructible by the befriended TaskQueue / TaskQueueSpecialization.
+template<>
+class TaskExec< Kokkos::Experimental::OpenMPTarget >
+{
+private:
+
+  TaskExec( TaskExec && ) = delete ;
+  TaskExec( TaskExec const & ) = delete ;
+  TaskExec & operator = ( TaskExec && ) = delete ;
+  TaskExec & operator = ( TaskExec const & ) = delete ;
+
+
+  using PoolExec = Kokkos::Impl::OpenMPTargetExec ;
+
+  friend class Kokkos::Impl::TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
+  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget > ;
+
+  PoolExec * const m_self_exec ;  ///< This thread's thread pool data structure 
+  PoolExec * const m_team_exec ;  ///< Team thread's thread pool data structure
+  int64_t          m_sync_mask ;  ///< XOR mask applied to m_sync_value by the barrier
+  int64_t mutable  m_sync_value ; ///< Word value the barrier waits for
+  int     mutable  m_sync_step ;  ///< Barrier step counter; its parity selects the sync word
+  int              m_group_rank ; ///< Which "team" subset of thread pool
+  int              m_team_rank ;  ///< Which thread within a team
+  int              m_team_size ;
+
+  TaskExec();
+  TaskExec( PoolExec & arg_exec , int arg_team_size );
+
+  // Declared unconditionally; only called from team_barrier() in the
+  // host-execution branch below.
+  void team_barrier_impl() const ;
+
+public:
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  /// Team scratch area, or null when there is no team exec (m_team_exec is null).
+  void * team_shared() const
+    { return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
+
+  /// Size of the team scratch area, or 0 when there is no team exec.
+  int team_shared_size() const
+    { return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
+
+  /**\brief  Whole team enters this function call
+   *         before any team member returns from
+   *         this function call.
+   *
+   *  No-op when the team size is 1.
+   */
+  void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
+#else
+  // Outside the host execution space these are stubs: no barrier and no
+  // shared memory.
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+  KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
+#endif
+
+  /// Rank of this thread within its team.
+  KOKKOS_INLINE_FUNCTION
+  int team_rank() const { return m_team_rank ; }
+
+  /// Number of threads in this thread's team.
+  KOKKOS_INLINE_FUNCTION
+  int team_size() const { return m_team_size ; }
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/// Wrap a task team member and an iteration count in the team-thread
+/// boundaries struct consumed by the nested parallel_* functions.
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >
+TeamThreadRange
+  ( Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > & thread
+  , const iType & count )
+{
+  using member_type     = Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > ;
+  using boundaries_type = Impl::TeamThreadRangeBoundariesStruct< iType , member_type > ;
+
+  return boundaries_type( thread , count );
+}
+
+/// Wrap a task team member and a start/end pair in the team-thread
+/// boundaries struct consumed by the nested parallel_* functions.
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >
+TeamThreadRange
+  ( Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > & thread
+  , const iType & start
+  , const iType & end )
+{
+  using member_type     = Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > ;
+  using boundaries_type = Impl::TeamThreadRangeBoundariesStruct< iType , member_type > ;
+
+  return boundaries_type( thread , start , end );
+}
+
+/** \brief  Inter-thread parallel_for over a TeamThreadRange.
+ *
+ * Calls lambda(i) for every index this team member owns: starting at
+ * loop_boundaries.start, advancing by loop_boundaries.increment, and
+ * stopping before loop_boundaries.end.
+ * This functionality requires C++11 support.
+*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries
+  , const Lambda& lambda
+  )
+{
+  iType idx = loop_boundaries.start ;
+
+  while ( idx < loop_boundaries.end ) {
+    lambda(idx);
+    idx += loop_boundaries.increment ;
+  }
+}
+
+/** \brief  Inter-thread parallel_reduce over a TeamThreadRange, combining with +=.
+ *
+ * Each team member accumulates its share of the range into a private copy
+ * seeded from initialized_result. With more than one member, the partial
+ * results are combined through the team shared scratch area and the total is
+ * written back to initialized_result on every member. All members of the team
+ * must call this function, since it uses team barriers.
+ *
+ * NOTE(review): every member seeds its partial from initialized_result, so a
+ * non-zero initial value appears to be summed once per member - confirm this
+ * is intended.
+ */
+template<typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries
+  , const Lambda& lambda
+  , ValueType& initialized_result)
+{
+  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
+  ValueType result = initialized_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i, result);
+  }
+
+  if ( 1 < loop_boundaries.thread.team_size() ) {
+
+    // Assumes the shared area holds at least team_size ValueType entries;
+    // this is not checked here.
+    ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
+
+    // Barrier before writing, so earlier readers of the shared area are done.
+    loop_boundaries.thread.team_barrier();
+    shared[team_rank] = result;
+
+    // Barrier so every partial result is stored before rank 0 reads them.
+    loop_boundaries.thread.team_barrier();
+
+    // reduce across threads to thread 0
+    if (team_rank == 0) {
+      for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
+        shared[0] += shared[i];
+      }
+    }
+
+    // Barrier so the total in shared[0] is complete before anyone reads it.
+    loop_boundaries.thread.team_barrier();
+
+    // broadcast result
+    initialized_result = shared[0];
+  }
+  else {
+    initialized_result = result ;
+  }
+}
+
+/** \brief  Inter-thread parallel_reduce over a TeamThreadRange with a
+ *          user-supplied join.
+ *
+ * Same scheme as the += overload, except that partial results are combined
+ * on rank 0 by calling join(shared[0], shared[i]). All members of the team
+ * must call this function, since it uses team barriers.
+ *
+ * NOTE(review): every member seeds its partial from initialized_result, so
+ * the initial value is joined once per member - confirm this is intended.
+ */
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
+  ValueType result = initialized_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i, result);
+  }
+
+  if ( 1 < loop_boundaries.thread.team_size() ) {
+    // Assumes the shared area holds at least team_size ValueType entries;
+    // this is not checked here.
+    ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
+
+    // Barrier before writing, so earlier readers of the shared area are done.
+    loop_boundaries.thread.team_barrier();
+    shared[team_rank] = result;
+
+    // Barrier so every partial result is stored before rank 0 reads them.
+    loop_boundaries.thread.team_barrier();
+
+    // reduce across threads to thread 0
+    if (team_rank == 0) {
+      for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
+        join(shared[0], shared[i]);
+      }
+    }
+
+    // Barrier so the total in shared[0] is complete before anyone reads it.
+    loop_boundaries.thread.team_barrier();
+
+    // broadcast result
+    initialized_result = shared[0];
+  }
+  else {
+    initialized_result = result ;
+  }
+}
+
+// placeholder for future function
+// Currently a no-op: the body is empty, so lambda is never invoked and
+// initialized_result is left unchanged.
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result)
+{
+}
+
+// placeholder for future function
+// Currently a no-op: the body is empty, so neither lambda nor join is invoked
+// and initialized_result is left unchanged.
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+}
+
+/** \brief  Inter-thread parallel_scan over a TeamThreadRange.
+ *
+ * Two passes are made over this member's share of the range. The first pass
+ * computes the member's local total. With more than one member, the totals
+ * are exchanged through the team shared scratch area and prefix-summed by
+ * rank 0, which gives each member its starting offset. The second pass
+ * repeats the scan starting from that offset.
+ *
+ * In both passes lambda(i, x, false) is called to obtain the contribution of
+ * index i, and lambda(i, val, true) is called with the running value.
+ *
+ * Team shared memory and barriers are only used when the team has more than
+ * one member. team_shared() can return a null pointer (for example when the
+ * member has no team exec), so a single-member team must not touch it.
+ *
+ * All members of the team must call this function.
+ */
+template< typename ValueType, typename iType, class Lambda >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
+   const Lambda & lambda)
+{
+  ValueType accum = 0 ;
+  ValueType val, local_total;
+  int team_size = loop_boundaries.thread.team_size();
+  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
+
+  // A single-member team needs no exchange; skipping it avoids writing
+  // through a possibly-null shared pointer.
+  const bool use_shared = 1 < team_size ;
+  ValueType *shared = use_shared ? (ValueType*) loop_boundaries.thread.team_shared()
+                                 : (ValueType*) 0 ;
+
+  // Intra-member scan
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    local_total = 0;
+    lambda(i,local_total,false);
+    val = accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+
+  if ( use_shared ) {
+    shared[team_rank] = accum;
+    loop_boundaries.thread.team_barrier();
+
+    // Member 0 do scan on accumulated totals
+    if (team_rank == 0) {
+      for( iType i = 1; i < team_size; i+=1) {
+        shared[i] += shared[i-1];
+      }
+    }
+
+    loop_boundaries.thread.team_barrier();
+  }
+
+  // Starting offset for the inter-member scan: rank 0 (and a single-member
+  // team) starts from 0, every other member from the inclusive total of the
+  // ranks before it.
+  if ( use_shared && team_rank != 0 ) { accum = shared[team_rank-1]; }
+  else { accum = 0; }
+
+  // Inter-member scan adding in accumulated totals
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    local_total = 0;
+    lambda(i,local_total,false);
+    val = accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+}
+
+// placeholder for future function
+// Currently a no-op: the body is empty, so lambda is never invoked.
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
+   const Lambda & lambda)
+{
+}
+
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
+
--- a/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp
@ -41,12 +41,13 @@
 //@HEADER
 */

-#include <Kokkos_Core_fwd.hpp>
-
+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ENABLE_QTHREADS )

-#include <stdio.h>
-#include <stdlib.h>
+#include <Kokkos_Core_fwd.hpp>
+
+#include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <sstream>
 #include <utility>
@ -215,6 +216,8 @@ void Qthreads::fence()
 int Qthreads::shepherd_size() const { return Impl::s_number_shepherds; }
 int Qthreads::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd; }

+// Returns the name of this execution space as a string literal.
+const char* Qthreads::name() { return "Qthreads"; }
+
 } // namespace Kokkos

 //----------------------------------------------------------------------------
@ -516,4 +519,7 @@ QthreadsTeamPolicyMember::QthreadsTeamPolicyMember( const QthreadsTeamPolicyMemb

 } // namespace Kokkos

+#else
+// Empty function compiled only when KOKKOS_ENABLE_QTHREADS is not defined.
+// Per its name it exists to prevent a link error; presumably so that this
+// translation unit still defines a symbol when the backend is disabled.
+void KOKKOS_SRC_QTHREADS_EXEC_PREVENT_LINK_ERROR() {}
 #endif // #if defined( KOKKOS_ENABLE_QTHREADS )
+
--- a/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp
@ -44,6 +44,9 @@
 #ifndef KOKKOS_QTHREADSEXEC_HPP
 #define KOKKOS_QTHREADSEXEC_HPP

+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS )
+
 #include <impl/Kokkos_spinwait.hpp>

 //----------------------------------------------------------------------------
@ -637,4 +640,6 @@ public:

 //----------------------------------------------------------------------------

+#endif
 #endif // #define KOKKOS_QTHREADSEXEC_HPP
+
--- a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp
@ -44,6 +44,9 @@
 #ifndef KOKKOS_QTHREADS_PARALLEL_HPP
 #define KOKKOS_QTHREADS_PARALLEL_HPP

+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS )
+
 #include <vector>

 #include <Kokkos_Parallel.hpp>
@ -724,4 +727,6 @@ void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& sing

 } // namespace Kokkos

+#endif
 #endif /* #define KOKKOS_QTHREADS_PARALLEL_HPP */
+
--- a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
@ -41,10 +41,10 @@
 //@HEADER
 */

-#include <Kokkos_Core.hpp>
-
+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )

+#include <Kokkos_Core.hpp>
 #include <impl/Kokkos_TaskQueue_impl.hpp>

 //----------------------------------------------------------------------------
@ -314,7 +314,7 @@ void TaskQueueSpecialization< Kokkos::Qthreads >::
 }} /* namespace Kokkos::Impl */

 //----------------------------------------------------------------------------
-
+#else
+void KOKKOS_SRC_QTHREADS_TASK_PREVENT_LINK_ERROR() {} // empty stub, compiled unless both KOKKOS_ENABLE_QTHREADS and KOKKOS_ENABLE_TASKPOLICY are defined; presumably keeps this translation unit from being empty
 #endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

-
--- a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp
@ -44,7 +44,8 @@
 #ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP
 #define KOKKOS_IMPL_QTHREADS_TASK_HPP

-#if defined( KOKKOS_ENABLE_TASKPOLICY )
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )

 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
--- a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.cpp.old
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.cpp.old
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,20 +36,22 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */

 // Experimental unified task-data parallel manycore LDRD.

-#include <Kokkos_Core_fwd.hpp>

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ENABLE_QTHREADS )

-#include <stdio.h>
+#include <Kokkos_Core_fwd.hpp>
+
+#include <cstdio>
+#include <cstdlib>

-#include <stdlib.h>
 #include <stdexcept>
 #include <iostream>
 #include <sstream>
@ -448,7 +450,7 @@ TaskPolicy< Kokkos::Qthreads >::
 TaskPolicy
  ( const unsigned /* arg_task_max_count */
  , const unsigned /* arg_task_max_size */
-  , const unsigned arg_task_default_dependence_capacity 
+  , const unsigned arg_task_default_dependence_capacity
  , const unsigned arg_task_team_size
  )
  : m_default_dependence_capacity( arg_task_default_dependence_capacity )
@ -484,5 +486,8 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads > & policy )
 } // namespace Experimental
 } // namespace Kokkos

+#else
+void KOKKOS_CORE_SRC_QTHREADS_KOKKOS_QTHREADS_TASKPOLICY_PREVENT_LINK_ERROR() {} // empty stub in the #else branch; NOTE(review): the matching #if is not visible here (the #endif below names KOKKOS_ENABLE_TASKDAG) - confirm which configuration reaches it
 #endif // #if defined( KOKKOS_ENABLE_TASKDAG )
 #endif // #if defined( KOKKOS_ENABLE_QTHREADS )
+
--- a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.hpp.old
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.hpp.old
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -46,6 +46,9 @@
 #ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP
 #define KOKKOS_QTHREADS_TASKSCHEDULER_HPP

+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_TASKDAG )
+
 #include <string>
 #include <typeinfo>
 #include <stdexcept>
@ -69,7 +72,6 @@

 #include <impl/Kokkos_FunctorAdapter.hpp>

-#if defined( KOKKOS_ENABLE_TASKDAG )

 //----------------------------------------------------------------------------

@ -505,7 +507,7 @@ public:

  template< class ValueType >
  const Future< ValueType , execution_space > &
-    spawn( const Future< ValueType , execution_space > & f 
+    spawn( const Future< ValueType , execution_space > & f
         , const bool priority = false ) const
      {
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
@ -632,7 +634,7 @@ public:
    }

  template< class FunctorType >
-  void respawn( FunctorType * task_functor 
+  void respawn( FunctorType * task_functor
              , const bool priority = false ) const
    {
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
--- a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp
@ -41,7 +41,11 @@
 //@HEADER
 */

-#if defined( KOKKOS_ENABLE_TASKPOLICY )
+#ifndef KOKKOS_QTHREADS_TASKQUEUE_HPP
+#define KOKKOS_QTHREADS_TASKQUEUE_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )

 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@ -60,7 +64,7 @@ private:
  using execution_space = Kokkos::Qthread ;
  using memory_space    = Kokkos::HostSpace
  using device_type     = Kokkos::Device< execution_space, memory_space > ;
-  using memory_pool     = Kokkos::Experimental::MemoryPool< device_type > ;
+  using memory_pool     = Kokkos::MemoryPool< device_type > ;
  using task_root_type  = Kokkos::Impl::TaskBase< execution_space, void, void > ;

  friend class Kokkos::TaskScheduler< execution_space > ;
@ -317,3 +321,5 @@ public:
 //----------------------------------------------------------------------------

 #endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif // KOKKOS_QTHREADS_TASKQUEUE_HPP
+
--- a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp
@ -41,7 +41,11 @@
 //@HEADER
 */

-#if defined( KOKKOS_ENABLE_TASKPOLICY )
+#ifndef KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
+#define KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )

 namespace Kokkos {
 namespace Impl {
@ -433,4 +437,5 @@ fflush(stdout);


 #endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif // KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP

--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@ -41,11 +41,13 @@
 //@HEADER
 */

+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_THREADS )
+
 #include <Kokkos_Core_fwd.hpp>

-#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD )
-
-#include <stdint.h>
+#include <cstdint>
 #include <limits>
 #include <utility>
 #include <iostream>
@ -512,8 +514,8 @@ void ThreadsExec::print_configuration( std::ostream & s , const bool detail )

  s << "Kokkos::Threads" ;

-#if defined( KOKKOS_ENABLE_PTHREAD )
-  s << " KOKKOS_ENABLE_PTHREAD" ;
+#if defined( KOKKOS_ENABLE_THREADS )
+  s << " KOKKOS_ENABLE_THREADS" ;
 #endif
 #if defined( KOKKOS_ENABLE_HWLOC )
  s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ;
@ -817,10 +819,12 @@ int Threads::thread_pool_rank()
 }
 #endif

+const char* Threads::name() { return "Threads"; } // fixed, statically allocated label for this execution space
 } /* namespace Kokkos */

 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
-
-#endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD ) */
+#else
+void KOKKOS_CORE_SRC_THREADS_EXEC_PREVENT_LINK_ERROR() {} // empty stub, compiled only when KOKKOS_ENABLE_THREADS is undefined; presumably keeps this translation unit from being empty
+#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */

--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@ -44,7 +44,10 @@
 #ifndef KOKKOS_THREADSEXEC_HPP
 #define KOKKOS_THREADSEXEC_HPP

-#include <stdio.h>
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_THREADS )
+
+#include <cstdio>

 #include <utility>
 #include <impl/Kokkos_spinwait.hpp>
@ -626,6 +629,6 @@ inline void Threads::fence()

 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
-
+#endif
 #endif /* #define KOKKOS_THREADSEXEC_HPP */

--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,18 +36,16 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */

-#include <Kokkos_Core_fwd.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ENABLE_PTHREAD )

+#include <Kokkos_Core_fwd.hpp>
 /* Standard 'C' Linux libraries */

 #include <pthread.h>
@ -154,6 +152,8 @@ void ThreadsExec::wait_yield( volatile int & flag , const int value )

 #elif defined( KOKKOS_ENABLE_WINTHREAD )

+#include <Kokkos_Core_fwd.hpp>
+
 /* Windows libraries */
 #include <winsock2.h>
 #include <windows.h>
@ -247,9 +247,7 @@ void ThreadsExec::wait_yield( volatile int & flag , const int value ) {}
 } // namespace Impl
 } // namespace Kokkos

+#else
+void KOKKOS_CORE_SRC_THREADS_EXEC_BASE_PREVENT_LINK_ERROR() {} // empty stub for the #else of the KOKKOS_ENABLE_PTHREAD / KOKKOS_ENABLE_WINTHREAD chain; presumably keeps this translation unit from being empty
 #endif /* end #elif defined( KOKKOS_ENABLE_WINTHREAD ) */
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-

--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
@ -44,7 +44,10 @@
 #ifndef KOKKOS_THREADSTEAM_HPP
 #define KOKKOS_THREADSTEAM_HPP

-#include <stdio.h>
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_THREADS )
+
+#include <cstdio>

 #include <utility>
 #include <impl/Kokkos_spinwait.hpp>
@ -173,7 +176,9 @@ public:
  }

  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !Kokkos::is_reducer< Type >::value , Type>::type
+  team_reduce( const Type & value ) const
 #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    { return Type(); }
 #else
@ -202,6 +207,55 @@ public:
    }
 #endif

+    template< typename ReducerType > // Team-wide reduction driven by a reducer object; the combined value is written back to reducer.reference().
+    KOKKOS_INLINE_FUNCTION
+    typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type // overload participates only for reducer types; returns void
+  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    team_reduce( const ReducerType & ) const
+      {} // no-op when the active execution memory space is not the host
+  #else
+    team_reduce( const ReducerType & reducer ) const
+    {
+      typedef typename ReducerType::value_type value_type;
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
+                           , value_type , void >::type type ; // NOTE(review): presumably becomes void when value_type does not fit, so the dereferences below fail to compile - confirm
+
+      if ( 0 == m_exec ) return ; // no executor attached: leave the reducer's value untouched
+
+      type * const local_value = ((type*) m_exec->scratch_memory());
+
+      // Set this thread's contribution
+      *local_value = reducer.reference() ;
+
+      // Fence to make sure the base team member has access:
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        type * const team_value = ((type*) m_team_base[0]->scratch_memory()); // slot 0 accumulates the team result
+
+        // Join to the team value:
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          reducer.join( *team_value , *((type*) m_team_base[i]->scratch_memory()) );
+        }
+
+        // Team base thread may "lap" member threads so copy out to their local value.
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          *((type*) m_team_base[i]->scratch_memory()) = *team_value ;
+        }
+
+        // Fence to make sure all team members have access
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      // Value was changed by the team base
+      reducer.reference() = *((type volatile const *) local_value); // volatile read of this thread's scratch slot
+    }
+  #endif
+
  template< class ValueType, class JoinOp >
  KOKKOS_INLINE_FUNCTION ValueType
    team_reduce( const ValueType & value
@ -760,7 +814,8 @@ void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Thread
 * val is performed and put into result. This functionality requires C++11 support.*/
 template< typename iType, class Lambda, typename ValueType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
+typename std::enable_if< !Kokkos::is_reducer< ValueType >::value >::type
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
                     const Lambda & lambda, ValueType& result) {

  result = ValueType();
@ -774,6 +829,21 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Thr
  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
 }

+template< typename iType, class Lambda, typename ReducerType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type // overload participates only for reducer types; returns void
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const ReducerType& reducer) {
+  // Team-thread-range reduction through a reducer: accumulate locally, then combine via the team member.
+  reducer.init(reducer.reference()); // reset the reducer's value - presumably to the identity of its join; confirm against the reducer type
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,reducer.reference()); // lambda folds iteration i into the reducer's value
+  }
+
+  loop_boundaries.thread.team_reduce(reducer); // hand the reducer to the team member's team_reduce to combine values across the team
+}
+
 /** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
@ -822,17 +892,26 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Thre
 * val is performed and put into result. This functionality requires C++11 support.*/
 template< typename iType, class Lambda, typename ValueType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+typename std::enable_if< !Kokkos::is_reducer< ValueType >::value >::type // plain-value overload: removed from overload resolution when ValueType is a reducer; returns void
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
       loop_boundaries, const Lambda & lambda, ValueType& result) {
   result = ValueType();
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
     lambda(i,result);
   }
 }

+template< typename iType, class Lambda, typename ReducerType > // Thread-vector-range reduction through a reducer object.
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type // overload participates only for reducer types; returns void
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+      loop_boundaries, const Lambda & lambda, const ReducerType& reducer) {
+  reducer.init(reducer.reference()); // reset the reducer's value - presumably to the identity of its join; confirm against the reducer type
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,reducer.reference()); // lambda folds iteration i into the reducer's value
+  }
+} // no further combine step here: the result is whatever the loop left in reducer.reference()
+
 /** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
@ -915,6 +994,6 @@ void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_

 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
-
+#endif
 #endif /* #define KOKKOS_THREADSTEAM_HPP */

--- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -44,8 +44,11 @@
 #ifndef KOKKOS_THREADS_PARALLEL_HPP
 #define KOKKOS_THREADS_PARALLEL_HPP

+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_THREADS )
+
 #include <vector>
-#include <iostream> 
+#include <iostream>

 #include <Kokkos_Parallel.hpp>

@ -83,7 +86,7 @@ private:
  exec_range( const FunctorType & functor
            , const Member ibeg , const Member iend )
    {
-      #if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
      #pragma ivdep
      #endif
@ -99,7 +102,7 @@ private:
            , const Member ibeg , const Member iend )
    {
      const TagType t{} ;
-      #if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
      #pragma ivdep
      #endif
@ -308,7 +311,7 @@ private:
            , const Member & ibeg , const Member & iend
            , reference_type update )
    {
-      #if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
      #pragma ivdep
      #endif
@ -325,7 +328,7 @@ private:
            , reference_type update )
    {
      const TagType t{} ;
-      #if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
      #pragma ivdep
      #endif
@ -348,7 +351,7 @@ private:
    const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );

    ParallelReduce::template exec_range< WorkTag >
-      ( self.m_functor , range.begin() , range.end() 
+      ( self.m_functor , range.begin() , range.end()
      , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );

    exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
@ -428,7 +431,7 @@ public:
    : m_functor( arg_functor )
    , m_policy(  arg_policy )
    , m_reducer( reducer )
-    , m_result_ptr(  reducer.result_view().data() )
+    , m_result_ptr(  reducer.view().data() )
    {
      /*static_assert( std::is_same< typename ViewType::memory_space
                                      , Kokkos::HostSpace >::value
@ -543,7 +546,7 @@ public:
  : m_functor( arg_functor )
  , m_policy(  arg_policy )
  , m_reducer( reducer )
-  , m_result_ptr(  reducer.result_view().data() )
+  , m_result_ptr(  reducer.view().data() )
  , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
  {
  /*static_assert( std::is_same< typename ViewType::memory_space
@ -584,7 +587,7 @@ private:
            , const Member & ibeg , const Member & iend
            , reference_type update , const bool final )
    {
-      #if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
      #pragma ivdep
      #endif
@ -601,7 +604,7 @@ private:
            , reference_type update , const bool final )
    {
      const TagType t{} ;
-      #if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
      #pragma ivdep
      #endif
@ -654,5 +657,6 @@ public:
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------

+#endif
 #endif /* #define KOKKOS_THREADS_PARALLEL_HPP */

--- a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
+++ b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
@ -44,24 +44,20 @@
 #ifndef KOKKOS_HOST_EXP_ITERATE_TILE_HPP
 #define KOKKOS_HOST_EXP_ITERATE_TILE_HPP

-#include <iostream>
-#include <algorithm>
-#include <stdio.h>
-
 #include <Kokkos_Macros.hpp>
-
-#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_ENABLE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__) // opt-in ivdep hint for MDRange tile loops; uses the same KOKKOS_ENABLE_PRAGMA_IVDEP guard as the other back-ends and is skipped when __CUDA_ARCH__ is defined
 #define KOKKOS_MDRANGE_IVDEP
 #endif

-
 #ifdef KOKKOS_MDRANGE_IVDEP
 #define KOKKOS_ENABLE_IVDEP_MDRANGE _Pragma("ivdep")
 #else
 #define KOKKOS_ENABLE_IVDEP_MDRANGE
 #endif

-
+#include <iostream>
+#include <algorithm>
+#include <cstdio>

 namespace Kokkos { namespace Experimental { namespace Impl {

@ -243,7 +239,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
    APPLY( func, i0 + m_offset[0] )              \
-  } 
+  }

 #define LOOP_LAYOUT_2( func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -255,7 +251,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
      LOOP_R_1( func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
    } \
-  } 
+  }

 #define LOOP_LAYOUT_3( func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -267,7 +263,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
      LOOP_R_2( func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
    } \
-  } 
+  }

 #define LOOP_LAYOUT_4( func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -279,7 +275,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
      LOOP_R_3( func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
    } \
-  } 
+  }

 #define LOOP_LAYOUT_5( func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -291,7 +287,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
      LOOP_R_4( func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
    } \
-  } 
+  }

 #define LOOP_LAYOUT_6( func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -303,7 +299,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
      LOOP_R_5( func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
    } \
-  } 
+  }

 #define LOOP_LAYOUT_7( func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -315,7 +311,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
      LOOP_R_6( func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
    } \
-  } 
+  }

 #define LOOP_LAYOUT_8( func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -327,7 +323,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
      LOOP_R_7( func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
    } \
-  } 
+  }

 // Partial vs Full Tile
 #define TILE_LOOP_1( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
@ -459,7 +455,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
    APPLY_REDUX( val, func, i0 + m_offset[0] )              \
-  } 
+  }

 #define LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -471,7 +467,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
      LOOP_R_1_REDUX( val, func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
    } \
-  } 
+  }

 #define LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -483,7 +479,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
      LOOP_R_2_REDUX( val, func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
    } \
-  } 
+  }

 #define LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -495,7 +491,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
      LOOP_R_3_REDUX( val, func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
    } \
-  } 
+  }

 #define LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -507,7 +503,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
      LOOP_R_4_REDUX( val, func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
    } \
-  } 
+  }

 #define LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -519,7 +515,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
      LOOP_R_5_REDUX( val, func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
    } \
-  } 
+  }

 #define LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -531,7 +527,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
      LOOP_R_6_REDUX( val, func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
    } \
-  } 
+  }

 #define LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -543,7 +539,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
      LOOP_R_7_REDUX( val, func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
    } \
-  } 
+  }

 // Partial vs Full Tile
 #define TILE_LOOP_1_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
@ -676,7 +672,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
    TAGGED_APPLY( tag, func, i0 + m_offset[0] )              \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_2( tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -688,7 +684,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
      TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
    } \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_3( tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -700,7 +696,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
      TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
    } \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_4( tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -712,7 +708,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
      TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
    } \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_5( tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -724,7 +720,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
      TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
    } \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_6( tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -736,7 +732,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
      TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
    } \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_7( tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -748,7 +744,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
      TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
    } \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_8( tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -760,7 +756,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
      TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
    } \
-  } 
+  }

 // Partial vs Full Tile
 #define TAGGED_TILE_LOOP_1( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
@ -892,7 +888,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
    TAGGED_APPLY_REDUX( val, tag, func, i0 + m_offset[0] )              \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_2_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -904,7 +900,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
      TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
    } \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_3_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -916,7 +912,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
      TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
    } \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_4_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -928,7 +924,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
      TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
    } \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_5_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -940,7 +936,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
      TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
    } \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_6_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -952,7 +948,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
      TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
    } \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_7_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -964,7 +960,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
      TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
    } \
-  } 
+  }

 #define TAGGED_LOOP_LAYOUT_8_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
  if (is_left) { \
@ -976,7 +972,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
      TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
    } \
-  } 
+  }

 // Partial vs Full Tile
 #define TAGGED_TILE_LOOP_1_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
@ -1321,8 +1317,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
        }
        else {
          is_full_tile = false ;
-            partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 
-                            : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i]) 
+            partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
+                            : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
                            : (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
        }
      }
@ -1332,7 +1328,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i


  template <int Rank>
-  struct RankTag 
+  struct RankTag
  {
    typedef RankTag type;
    enum { value = (int)Rank };
@ -1343,13 +1339,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
  inline
  void
  operator()(IType tile_idx) const
-  { 
+  {
    point_type m_offset;
    point_type m_tiledims;

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -1361,13 +1357,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
    }

    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
-    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; 
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;

    Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );

  }

-#else 
+#else
  template <typename IType>
  inline
  void
@ -1384,7 +1380,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -1396,7 +1392,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
    }

    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
-    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; 
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;

    if (RP::inner_direction == RP::Left) {
     if ( full_tile ) {
@ -1437,7 +1433,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -1490,7 +1486,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -1543,7 +1539,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -1596,7 +1592,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -1649,7 +1645,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -1702,7 +1698,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -1804,8 +1800,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
        }
        else {
          is_full_tile = false ;
-            partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 
-                            : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i]) 
+            partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
+                            : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
                            : (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
        }
      }
@ -1815,7 +1811,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i


  template <int Rank>
-  struct RankTag 
+  struct RankTag
  {
    typedef RankTag type;
    enum { value = (int)Rank };
@ -1827,13 +1823,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
  inline
  void
  operator()(IType tile_idx) const
-  { 
+  {
    point_type m_offset;
    point_type m_tiledims;

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -1845,13 +1841,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
    }

    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
-    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; 
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;

    Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );

  }

-#else 
+#else
  template <typename IType>
  inline
  void
@ -1869,7 +1865,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -1881,7 +1877,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
    }

    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
-    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; 
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;

    if (RP::inner_direction == RP::Left) {
     if ( full_tile ) {
@ -1922,7 +1918,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -1975,7 +1971,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -2028,7 +2024,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -2081,7 +2077,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -2134,7 +2130,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -2187,7 +2183,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i

    if (RP::outer_direction == RP::Left) {
      for (int i=0; i<RP::rank; ++i) {
-        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
        tile_idx /= m_rp.m_tile_end[i];
      }
    }
@ -2352,5 +2348,5 @@ struct MDFunctor< MDRange, Functor, void >

 } } } //end namespace Kokkos::Experimental::Impl

-
 #endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
@ -195,3 +195,4 @@ struct PolicyTraits


 #endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP
+
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,10 +36,12 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
+
+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_ASSEMBLY_HPP )
 #define KOKKOS_ATOMIC_ASSEMBLY_HPP
 namespace Kokkos {
@ -110,3 +112,4 @@ namespace Impl {
 }

 #endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@ -41,6 +41,7 @@
 //@HEADER
 */

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
 #define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP

--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
@ -41,6 +41,7 @@
 //@HEADER
 */

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT_HPP )
 #define KOKKOS_ATOMIC_DECREMENT_HPP

@ -117,3 +118,4 @@ void atomic_decrement(volatile T* a) {

 } // End of namespace Kokkos
 #endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@ -41,6 +41,7 @@
 //@HEADER
 */

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
 #define KOKKOS_ATOMIC_EXCHANGE_HPP

@ -356,5 +357,3 @@ void atomic_assign( volatile T * const dest , const T val )

 #endif

-//----------------------------------------------------------------------------
-
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
@ -41,6 +41,7 @@
 //@HEADER
 */

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
 #define KOKKOS_ATOMIC_FETCH_ADD_HPP

@ -279,6 +280,7 @@ T atomic_fetch_add( volatile T * const dest ,
 {
  while( !Impl::lock_address_host_space( (void*) dest ) );
  T return_val = *dest;
+
  // Don't use the following line of code here:
  //
  //const T tmp = *dest = return_val + val;
@ -292,6 +294,7 @@ T atomic_fetch_add( volatile T * const dest ,
  const T tmp = *dest;
  (void) tmp;
  Impl::unlock_address_host_space( (void*) dest );
+
  return return_val;
 }
 //----------------------------------------------------------------------------
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,11 +36,12 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
 #define KOKKOS_ATOMIC_FETCH_AND_HPP

@ -124,4 +125,3 @@ void atomic_and(volatile T * const dest, const T src) {

 #endif

-
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,11 +36,12 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
 #define KOKKOS_ATOMIC_FETCH_OR_HPP

@ -124,4 +125,3 @@ void atomic_or(volatile T * const dest, const T src) {

 #endif

-
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,11 +36,12 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
 #define KOKKOS_ATOMIC_FETCH_SUB_HPP

@ -238,4 +239,3 @@ void atomic_sub(volatile T * const dest, const T src) {
 #include<impl/Kokkos_Atomic_Assembly.hpp>
 #endif

-
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,10 +36,12 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
+
+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_GENERIC_HPP )
 #define KOKKOS_ATOMIC_GENERIC_HPP
 #include <Kokkos_Macros.hpp>
@ -424,6 +426,6 @@ T atomic_rshift_fetch(volatile T * const dest, const unsigned int val) {
  return Impl::atomic_oper_fetch(Impl::RShiftOper<T,const unsigned int>(),dest,val);
 }

-
-}
+} // namespace Kokkos
 #endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp
@ -41,6 +41,7 @@
 //@HEADER
 */

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT_HPP )
 #define KOKKOS_ATOMIC_INCREMENT_HPP

@ -115,3 +116,4 @@ void atomic_increment(volatile T* a) {

 } // End of namespace Kokkos
 #endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -428,3 +428,4 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
 }} // namespace Kokkos::Impl

 #endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,12 +36,13 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 #ifndef KOKKOS_ATOMIC_WINDOWS_HPP
 #define KOKKOS_ATOMIC_WINDOWS_HPP
+
 #ifdef _WIN32

 #define NOMINMAX
--- a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
@ -45,12 +45,38 @@
 #define KOKKOS_BITOPS_HPP

 #include <Kokkos_Macros.hpp>
-#include <stdint.h>
+#include <cstdint>
 #include <climits>

 namespace Kokkos {
 namespace Impl {

+/**\brief  Find first (lowest-order) zero bit of \c i by locating the
+ *         lowest set bit of \c ~i .
+ *  \return zero-based bit index; if none (all bits set) then return -1 ;
+ */
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_first_zero( unsigned i ) noexcept
+{
+  enum : unsigned { full = ~0u }; // all bits set: the only input with no zero bit
+
+#if defined( __CUDA_ARCH__ )
+  return full != i ? __ffs( ~i ) - 1 : -1 ; // __ffs is one-based
+#elif defined( KOKKOS_COMPILER_INTEL )
+  return full != i ? _bit_scan_forward( ~i ) : -1 ;
+#elif defined( KOKKOS_COMPILER_IBM )
+  return full != i ? __cnttz4( ~i ) : -1 ;
+#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
+  return full != i ? __builtin_ffs( ~i ) - 1 : -1 ; // __builtin_ffs is one-based
+#else
+  int offset = -1 ;
+  if ( full != i ) {
+    for ( offset = 0 ; i & ( 1u << offset ) ; ++offset ); // unsigned mask: the top bit is reachable
+  }
+  return offset ;
+#endif
+}
+
 KOKKOS_FORCEINLINE_FUNCTION
 int bit_scan_forward( unsigned i )
 {
@ -121,7 +147,16 @@ int bit_count( unsigned i )
 #endif
 }

+KOKKOS_INLINE_FUNCTION
+unsigned integral_power_of_two_that_contains( const unsigned N )
+{ // Returns an exponent, not a power: bit_scan_reverse( N ), plus one when 2^exponent < N.
+  const unsigned exponent = Kokkos::Impl::bit_scan_reverse( N );
+  return N > ( 1u << exponent ) ? exponent + 1 : exponent ;
+}
+
+
 } // namespace Impl
 } // namespace Kokkos

 #endif // KOKKOS_BITOPS_HPP
+
--- a/lib/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp
@ -122,3 +122,4 @@ int mpi_local_rank_on_node() {

 }
 }
+
--- a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp
@ -0,0 +1,106 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CLOCKTIC_HPP
+#define KOKKOS_CLOCKTIC_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <stdint.h>
+#include <chrono>
+
+namespace Kokkos {
+namespace Impl {
+
+/**\brief  Quick query of clock register tics
+ *
+ *  Primary use case is to, with low overhead, obtain an
+ *  integral value that consistently varies across concurrent
+ *  threads of execution within a parallel algorithm.
+ *  This value is often used to "randomly" seed an
+ *  attempt to acquire an indexed resource (e.g., bit)
+ *  from an array of resources (e.g., bitset) such that
+ *  concurrent threads will have high likelihood of
+ *  having different index-seed values.
+ *  \return raw tic count; its unit and epoch depend on the branch taken.
+ */
+KOKKOS_FORCEINLINE_FUNCTION
+uint64_t clock_tic(void) noexcept
+{
+#if defined( __CUDA_ARCH__ )
+
+  // Return value of 64-bit hi-res clock register.
+
+  return clock64();
+
+#elif defined( __i386__ ) || defined( __x86_64 )
+
+  // Return value of 64-bit hi-res clock register.
+
+  unsigned a = 0, d = 0; // rdtsc delivers the low half in eax and the high half in edx
+
+  __asm__ volatile( "rdtsc" : "=a" (a), "=d" (d) );
+
+  return ( (uint64_t) a ) | ( ( (uint64_t) d ) << 32 );
+
+#elif defined( __powerpc )     || defined( __powerpc__ ) || \
+      defined( __powerpc64__ ) || defined( __POWERPC__ ) || \
+      defined( __ppc__ )       || defined( __ppc64__ )
+
+  unsigned long cycles = 0; // register-width operand so a 64-bit time base is not truncated
+
+  asm volatile( "mftb %0" : "=r" (cycles) );
+
+  return (uint64_t) cycles;
+
+#else
+
+  return (uint64_t)
+    std::chrono::high_resolution_clock::now().time_since_epoch().count();
+
+#endif
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif // KOKKOS_CLOCKTIC_HPP
--- a/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
@ -0,0 +1,357 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CONCURRENTBITSET_HPP
+#define KOKKOS_CONCURRENTBITSET_HPP
+
+#include <stdint.h>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_BitOps.hpp>
+#include <impl/Kokkos_ClockTic.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+struct concurrent_bitset {
+public:
+
+  // 32 bits per integer value
+
+  enum : uint32_t { bits_per_int_lg2  = 5 };
+  enum : uint32_t { bits_per_int_mask = ( 1 << bits_per_int_lg2 ) - 1 };
+
+  // Buffer is uint32_t[ buffer_bound ]
+  //   [ uint32_t { state_header | used_count } , uint32_t bits[*] ]
+  //
+  //  Maximum bit count is 33 million (1u<<25):
+  //
+  //  - Maximum bit set size occupies 4 Mbyte (1M uint32_t words)
+  //
+  //  - State header can occupy bits [30-26]
+  //    which can be the bit_count_lg2
+  //
+  //  - Accept at least 33 million concurrent calls to 'acquire'
+  //    before risking an overflow race condition on a full bitset.
+
+  enum : uint32_t { max_bit_count_lg2 = 25 };
+  enum : uint32_t { max_bit_count     = 1u << max_bit_count_lg2 };
+  enum : uint32_t { state_shift = 26 };
+  enum : uint32_t { state_used_mask   = ( 1 << state_shift ) - 1 };
+  enum : uint32_t { state_header_mask = uint32_t(0x001f) << state_shift };
+
+  KOKKOS_INLINE_FUNCTION static constexpr
+  uint32_t buffer_bound_lg2( uint32_t const bit_bound_lg2 ) noexcept
+    { // uint32_t words for (1u << bit_bound_lg2) bits: 1 state word + bit words
+      return bit_bound_lg2 <= max_bit_count_lg2
+           ? 1 + ( 1u << ( bit_bound_lg2 > bits_per_int_lg2
+                         ? bit_bound_lg2 - bits_per_int_lg2 : 0 ) )
+           : 0 ; // 0 when bit_bound_lg2 exceeds max_bit_count_lg2
+    }
+
+  /**\brief  uint32_t words needed for 'bit_bound' bits; 0 if above max_bit_count */
+  KOKKOS_INLINE_FUNCTION static constexpr
+  uint32_t buffer_bound( uint32_t const bit_bound ) noexcept
+    {
+      return bit_bound <= max_bit_count
+           ? 1 + ( bit_bound >> bits_per_int_lg2 ) +   // state word + full words
+             ( bit_bound & bits_per_int_mask ? 1 : 0 ) // + one partial word if any
+           : 0 ;
+    }
+
+  /**\brief  Claim any bit within the bitset bound.
+   *
+   *  Return : ( which_bit , bit_count )
+   *
+   *  if success then
+   *    bit_count is the atomic-count of claimed > 0
+   *    which_bit is the claimed bit >= 0
+   *  else if attempt failed due to filled buffer
+   *    bit_count == which_bit == -1
+   *  else if attempt failed due to non-matching state_header
+   *    bit_count == which_bit == -2
+   *  else if attempt failed due to max_bit_count_lg2 < bit_bound_lg2
+   *                             or invalid state_header
+   *                             or (1u << bit_bound_lg2) <= bit
+   *    bit_count == which_bit == -3
+   *  endif
+   *
+   *  Recommended to have hint
+   *    bit = Kokkos::Impl::clock_tic() & ((1u<<bit_bound_lg2) - 1)
+   */
+  KOKKOS_INLINE_FUNCTION static
+  Kokkos::pair<int,int>
+  acquire_bounded_lg2( uint32_t volatile * const buffer
+                     , uint32_t const bit_bound_lg2
+                     , uint32_t bit = 0                /* optional hint */
+                     , uint32_t const state_header = 0 /* optional header */
+                     ) noexcept
+    {
+      typedef Kokkos::pair<int,int> type ;
+
+      // Reject bad arguments before shifting by bit_bound_lg2.
+      // 'bit' must be a valid index, i.e. strictly below the bound.
+      if ( ( max_bit_count_lg2 < bit_bound_lg2 ) ||
+           ( state_header & ~state_header_mask ) ||
+           ( ( 1u << bit_bound_lg2 ) <= bit ) ) {
+        return type(-3,-3);
+      }
+      const uint32_t bit_bound  = 1u << bit_bound_lg2 ;
+      const uint32_t word_count = bit_bound >> bits_per_int_lg2 ;
+      // Use potentially two fetch_add to avoid CAS loop.
+      // Could generate "racing" failure-to-acquire
+      // when is full at the atomic_fetch_add(+1)
+      // then a release occurs before the atomic_fetch_add(-1).
+
+      const uint32_t state = (uint32_t)
+        Kokkos::atomic_fetch_add( (volatile int *) buffer , 1 );
+      // 'state' is the value of buffer[0] before the increment.
+      const uint32_t state_error =
+        state_header != ( state & state_header_mask );
+
+      const uint32_t state_bit_used = state & state_used_mask ;
+
+      if ( state_error || ( bit_bound <= state_bit_used ) ) {
+        Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
+        return state_error ? type(-2,-2) : type(-1,-1);
+      }
+
+      // Do not update bit until count is visible:
+
+      Kokkos::memory_fence();
+
+      // There is a zero bit available somewhere,
+      // now find the (first) available bit and set it.
+
+      while(1) {
+        // buffer[0] is the state word, so bit words start at buffer + 1.
+        const uint32_t word = bit >> bits_per_int_lg2 ;
+        const uint32_t mask = 1u << ( bit & bits_per_int_mask );
+        const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask);
+
+        if ( ! ( prev & mask ) ) {
+          // Successfully claimed 'bit' by
+          // atomically setting that bit.
+          return type( bit , state_bit_used + 1 );
+        }
+
+        // Failed race to set the selected bit
+        // Find a new bit to try.
+
+        const int j = Kokkos::Impl::bit_first_zero( prev );
+        // j < 0 is treated as "no zero bit in this word": move to the next word.
+        if ( 0 <= j ) {
+          bit = ( word << bits_per_int_lg2 ) | uint32_t(j);
+        }
+        else {
+          bit =
+            ( (word+1) < word_count ? ((word+1) << bits_per_int_lg2) : 0 )
+            | ( bit & bits_per_int_mask );
+        }
+      }
+    }
+
+  /**\brief  Claim any bit within the bitset bound.
+   *
+   *  Return : ( which_bit , bit_count )
+   *
+   *  if success then
+   *    bit_count is the atomic-count of claimed > 0
+   *    which_bit is the claimed bit >= 0
+   *  else if attempt failed due to filled buffer
+   *    bit_count == which_bit == -1
+   *  else if attempt failed due to non-matching state_header
+   *    bit_count == which_bit == -2
+   *  else if attempt failed due to max_bit_count < bit_bound
+   *                             or invalid state_header
+   *                             or bit_bound <= bit
+   *    bit_count == which_bit == -3
+   *  endif
+   *
+   *  Recommended to have hint
+   *    bit = Kokkos::Impl::clock_tic() % bit_bound
+   */
+  KOKKOS_INLINE_FUNCTION static
+  Kokkos::pair<int,int>
+  acquire_bounded( uint32_t volatile * const buffer
+                 , uint32_t const bit_bound
+                 , uint32_t bit = 0                /* optional hint */
+                 , uint32_t const state_header = 0 /* optional header */
+                 ) noexcept
+    {
+      typedef Kokkos::pair<int,int> type ;
+
+      if ( ( max_bit_count < bit_bound ) ||
+           ( state_header & ~state_header_mask ) ||
+           ( bit_bound <= bit ) ) {
+        return type(-3,-3);
+      }
+      // Number of complete 32-bit words (rounded down).
+      const uint32_t word_count = bit_bound >> bits_per_int_lg2 ;
+      // NOTE(review): the wrap below stops at word_count, so it never steps into a trailing partial word - confirm.
+      // Use potentially two fetch_add to avoid CAS loop.
+      // Could generate "racing" failure-to-acquire
+      // when is full at the atomic_fetch_add(+1)
+      // then a release occurs before the atomic_fetch_add(-1).
+
+      const uint32_t state = (uint32_t)
+        Kokkos::atomic_fetch_add( (volatile int *) buffer , 1 );
+      // 'state' is the value of buffer[0] before the increment.
+      const uint32_t state_error =
+        state_header != ( state & state_header_mask );
+
+      const uint32_t state_bit_used = state & state_used_mask ;
+
+      if ( state_error || ( bit_bound <= state_bit_used ) ) {
+        Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
+        return state_error ? type(-2,-2) : type(-1,-1);
+      }
+
+      // Do not update bit until count is visible:
+
+      Kokkos::memory_fence();
+
+      // There is a zero bit available somewhere,
+      // now find the (first) available bit and set it.
+
+      while(1) {
+        // buffer[0] is the state word, so bit words start at buffer + 1.
+        const uint32_t word = bit >> bits_per_int_lg2 ;
+        const uint32_t mask = 1u << ( bit & bits_per_int_mask );
+        const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask);
+
+        if ( ! ( prev & mask ) ) {
+          // Successfully claimed 'bit' by
+          // atomically setting that bit.
+          return type( bit , state_bit_used + 1 );
+        }
+
+        // Failed race to set the selected bit
+        // Find a new bit to try.
+
+        const int j = Kokkos::Impl::bit_first_zero( prev );
+        // j < 0 is treated as "no zero bit in this word".
+        if ( 0 <= j ) {
+          bit = (word << bits_per_int_lg2 ) | uint32_t(j);
+        }
+        // Also move on when the candidate lies at or beyond bit_bound.
+        if ( ( j < 0 ) || ( bit_bound <= bit ) ) {
+          bit =
+            ( (word+1) < word_count ? ((word+1) << bits_per_int_lg2) : 0 )
+            | ( bit & bits_per_int_mask );
+        }
+      }
+    }
+
+  /**\brief  Release a bit: clear it and decrement the used count.
+   *
+   *  Requires: 'bit' previously acquired and has not yet been released.
+   *  Note: 'bit' is not range-checked here; the caller must pass a valid index.
+   *  Returns:
+   *    0 <= used count after successful release
+   *    -1 bit was already released
+   *    -2 state_header error
+   */
+  KOKKOS_INLINE_FUNCTION static
+  int release( uint32_t volatile * const buffer
+             , uint32_t const bit
+             , uint32_t const state_header = 0 /* optional header */
+             ) noexcept
+    {
+      if ( state_header != ( state_header_mask & *buffer ) ) { return -2 ; }
+      // Atomically clear the bit; 'prev' is the word's value before clearing.
+      const uint32_t mask = 1u << ( bit & bits_per_int_mask );
+      const uint32_t prev =
+        Kokkos::atomic_fetch_and( buffer + ( bit >> bits_per_int_lg2 ) + 1
+                                , ~mask
+                                );
+      // Bit was not set beforehand: nothing to release, count is left untouched.
+      if ( ! ( prev & mask ) ) { return -1 ; }
+
+      // Do not update count until bit clear is visible
+      Kokkos::memory_fence();
+
+      const int count =
+        Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
+      // 'count' is the state before the decrement: drop the header bits, subtract one.
+      return ( count & state_used_mask ) - 1 ;
+    }
+
+  /**\brief  Set a specific bit and increment the used count.
+   *
+   *  Requires: Bit within bounds and not already set.
+   *
+   *  Returns:
+   *    0 < used count after successful set
+   *    -1 bit was already set
+   *    -2 state_header error
+   */
+  KOKKOS_INLINE_FUNCTION static
+  int set( uint32_t volatile * const buffer
+         , uint32_t const bit
+         , uint32_t const state_header = 0 /* optional header */
+         ) noexcept
+    {
+      if ( state_header != ( state_header_mask & *buffer ) ) { return -2 ; }
+
+      const uint32_t mask = 1u << ( bit & bits_per_int_mask );
+      const uint32_t prev =
+        Kokkos::atomic_fetch_or( buffer + ( bit >> bits_per_int_lg2 ) + 1
+                               , mask
+                               );
+
+      if ( prev & mask ) { return -1 ; } // already set: count is unchanged
+
+      // Do not update count until bit set is visible
+      Kokkos::memory_fence();
+
+      const int count =
+        Kokkos::atomic_fetch_add( (volatile int *) buffer , 1 );
+
+      return ( count & state_used_mask ) + 1 ;
+    }
+};
+
+}} // namespace Kokkos::Impl
+
+#endif /* #ifndef KOKKOS_CONCURRENTBITSET_HPP */
+
--- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
@ -46,6 +46,7 @@
 #include <cctype>
 #include <cstring>
 #include <iostream>
+#include <sstream>
 #include <cstdlib>

 //----------------------------------------------------------------------------
@ -75,10 +76,10 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
 #endif

  // Protect declarations, to prevent "unused variable" warnings.
-#if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_PTHREAD )
+#if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMPTARGET )
  const int num_threads = args.num_threads;
  const int use_numa = args.num_numa;
-#endif // defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_PTHREAD )
+#endif // defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMPTARGET )
 #if defined( KOKKOS_ENABLE_CUDA )
  const int use_gpu = args.device_id;
 #endif // defined( KOKKOS_ENABLE_CUDA )
@ -103,7 +104,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
  }
 #endif

-#if defined( KOKKOS_ENABLE_PTHREAD )
+#if defined( KOKKOS_ENABLE_THREADS )
  if( std::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
      std::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
    if(num_threads>0) {
@ -135,6 +136,25 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
  }
 #endif

+#if defined( KOKKOS_ENABLE_OPENMPTARGET )
+  if( Impl::is_same< Kokkos::Experimental::OpenMPTarget , Kokkos::DefaultExecutionSpace >::value ) {
+    if(num_threads>0) {
+      if(use_numa>0) {
+        Kokkos::Experimental::OpenMPTarget::initialize(num_threads,use_numa);
+      }
+      else {
+        Kokkos::Experimental::OpenMPTarget::initialize(num_threads);
+      }
+    } else {
+      Kokkos::Experimental::OpenMPTarget::initialize();
+    }
+    //std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
+  }
+  else {
+    //std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
+  }
+#endif
+
 #if defined( KOKKOS_ENABLE_CUDA )
  if( std::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {
    if (use_gpu > -1) {
@ -166,6 +186,13 @@ void finalize_internal( const bool all_spaces = false )
  }
 #endif

+#if defined( KOKKOS_ENABLE_OPENMPTARGET )
+  if( std::is_same< Kokkos::Experimental::OpenMPTarget , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
+    if(Kokkos::Experimental::OpenMPTarget::is_initialized())
+      Kokkos::Experimental::OpenMPTarget::finalize();
+  }
+#endif
+
 #if defined( KOKKOS_ENABLE_OPENMP )
  if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
      std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ||
@ -175,7 +202,7 @@ void finalize_internal( const bool all_spaces = false )
  }
 #endif

-#if defined( KOKKOS_ENABLE_PTHREAD )
+#if defined( KOKKOS_ENABLE_THREADS )
  if( std::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
      std::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ||
      all_spaces ) {
@ -210,7 +237,7 @@ void fence_internal()
  }
 #endif

-#if defined( KOKKOS_ENABLE_PTHREAD )
+#if defined( KOKKOS_ENABLE_THREADS )
  if( std::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
      std::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
    Kokkos::Threads::fence();
@ -226,6 +253,39 @@ void fence_internal()

 }

+// True when 'arg' names exactly the option 'expected', optionally followed
+// by a suffix such as "=value".  For expected == "--threads" the argument
+// "--threads-for-application" is a different, longer name and is rejected.
+bool check_arg(char const* arg, char const* expected) {
+  std::size_t const exp_len = std::strlen(expected);
+  // A shorter 'arg' mismatches at its terminating NUL, so no length test
+  // is needed and arg[exp_len] below is always within the string.
+  if (std::strncmp(arg, expected, exp_len) != 0) return false;
+  // <cctype> functions need a value representable as unsigned char;
+  // passing a plain (possibly negative) char is undefined behavior.
+  unsigned char const next = static_cast<unsigned char>(arg[exp_len]);
+  if (next == '\0') return true;
+  return !(std::isalnum(next) || next == '-' || next == '_');
+}
+
+bool check_int_arg(char const* arg, char const* expected, int* value) {
+  if (!check_arg(arg, expected)) return false;
+  // After check_arg() succeeds, arg[exp_len] is within the string.  Only '='
+  // may precede the number; 'number' is dereferenced only after that test,
+  // so a bare "--flag" never reads past its terminating NUL.
+  std::size_t const exp_len = std::strlen(expected);
+  char const* const number = arg + exp_len + 1;
+  if (arg[exp_len] != '=' || number[0] == '\0' ||
+      !Impl::is_unsigned_int(number)) {
+    std::ostringstream ss;
+    ss << "Error: expecting an '=INT' after command line argument '" << expected << "'";
+    ss << ". Raised by Kokkos::initialize(int narg, char* argc[]).";
+    Impl::throw_runtime_exception( ss.str() );
+  }
+  *value = std::atoi(number);  // written only for a validated "=INT" suffix
+  return true;
+}
+
 } // namespace
 } // namespace Impl
 } // namespace Kokkos
@ -248,76 +308,31 @@ void initialize(int& narg, char* arg[])
    int iarg = 0;

    while (iarg < narg) {
-      if ((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || (strncmp(arg[iarg],"--threads",9) == 0)) {
-        //Find the number of threads (expecting --threads=XX)
-        if (!((strncmp(arg[iarg],"--kokkos-threads=",17) == 0) || (strncmp(arg[iarg],"--threads=",10) == 0)))
-          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        char* number =  strchr(arg[iarg],'=')+1;
-
-        if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
-          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        if((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || !kokkos_threads_found)
-          num_threads = atoi(number);
-
-        //Remove the --kokkos-threads argument from the list but leave --threads
-        if(strncmp(arg[iarg],"--kokkos-threads",16) == 0) {
-          for(int k=iarg;k<narg-1;k++) {
-            arg[k] = arg[k+1];
-          }
-          kokkos_threads_found=1;
-          narg--;
-        } else {
-          iarg++;
+      if (Impl::check_int_arg(arg[iarg], "--kokkos-threads", &num_threads)) {
+        for(int k=iarg;k<narg-1;k++) {
+          arg[k] = arg[k+1];
        }
-      } else if ((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || (strncmp(arg[iarg],"--numa",6) == 0)) {
-        //Find the number of numa (expecting --numa=XX)
-        if (!((strncmp(arg[iarg],"--kokkos-numa=",14) == 0) || (strncmp(arg[iarg],"--numa=",7) == 0)))
-          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        char* number =  strchr(arg[iarg],'=')+1;
-
-        if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
-          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        if((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || !kokkos_numa_found)
-          numa = atoi(number);
-
-        //Remove the --kokkos-numa argument from the list but leave --numa
-        if(strncmp(arg[iarg],"--kokkos-numa",13) == 0) {
-          for(int k=iarg;k<narg-1;k++) {
-            arg[k] = arg[k+1];
-          }
-          kokkos_numa_found=1;
-          narg--;
-        } else {
-          iarg++;
+        kokkos_threads_found=1;
+        narg--;
+      } else if (!kokkos_threads_found && Impl::check_int_arg(arg[iarg], "--threads", &num_threads)) {
+        iarg++;
+      } else if (Impl::check_int_arg(arg[iarg], "--kokkos-numa", &numa)) {
+        for(int k=iarg;k<narg-1;k++) {
+          arg[k] = arg[k+1];
        }
-      } else if ((strncmp(arg[iarg],"--kokkos-device",15) == 0) || (strncmp(arg[iarg],"--device",8) == 0)) {
-        //Find the number of device (expecting --device=XX)
-        if (!((strncmp(arg[iarg],"--kokkos-device=",16) == 0) || (strncmp(arg[iarg],"--device=",9) == 0)))
-          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        char* number =  strchr(arg[iarg],'=')+1;
-
-        if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
-          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        if((strncmp(arg[iarg],"--kokkos-device",15) == 0) || !kokkos_device_found)
-          device = atoi(number);
-
-        //Remove the --kokkos-device argument from the list but leave --device
-        if(strncmp(arg[iarg],"--kokkos-device",15) == 0) {
-          for(int k=iarg;k<narg-1;k++) {
-            arg[k] = arg[k+1];
-          }
-          kokkos_device_found=1;
-          narg--;
-        } else {
-          iarg++;
+        kokkos_numa_found=1;
+        narg--;
+      } else if (!kokkos_numa_found && Impl::check_int_arg(arg[iarg], "--numa", &numa)) {
+        iarg++;
+      } else if (Impl::check_int_arg(arg[iarg], "--kokkos-device", &device)) {
+        for(int k=iarg;k<narg-1;k++) {
+          arg[k] = arg[k+1];
        }
-      } else if ((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || (strncmp(arg[iarg],"--ndevices",10) == 0)) {
+        kokkos_device_found=1;
+        narg--;
+      } else if (!kokkos_device_found && Impl::check_int_arg(arg[iarg], "--device", &device)) {
+        iarg++;
+      } else if (Impl::check_arg(arg[iarg], "--kokkos-ndevices") || Impl::check_arg(arg[iarg], "--ndevices")) {

        //Find the number of device (expecting --device=XX)
        if (!((strncmp(arg[iarg],"--kokkos-ndevices=",18) == 0) || (strncmp(arg[iarg],"--ndevices=",11) == 0)))
@ -511,20 +526,8 @@ void print_configuration( std::ostream & out , const bool detail )
 #else
  msg << "no" << std::endl;
 #endif
-  msg << "  KOKKOS_ENABLE_PTHREAD: ";
-#ifdef KOKKOS_ENABLE_PTHREAD
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_STDTHREAD: ";
-#ifdef KOKKOS_ENABLE_STDTHREAD
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_WINTHREAD: ";
-#ifdef KOKKOS_ENABLE_WINTHREAD
+  msg << "  KOKKOS_ENABLE_THREADS: ";
+#ifdef KOKKOS_ENABLE_THREADS
  msg << "yes" << std::endl;
 #else
  msg << "no" << std::endl;
@ -754,7 +757,7 @@ void print_configuration( std::ostream & out , const bool detail )
 #ifdef KOKKOS_ENABLE_OPENMP
  OpenMP::print_configuration(msg, detail);
 #endif
-#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( WINTHREAD )
+#if defined( KOKKOS_ENABLE_THREADS )
  Threads::print_configuration(msg, detail);
 #endif
 #ifdef KOKKOS_ENABLE_QTHREADS
--- a/lib/kokkos/core/src/impl/Kokkos_Error.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Error.cpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,14 +36,14 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */

-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>

 #include <ostream>
 #include <sstream>
@ -127,7 +127,8 @@ std::string human_memory_size(size_t arg_bytes)
 #include <execinfo.h>
 #include <cxxabi.h>
 #include <dlfcn.h>
-#include <stdlib.h>
+
+#include <cstdlib>

 namespace Kokkos {
 namespace Impl {
--- a/lib/kokkos/core/src/impl/Kokkos_Error.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Error.hpp
@ -75,7 +75,9 @@ void abort( const char * const message ) {
 #ifdef __CUDA_ARCH__
  Kokkos::Impl::cuda_abort(message);
 #else
-  Kokkos::Impl::host_abort(message);
+  #ifndef KOKKOS_ENABLE_OPENMPTARGET
+    Kokkos::Impl::host_abort(message);
+  #endif
 #endif
 }

--- a/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp
@ -1,3 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
 #include <Kokkos_Core.hpp>
 namespace Kokkos {
 namespace Impl {
@ -17,3 +60,4 @@ Impl::PerThreadValue PerThread(const int& arg)
 }

 }
+
--- a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
@ -55,6 +55,46 @@
 namespace Kokkos {
 namespace Impl {

+template< class FunctorType, class Enable = void>
+struct ReduceFunctorHasInit {
+  enum {value = false}; // default: '& FunctorType::init' was not usable
+};
+// Chosen when '& FunctorType::init' is well-formed (assumes Impl::enable_if mirrors std::enable_if - confirm).
+template< class FunctorType>
+struct ReduceFunctorHasInit<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type > {
+  enum {value = true};
+};
+// ReduceFunctorHasJoin: same detection for a member named 'join'.
+template< class FunctorType, class Enable = void>
+struct ReduceFunctorHasJoin {
+  enum {value = false};
+};
+
+template< class FunctorType>
+struct ReduceFunctorHasJoin<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type > {
+  enum {value = true};
+};
+// ReduceFunctorHasFinal: same detection for a member named 'final'.
+template< class FunctorType, class Enable = void>
+struct ReduceFunctorHasFinal {
+  enum {value = false};
+};
+
+template< class FunctorType>
+struct ReduceFunctorHasFinal<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type > {
+  enum {value = true};
+};
+// ReduceFunctorHasShmemSize: same detection for 'team_shmem_size'.
+template< class FunctorType, class Enable = void>
+  struct ReduceFunctorHasShmemSize {
+  enum {value = false};
+};
+
+template< class FunctorType>
+struct ReduceFunctorHasShmemSize<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type > {
+  enum {value = true};
+};
+
 template< class FunctorType , class ArgTag , class Enable = void >
 struct FunctorDeclaresValueType : public Impl::false_type {};

@ -63,6 +103,21 @@ struct FunctorDeclaresValueType< FunctorType , ArgTag
                               , typename Impl::enable_if_type< typename FunctorType::value_type >::type >
  : public Impl::true_type {};

+template< class FunctorType, bool Enable =
+      ( FunctorDeclaresValueType<FunctorType,void>::value) ||
+      ( ReduceFunctorHasInit<FunctorType>::value  ) ||
+      ( ReduceFunctorHasJoin<FunctorType>::value  ) ||
+      ( ReduceFunctorHasFinal<FunctorType>::value ) ||
+      ( ReduceFunctorHasShmemSize<FunctorType>::value )
+      >
+struct IsNonTrivialReduceFunctor {
+  enum {value = false}; // used when none of the five traits above is true
+};
+// Chosen when the functor declares value_type or has init/join/final/team_shmem_size.
+template< class FunctorType>
+struct IsNonTrivialReduceFunctor<FunctorType, true> {
+  enum {value = true};
+};

 /** \brief  Query Functor and execution policy argument tag for value type.
 *
--- a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
@ -48,7 +48,6 @@
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_Tags.hpp>
-#include <impl/Kokkos_Reducer.hpp>

 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@ -101,6 +100,29 @@ private:
  using Tag  = typename has_work_tag<>::type ;
  using WTag = typename has_work_tag<>::wtag ;

+  //----------------------------------------
+  // Check for T::execution_space (present and not void)
+  // Primary template: no usable execution_space; 'type' is void.
+  template< typename T , typename = std::false_type >
+  struct has_execution_space { using type = void ; enum { value = false }; };
+  // is_same<X,void>::type is std::false_type only when X is not void, matching the default argument above.
+  template< typename T >
+  struct has_execution_space
+    < T , typename std::is_same< typename T::execution_space , void >::type >
+  {
+    using type = typename T::execution_space ;
+    enum { value = true };
+  };
+
+  using policy_has_space  = has_execution_space< Policy > ;
+  using functor_has_space = has_execution_space< Functor > ;
+  // Agreement is only enforced when both Policy and Functor name a space.
+  static_assert( ! policy_has_space::value ||
+                 ! functor_has_space::value ||
+                 std::is_same< typename policy_has_space::type
+                             , typename functor_has_space::type >::value
+               , "Execution Policy and Functor execution space must match" );
+
  //----------------------------------------
  // Check for Functor::value_type, which is either a simple type T or T[]

@ -190,6 +212,16 @@ private:

 public:

+  using execution_space = typename std::conditional
+    < functor_has_space::value
+    , typename functor_has_space::type
+    , typename std::conditional
+      < policy_has_space::value
+      , typename policy_has_space::type
+      , Kokkos::DefaultExecutionSpace
+      >::type
+    >::type ;
+
  using value_type = typename std::remove_extent< candidate_type >::type ;

  static_assert( ! std::is_const< value_type >::value
@ -214,14 +246,14 @@ public:
 private:

  template< bool IsArray , class FF >
-  KOKKOS_INLINE_FUNCTION static
+  KOKKOS_INLINE_FUNCTION static constexpr
  typename std::enable_if< IsArray , unsigned >::type
  get_length( FF const & f ) { return f.value_count ; }

  template< bool IsArray , class FF >
-  KOKKOS_INLINE_FUNCTION static
+  KOKKOS_INLINE_FUNCTION static constexpr
  typename std::enable_if< ! IsArray , unsigned >::type
-  get_length( FF const & ) { return 1 ; }
+  get_length( FF const & ) { return candidate_is_void ? 0 : 1 ; }

 public:

@ -229,25 +261,25 @@ public:
                           ! candidate_is_array
                         ? sizeof(ValueType) : 0 };

-  KOKKOS_FORCEINLINE_FUNCTION static
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr
  unsigned value_count( const Functor & f )
    { return FunctorAnalysis::template get_length< candidate_is_array >(f); }

-  KOKKOS_FORCEINLINE_FUNCTION static
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr
  unsigned value_size( const Functor & f )
    { return FunctorAnalysis::template get_length< candidate_is_array >(f) * sizeof(ValueType); }

  //----------------------------------------

  template< class Unknown >
-  KOKKOS_FORCEINLINE_FUNCTION static
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr
  unsigned value_count( const Unknown & )
-    { return 1 ; }
+    { return candidate_is_void ? 0 : 1  ; }

  template< class Unknown >
-  KOKKOS_FORCEINLINE_FUNCTION static
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr
  unsigned value_size( const Unknown & )
-    { return sizeof(ValueType); }
+    { return candidate_is_void ? 0 : sizeof(ValueType); }

 private:

@ -277,16 +309,16 @@ private:
      typedef volatile const ValueType & cvref_type ;

      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( vref_type , cvref_type ) );
+      void enable_if( void (F::*)( vref_type , cvref_type ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( vref_type , cvref_type ) );

      KOKKOS_INLINE_FUNCTION static
-      void join( F const & f
+      void join( F const * const f
               , ValueType volatile * dst
               , ValueType volatile const * src )
-        { f.join( *dst , *src ); }
+        { f->join( *dst , *src ); }
    };

  template< class F >
@ -296,16 +328,16 @@ private:
      typedef volatile const ValueType * cvref_type ;

      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( vref_type , cvref_type ) );
+      void enable_if( void (F::*)( vref_type , cvref_type ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( vref_type , cvref_type ) );

      KOKKOS_INLINE_FUNCTION static
-      void join( F const & f
+      void join( F const * const f
               , ValueType volatile * dst
               , ValueType volatile const * src )
-        { f.join( dst , src ); }
+        { f->join( dst , src ); }
    };

  template< class F >
@ -315,22 +347,22 @@ private:
      typedef volatile const ValueType & cvref_type ;

      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
+      void enable_if( void (F::*)( WTag , vref_type , cvref_type ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( WTag , vref_type , cvref_type ) );

      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
+      void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );

      KOKKOS_INLINE_FUNCTION static
-      void join( F const & f
+      void join( F const * const f
               , ValueType volatile * dst
               , ValueType volatile const * src )
-        { f.join( WTag() , *dst , *src ); }
+        { f->join( WTag() , *dst , *src ); }
    };

  template< class F >
@ -340,22 +372,22 @@ private:
      typedef volatile const ValueType * cvref_type ;

      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
+      void enable_if( void (F::*)( WTag , vref_type , cvref_type ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( WTag , vref_type , cvref_type ) );

      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
+      void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );

      KOKKOS_INLINE_FUNCTION static
-      void join( F const & f
+      void join( F const * const f
               , ValueType volatile * dst
               , ValueType volatile const * src )
-        { f.join( WTag() , dst , src ); }
+        { f->join( WTag() , dst , src ); }
    };


@ -364,12 +396,14 @@ private:
          , typename  = void >
  struct DeduceJoin
    {
+      enum { value = false };
+
      KOKKOS_INLINE_FUNCTION static
-      void join( F const & f
+      void join( F const * const f
               , ValueType volatile * dst
               , ValueType volatile const * src )
       {
-         const int n = FunctorAnalysis::value_count( f );
+         const int n = FunctorAnalysis::value_count( *f );
         for ( int i = 0 ; i < n ; ++i ) dst[i] += src[i];
       }
    };
@ -377,8 +411,10 @@ private:
  template< class F >
  struct DeduceJoin< F , DISABLE , void >
    {
+      enum { value = false };
+
      KOKKOS_INLINE_FUNCTION static
-      void join( F const &
+      void join( F const * const
               , ValueType volatile *
               , ValueType volatile const * ) {}
    };
@ -386,7 +422,8 @@ private:
  template< class F , INTERFACE I >
  struct DeduceJoin< F , I ,
    decltype( has_join_function<F,I>::enable_if( & F::join ) ) >
-    : public has_join_function<F,I> {};
+    : public has_join_function<F,I>
+    { enum { value = true }; };

  //----------------------------------------

@ -397,38 +434,38 @@ private:
  struct has_init_function< F , NO_TAG_NOT_ARRAY >
    {
      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( ValueType & ) );
+      void enable_if( void (F::*)( ValueType & ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( ValueType & ) );

      KOKKOS_INLINE_FUNCTION static
-      void init( F const & f , ValueType * dst )
-        { f.init( *dst ); }
+      void init( F const * const f , ValueType * dst )
+        { f->init( *dst ); }
    };

  template< class F >
  struct has_init_function< F , NO_TAG_IS_ARRAY >
    {
      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( ValueType * ) );
+      void enable_if( void (F::*)( ValueType * ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( ValueType * ) );

      KOKKOS_INLINE_FUNCTION static
-      void init( F const & f , ValueType * dst )
-        { f.init( dst ); }
+      void init( F const * const f , ValueType * dst )
+        { f->init( dst ); }
    };

  template< class F >
  struct has_init_function< F , HAS_TAG_NOT_ARRAY >
    {
      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( WTag , ValueType & ) );
+      void enable_if( void (F::*)( WTag , ValueType & ) const );

      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( WTag const & , ValueType & ) );
+      void enable_if( void (F::*)( WTag const & , ValueType & ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( WTag , ValueType & ) );
@ -437,18 +474,18 @@ private:
      void enable_if( void (*)( WTag const & , ValueType & ) );

      KOKKOS_INLINE_FUNCTION static
-      void init( F const & f , ValueType * dst )
-        { f.init( WTag(), *dst ); }
+      void init( F const * const f , ValueType * dst )
+        { f->init( WTag(), *dst ); }
    };

  template< class F >
  struct has_init_function< F , HAS_TAG_IS_ARRAY >
    {
      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( WTag , ValueType * ) );
+      void enable_if( void (F::*)( WTag , ValueType * ) const );

      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( WTag const & , ValueType * ) );
+      void enable_if( void (F::*)( WTag const & , ValueType * ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( WTag , ValueType * ) );
@ -457,8 +494,8 @@ private:
      void enable_if( void (*)( WTag const & , ValueType * ) );

      KOKKOS_INLINE_FUNCTION static
-      void init( F const & f , ValueType * dst )
-        { f.init( WTag(), dst ); }
+      void init( F const * const f , ValueType * dst )
+        { f->init( WTag(), dst ); }
    };

  template< class F   = Functor
@ -466,87 +503,29 @@ private:
          , typename  = void >
  struct DeduceInit
    {
+      enum { value = false };
+
      KOKKOS_INLINE_FUNCTION static
-      void init( F const & , ValueType * dst ) { new(dst) ValueType(); }
+      void init( F const * const , ValueType * dst ) { new(dst) ValueType(); }
    };

  template< class F >
  struct DeduceInit< F , DISABLE , void >
    {
+      enum { value = false };
+
      KOKKOS_INLINE_FUNCTION static
-      void init( F const & , ValueType * ) {}
+      void init( F const * const , ValueType * ) {}
    };

  template< class F , INTERFACE I >
  struct DeduceInit< F , I ,
    decltype( has_init_function<F,I>::enable_if( & F::init ) ) >
-    : public has_init_function<F,I> {};
+    : public has_init_function<F,I>
+    { enum { value = true }; };

  //----------------------------------------

-public:
-
-  struct Reducer
-  {
-  private:
-
-    Functor     const & m_functor ;
-    ValueType * const   m_result ;
-    int         const   m_length ;
-
-  public:
-
-    using reducer        = Reducer ;
-    using value_type     = FunctorAnalysis::value_type ;
-    using memory_space   = void ;
-    using reference_type = FunctorAnalysis::reference_type ;
-
-    KOKKOS_INLINE_FUNCTION
-    void join( ValueType volatile * dst
-             , ValueType volatile const * src ) const noexcept
-      { DeduceJoin<>::join( m_functor , dst , src ); }
-
-    KOKKOS_INLINE_FUNCTION
-    void init( ValueType * dst ) const noexcept
-      { DeduceInit<>::init( m_functor , dst ); }
-
-    KOKKOS_INLINE_FUNCTION explicit
-    constexpr Reducer( Functor const & arg_functor
-                     , ValueType     * arg_value = 0
-                     , int             arg_length = 0 ) noexcept
-      : m_functor( arg_functor ), m_result(arg_value), m_length(arg_length) {}
-
-    KOKKOS_INLINE_FUNCTION
-    constexpr int length() const noexcept { return m_length ; }
-
-    KOKKOS_INLINE_FUNCTION
-    ValueType & operator[]( int i ) const noexcept
-      { return m_result[i]; }
-
-  private:
-
-    template< bool IsArray >
-    constexpr
-    typename std::enable_if< IsArray , ValueType * >::type
-    ref() const noexcept { return m_result ; }
-
-    template< bool IsArray >
-    constexpr
-    typename std::enable_if< ! IsArray , ValueType & >::type
-    ref() const noexcept { return *m_result ; }
-
-  public:
-
-    KOKKOS_INLINE_FUNCTION
-    auto result() const noexcept
-      -> decltype( Reducer::template ref< candidate_is_array >() )
-      { return Reducer::template ref< candidate_is_array >(); }
- };
-
-  //----------------------------------------
-
-private:
-
  template< class , INTERFACE >
  struct has_final_function ;

@ -555,14 +534,14 @@ private:
  struct has_final_function< F , NO_TAG_NOT_ARRAY >
    {
      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( ValueType & ) );
+      void enable_if( void (F::*)( ValueType & ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( ValueType & ) );

      KOKKOS_INLINE_FUNCTION static
-      void final( F const & f , ValueType * dst )
-        { f.final( *dst ); }
+      void final( F const * const f , ValueType * dst )
+        { f->final( *dst ); }
    };

  // No tag, is array
@ -570,14 +549,14 @@ private:
  struct has_final_function< F , NO_TAG_IS_ARRAY >
    {
      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( ValueType * ) );
+      void enable_if( void (F::*)( ValueType * ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( ValueType * ) );

      KOKKOS_INLINE_FUNCTION static
-      void final( F const & f , ValueType * dst )
-        { f.final( dst ); }
+      void final( F const * const f , ValueType * dst )
+        { f->final( dst ); }
    };

  // Has tag, not array
@ -585,10 +564,10 @@ private:
  struct has_final_function< F , HAS_TAG_NOT_ARRAY >
    {
      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( WTag , ValueType & ) );
+      void enable_if( void (F::*)( WTag , ValueType & ) const );

      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( WTag const & , ValueType & ) );
+      void enable_if( void (F::*)( WTag const & , ValueType & ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( WTag , ValueType & ) );
@ -597,8 +576,8 @@ private:
      void enable_if( void (*)( WTag const & , ValueType & ) );

      KOKKOS_INLINE_FUNCTION static
-      void final( F const & f , ValueType * dst )
-        { f.final( WTag(), *dst ); }
+      void final( F const * const f , ValueType * dst )
+        { f->final( WTag(), *dst ); }
    };

  // Has tag, is array
@ -606,10 +585,10 @@ private:
  struct has_final_function< F , HAS_TAG_IS_ARRAY >
    {
      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( WTag , ValueType * ) );
+      void enable_if( void (F::*)( WTag , ValueType * ) const );

      KOKKOS_INLINE_FUNCTION static
-      void enable_if( void (F::*)( WTag const & , ValueType * ) );
+      void enable_if( void (F::*)( WTag const & , ValueType * ) const );

      KOKKOS_INLINE_FUNCTION static
      void enable_if( void (*)( WTag , ValueType * ) );
@ -618,8 +597,8 @@ private:
      void enable_if( void (*)( WTag const & , ValueType * ) );

      KOKKOS_INLINE_FUNCTION static
-      void final( F const & f , ValueType * dst )
-        { f.final( WTag(), dst ); }
+      void final( F const * const f , ValueType * dst )
+        { f->final( WTag(), dst ); }
    };

  template< class F   = Functor
@ -627,20 +606,139 @@ private:
          , typename  = void >
  struct DeduceFinal
    {
+      enum { value = false };
+
      KOKKOS_INLINE_FUNCTION
-      static void final( F const & , ValueType * ) {}
+      static void final( F const * const , ValueType * ) {}
    };

  template< class F , INTERFACE I >
  struct DeduceFinal< F , I ,
    decltype( has_final_function<F,I>::enable_if( & F::final ) ) >
-    : public has_init_function<F,I> {};
+    : public has_final_function<F,I>
+    { enum { value = true }; };
+
+  //----------------------------------------
+
+  // Detects an optional team scratch-size query on the functor and forwards
+  // to it.  All three variants share one calling convention,
+  //   team_shmem_size( F const * , int team_size ),
+  // matching the pointer-based convention of the Deduce* helpers above, so
+  // that DeduceTeamShmem<>::team_shmem_size can be called without knowing
+  // which variant was selected.
+  //
+  // Primary template: the functor declares neither query; reports 0 bytes.
+  template< class F = Functor , typename = void >
+  struct DeduceTeamShmem
+    {
+      enum { value = false };
+
+      static size_t team_shmem_size( F const * const , int ) { return 0 ; }
+    };
+
+  // Selected when '& F::team_shmem_size' is a well-formed expression.
+  template< class F >
+  struct DeduceTeamShmem< F , typename std::enable_if< 0 < sizeof( & F::team_shmem_size ) >::type >
+    {
+      enum { value = true };
+
+      static size_t team_shmem_size( F const * const f , int team_size )
+        { return f->team_shmem_size( team_size ); }
+    };
+
+  // Selected when '& F::shmem_size' is a well-formed expression.
+  // NOTE(review): a functor that declares both team_shmem_size and
+  // shmem_size matches this and the previous specialization equally well,
+  // which is an ambiguity error at instantiation -- confirm no functor
+  // declares both.
+  template< class F >
+  struct DeduceTeamShmem< F , typename std::enable_if< 0 < sizeof( & F::shmem_size ) >::type >
+    {
+      enum { value = true };
+
+      static size_t team_shmem_size( F const * const f , int team_size )
+        { return f->shmem_size( team_size ); }
+    };
+
+  //----------------------------------------

 public:

-  static void final( Functor const & f , ValueType * result )
-    { DeduceFinal<>::final( f , result ); }
+  // Team scratch size requested by functor 'f' for a team of 'team_size'
+  // members; 0 when the functor declares no such query.
+  // The previous one-argument form forwarded a single argument to helpers
+  // that all require ( functor , team_size ), so it could not be
+  // instantiated for any functor; the team size is now passed through.
+  inline static
+  size_t team_shmem_size( Functor const & f , int team_size )
+    { return DeduceTeamShmem<>::team_shmem_size( & f , team_size ); }

+  //----------------------------------------
+
+  enum { has_join_member_function  = DeduceJoin<>::value };
+  enum { has_init_member_function  = DeduceInit<>::value };
+  enum { has_final_member_function = DeduceFinal<>::value };
+
+
+  template< class MemorySpace = typename execution_space::memory_space > // Wraps the analyzed Functor in a reducer-style interface; MemorySpace defaults to the resolved execution space's memory space.
+  struct Reducer
+  {
+  private:
+
+    Functor const * const m_functor ; // Functor that join/init/final are forwarded to; null when default-constructed.
+    ValueType     * const m_result ; // Storage exposed through data() and reference(); null when default-constructed.
+
+    template< bool IsArray >
+    KOKKOS_INLINE_FUNCTION constexpr
+    typename std::enable_if< IsArray , FunctorAnalysis::ValueType * >::type
+    ref() const noexcept { return m_result ; } // Array case: the "reference" is the pointer itself.
+
+    template< bool IsArray >
+    KOKKOS_INLINE_FUNCTION constexpr
+    typename std::enable_if< ! IsArray , FunctorAnalysis::ValueType & >::type
+    ref() const noexcept { return *m_result ; } // Non-array case: dereferences m_result, which must therefore be non-null.
+
+    template< bool IsArray >
+    KOKKOS_INLINE_FUNCTION constexpr
+    typename std::enable_if< IsArray , int >::type
+    len() const noexcept { return m_functor->value_count ; } // Array case: length comes from the functor, which must therefore be non-null.
+
+    template< bool IsArray >
+    KOKKOS_INLINE_FUNCTION constexpr
+    typename std::enable_if< ! IsArray , int >::type
+    len() const noexcept { return candidate_is_void ? 0 : 1 ; } // Non-array case: 0 when candidate_is_void is set, otherwise 1.
+
+  public:
+
+    using reducer        = Reducer ;
+    using value_type     = FunctorAnalysis::value_type ;
+    using memory_space   = MemorySpace ;
+    using reference_type = FunctorAnalysis::reference_type ;
+    using functor_type   = Functor ; // Adapts a functor
+
+    KOKKOS_INLINE_FUNCTION constexpr
+    value_type * data() const noexcept { return m_result ; } // Raw pointer to the result storage.
+
+    KOKKOS_INLINE_FUNCTION constexpr
+    reference_type reference() const noexcept
+      { return Reducer::template ref< candidate_is_array >(); } // Pointer or reference form, selected by candidate_is_array.
+
+    KOKKOS_INLINE_FUNCTION constexpr
+    int length() const noexcept
+      { return Reducer::template len< candidate_is_array >(); } // Number of values in the reduction.
+
+    KOKKOS_INLINE_FUNCTION
+    void copy( ValueType * const dst
+             , ValueType const * const src ) const noexcept
+      { for ( int i = 0 ; i < Reducer::template len< candidate_is_array >() ; ++i ) dst[i] = src[i] ; } // Element-wise assignment of the same number of values that length() reports.
+
+    KOKKOS_INLINE_FUNCTION
+    void join( ValueType volatile * dst
+             , ValueType volatile const * src ) const noexcept
+      { DeduceJoin<>::join( m_functor , dst , src ); } // Forwards to whichever join DeduceJoin<> selected for Functor.
+
+    KOKKOS_INLINE_FUNCTION 
+    void init( ValueType * dst ) const noexcept
+      { DeduceInit<>::init( m_functor , dst ); } // Forwards to whichever init DeduceInit<> selected for Functor.
+
+    KOKKOS_INLINE_FUNCTION
+    void final( ValueType * dst ) const noexcept
+      { DeduceFinal<>::final( m_functor , dst ); } // Forwards to whichever final DeduceFinal<> selected for Functor.
+
+    Reducer( Reducer const & ) = default ;
+    Reducer( Reducer && ) = default ;
+    Reducer & operator = ( Reducer const & ) = delete ; // Assignment is deleted; both data members are const pointers.
+    Reducer & operator = ( Reducer && ) = delete ;
+
+    template< class S >
+    using rebind = Reducer< S > ; // Same adapter with memory space S.
+
+    KOKKOS_INLINE_FUNCTION explicit constexpr
+    Reducer( Functor const * arg_functor = 0 // Both pointers default to null.
+           , ValueType * arg_value = 0 ) noexcept
+      : m_functor(arg_functor), m_result(arg_value) {}
+  };
 };

 } // namespace Impl
--- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@ -44,11 +44,10 @@

 #include <Kokkos_Macros.hpp>

-
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <memory.h>
+#include <cstddef>
+#include <cstdlib>
+#include <cstdint>
+#include <cstring>

 #include <iostream>
 #include <sstream>
@ -397,3 +396,4 @@ void unlock_address_hbw_space(void* ptr) {
 }
 }
 #endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
@ -46,6 +46,7 @@
 #if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #endif
+
 /*--------------------------------------------------------------------------*/

 #if defined( __INTEL_COMPILER ) && ! defined ( KOKKOS_ENABLE_CUDA )
@ -85,10 +86,10 @@

 /*--------------------------------------------------------------------------*/

-#include <stddef.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <memory.h>
+#include <cstddef>
+#include <cstdlib>
+#include <cstdint>
+#include <cstring>

 #include <iostream>
 #include <sstream>
@ -98,6 +99,10 @@
 #include <impl/Kokkos_Error.hpp>
 #include <Kokkos_Atomic.hpp>

+#if ( defined( KOKKOS_ENABLE_ASM ) || defined ( KOKKOS_ENABLE_TM ) ) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
+#include <immintrin.h>
+#endif
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------

@ -490,16 +495,50 @@ void init_lock_array_host_space() {
 }

 bool lock_address_host_space(void* ptr) {
+#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM ) // Optional x86-64 transactional-memory path, tried before the atomic fallback.
+  const unsigned status = _xbegin(); // Attempt to start a hardware transaction.
+
+  if( _XBEGIN_STARTED == status ) { // Transaction started: inspect and take the lock slot inside it.
+	const int val = HOST_SPACE_ATOMIC_LOCKS[(( size_t(ptr) >> 2 ) & // Slot index is derived from the address: shift, mask, then xor.
+		HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK];
+
+	if( 0 == val ) { // Slot is free: mark it taken.
+		HOST_SPACE_ATOMIC_LOCKS[(( size_t(ptr) >> 2 ) &
+                   HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] = 1;
+	} else {
+		_xabort( 1 ); // Slot already taken: abort. NOTE(review): per Intel RTM semantics control resumes at _xbegin() with a non-started status, i.e. in the fallback branch below -- confirm.
+	}
+
+	_xend(); // Commit the transaction.
+
+	return 1; // Lock acquired.
+  } else { // Transaction did not start or was aborted: use the atomic compare-exchange.
+#endif
  return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[
      (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
                                  0 , 1);
+#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
+  } // Closes the fallback branch opened above.
+#endif
 }

 void unlock_address_host_space(void* ptr) {
+  // Releases the lock slot associated with 'ptr' (same index computation as
+  // lock_address_host_space).  With transactional memory enabled the slot is
+  // cleared inside a hardware transaction; if the transaction does not start,
+  // the atomic exchange below is used instead.
+#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
+  const unsigned status = _xbegin();
+
+  if( _XBEGIN_STARTED == status ) {
+	HOST_SPACE_ATOMIC_LOCKS[(( size_t(ptr) >> 2 ) &
+        	HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] = 0;
+
+	// Commit the transaction.  Without this the transactional region was
+	// never closed: the store stayed speculative and the caller kept
+	// running inside the open transaction after this function returned.
+	_xend();
+  } else {
+#endif
   atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[
      (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
                    0);
+#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
+  }
+#endif
 }

 }
 }
+
--- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
@ -49,7 +49,6 @@
 #include <Kokkos_Atomic.hpp>
 #include <Kokkos_ExecPolicy.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
-#include <impl/Kokkos_Reducer.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>

 //----------------------------------------------------------------------------
@ -507,8 +506,9 @@ public:
  const scratch_memory_space & thread_scratch(int) const
    { return m_scratch.set_team_thread_mode(0,m_data.m_team_size,m_data.m_team_rank); }

-  //----------------------------------------
+  //--------------------------------------------------------------------------
  // Team collectives
+  //--------------------------------------------------------------------------

  KOKKOS_INLINE_FUNCTION void team_barrier() const noexcept
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
@ -519,22 +519,6 @@ public:
    {}
 #endif

-  template< class Closure >
-  KOKKOS_INLINE_FUNCTION
-  void team_barrier( Closure const & f ) const noexcept
-    {
-      if ( m_data.team_rendezvous() ) {
-
-        // All threads have entered 'team_rendezvous'
-        // only this thread returned from 'team_rendezvous'
-        // with a return value of 'true'
-
-        f();
-
-        m_data.team_rendezvous_release();
-      }
-    }
-
  //--------------------------------------------------------------------------

  template< typename T >
@ -613,8 +597,9 @@ public:

        if ( 0 != m_data.m_team_rank ) {
          // Non-root copies to their local buffer:
-          reducer.copy( (value_type*) m_data.team_reduce_local()
-                      , reducer.data() );
+          /*reducer.copy( (value_type*) m_data.team_reduce_local()
+                      , reducer.data() );*/
+          *((value_type*) m_data.team_reduce_local()) = reducer.reference();
        }

        // Root does not overwrite shared memory until all threads arrive
@ -630,19 +615,19 @@ public:
            value_type * const src =
              (value_type*) m_data.team_member(i)->team_reduce_local();

-            reducer.join( reducer.data() , src );
+            reducer.join( reducer.reference(), *src);
          }

          // Copy result to root member's buffer:
-          reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() );
-
+          // reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() );
+          *((value_type*) m_data.team_reduce()) = reducer.reference();
          m_data.team_rendezvous_release();
          // This thread released all other threads from 'team_rendezvous'
          // with a return value of 'false'
        }
        else {
          // Copy from root member's buffer:
-          reducer.copy( reducer.data() , (value_type*) m_data.team_reduce() );
+          reducer.reference() = *((value_type*) m_data.team_reduce());
        }
      }
    }
@ -652,7 +637,7 @@ public:

  //--------------------------------------------------------------------------

-  template< typename ValueType , class JoinOp >
+  /*template< typename ValueType , class JoinOp >
  KOKKOS_INLINE_FUNCTION
  ValueType
  team_reduce( ValueType const & value
@ -696,7 +681,7 @@ public:
    }
 #else
    { Kokkos::abort("HostThreadTeamMember team_reduce\n"); return ValueType(); }
-#endif
+#endif*/


  template< typename T >
@ -854,7 +839,7 @@ parallel_reduce
  , Reducer  const & reducer
  )
 {
-  reducer.init( reducer.data() );
+  reducer.init( reducer.reference() );

  for( iType i = loop_boundaries.start
     ; i <  loop_boundaries.end
@ -875,9 +860,9 @@ parallel_reduce
  , ValueType      & result
  )
 {
-  Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > > reducer( & result );
+  Kokkos::Experimental::Sum<ValueType> reducer( result );

-  reducer.init( reducer.data() );
+  reducer.init( result );

  for( iType i = loop_boundaries.start
     ; i <  loop_boundaries.end
@ -888,7 +873,7 @@ parallel_reduce
  loop_boundaries.thread.team_reduce( reducer );
 }

-template< typename iType, class Space
+/*template< typename iType, class Space
         , class Closure, class Joiner , typename ValueType >
 KOKKOS_INLINE_FUNCTION
 void parallel_reduce
@ -910,7 +895,7 @@ void parallel_reduce
  }

  loop_boundaries.thread.team_reduce( reducer );
-}
+}*/

 //----------------------------------------------------------------------------
 /** \brief  Inter-thread vector parallel_reduce.
@ -923,15 +908,13 @@ void parallel_reduce
 */
 template< typename iType, class Space , class Lambda, typename ValueType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce
+typename std::enable_if< ! Kokkos::is_reducer<ValueType>::value >::type
+parallel_reduce
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
   const Lambda & lambda,
   ValueType& result)
 {
  result = ValueType();
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
  for( iType i =  loop_boundaries.start ;
             i <  loop_boundaries.end ;
             i += loop_boundaries.increment) {
@ -939,6 +922,22 @@ void parallel_reduce
  }
 }

+template< typename iType, class Space , class Lambda, typename ReducerType > // Thread-vector-range reduction overload that takes a reducer object.
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type // Participates in overload resolution only when ReducerType is a reducer; the return type is void.
+parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
+   const Lambda & lambda,
+   const ReducerType& reducer)
+{
+  reducer.init(reducer.reference()); // Reset the reducer's value before accumulating.
+  for( iType i =  loop_boundaries.start ; // Visits start, start+increment, ... while i < end, in order.
+             i <  loop_boundaries.end ;
+             i += loop_boundaries.increment) {
+    lambda(i,reducer.reference()); // The lambda accumulates directly into the reducer's value; no join is performed here.
+  }
+}
+
 /** \brief  Intra-thread vector parallel_reduce.
 *
 *  Executes lambda(iType i, ValueType & val) for each i=[0..N)
@ -961,9 +960,6 @@ void parallel_reduce
   const JoinType & join,
   ValueType& result)
 {
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
  for( iType i =  loop_boundaries.start ;
             i <  loop_boundaries.end ;
             i += loop_boundaries.increment ) {
@ -1055,9 +1051,8 @@ template< class Space , class FunctorType >
 KOKKOS_INLINE_FUNCTION
 void single( const Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> > & single , const FunctorType & functor )
 {
-  if ( single.team_member.team_rank() == 0 ) functor();
  // 'single' does not perform a barrier.
-  // single.team_member.team_barrier( functor );
+  if ( single.team_member.team_rank() == 0 ) functor();
 }

 template< class Space , class FunctorType , typename ValueType >
--- a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@ -41,6 +41,7 @@
 //@HEADER
 */

+#include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE_HPP )
 #define KOKKOS_MEMORY_FENCE_HPP
 namespace Kokkos {
@ -108,4 +109,3 @@ void load_fence()

 #endif

-
--- a/lib/kokkos/core/src/impl/Kokkos_OldMacros.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_OldMacros.hpp
@ -212,6 +212,12 @@
 #endif
 #endif

+#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // The KOKKOS_OPT_RANGE_ spelling is defined ...
+#ifndef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION // ... and the KOKKOS_ENABLE_ spelling is not yet defined:
+#define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // forward the value to the KOKKOS_ENABLE_ spelling.
+#endif
+#endif
+
 #ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
 #ifndef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
 #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT KOKKOS_HAVE_PRAGMA_LOOPCOUNT
@ -423,6 +429,12 @@
 #endif
 #endif

+#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD ) // Either threading macro is defined ...
+#ifndef KOKKOS_ENABLE_THREADS // ... and KOKKOS_ENABLE_THREADS is not yet defined:
+#define KOKKOS_ENABLE_THREADS // define it (as an empty macro).
+#endif
+#endif
+
 //------------------------------------------------------------------------------
 // Deprecated macros
 //------------------------------------------------------------------------------
--- a/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@ -44,13 +44,11 @@
 #ifndef KOKKOS_PHYSICAL_LAYOUT_HPP
 #define KOKKOS_PHYSICAL_LAYOUT_HPP

-
 #include <Kokkos_View.hpp>
+
 namespace Kokkos {
 namespace Impl {

-
-
 struct PhysicalLayout {
  enum LayoutType {Left,Right,Scalar,Error};
  LayoutType layout_type;
@ -71,3 +69,4 @@ struct PhysicalLayout {
 }
 }
 #endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp
@ -44,12 +44,14 @@
 #ifndef KOKKOSP_DEVICE_INFO_HPP
 #define KOKKOSP_DEVICE_INFO_HPP

+#include <cstdint>
+
 namespace Kokkos {
 namespace Profiling {

-    struct KokkosPDeviceInfo {
-        uint32_t deviceID;
-    };
+struct KokkosPDeviceInfo {
+  uint32_t deviceID;
+};

 }
 }
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
@ -41,197 +41,203 @@
 //@HEADER
 */

-#include <impl/Kokkos_Profiling_Interface.hpp>
-
+#include <Kokkos_Macros.hpp>
 #if defined(KOKKOS_ENABLE_PROFILING)
-#include <string.h>
+
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <cstring>

 namespace Kokkos {
-  namespace Profiling {
+namespace Profiling {

-    SpaceHandle::SpaceHandle(const char* space_name) {
-      strncpy(name,space_name,64);
-    }
+// Copies space_name into the fixed 64-byte name buffer. Names of 64 or more
+// characters are truncated to 63 so that name is always NUL-terminated.
+SpaceHandle::SpaceHandle(const char* space_name) {
+  strncpy(name,space_name,64);
+  // strncpy writes no terminator when the source fills the whole buffer.
+  name[63] = '\0';
+}

-    bool profileLibraryLoaded() {
-       	return (NULL != initProfileLibrary);
-    }
+// Returns true when the profiling init hook pointer (initProfileLibrary) is
+// non-NULL, false otherwise.
+bool profileLibraryLoaded() {
+  const bool initHookPresent = (initProfileLibrary != NULL);
+  return initHookPresent;
+}

-    void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
-        if(NULL != beginForCallee) {
-            Kokkos::fence();
-            (*beginForCallee)(kernelPrefix.c_str(), devID, kernelID);
-        }
-    }
-
-    void endParallelFor(const uint64_t kernelID) {
-        if(NULL != endForCallee) {
-            Kokkos::fence();
-            (*endForCallee)(kernelID);
-        }
-    }
-
-    void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
-        if(NULL != beginScanCallee) {
-            Kokkos::fence();
-            (*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID);
-        }
-    }
-
-    void endParallelScan(const uint64_t kernelID) {
-        if(NULL != endScanCallee) {
-            Kokkos::fence();
-            (*endScanCallee)(kernelID);
-        }
-    }
-
-    void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
-        if(NULL != beginReduceCallee) {
-            Kokkos::fence();
-            (*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
-        }
-    }
-
-    void endParallelReduce(const uint64_t kernelID) {
-        if(NULL != endReduceCallee) {
-            Kokkos::fence();
-            (*endReduceCallee)(kernelID);
-        }
-    }
-
-
-    void pushRegion(const std::string& kName) {
-      if( NULL != pushRegionCallee ) {
-        Kokkos::fence();
-        (*pushRegionCallee)(kName.c_str());
-      }
-    }
-
-    void popRegion() {
-      if( NULL != popRegionCallee ) {
-        Kokkos::fence();
-        (*popRegionCallee)();
-      }
-    }
-
-    void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
-        if(NULL != allocateDataCallee) {
-            (*allocateDataCallee)(space,label.c_str(),ptr,size);
-        }
-    }
-
-    void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
-        if(NULL != allocateDataCallee) {
-            (*deallocateDataCallee)(space,label.c_str(),ptr,size);
-        }
-    }
-
-    void initialize() {
-
-        // Make sure initialize calls happens only once
-        static int is_initialized = 0;
-        if(is_initialized) return;
-        is_initialized = 1;
-
-        void* firstProfileLibrary;
-
-        char* envProfileLibrary  = getenv("KOKKOS_PROFILE_LIBRARY");
-
-	// If we do not find a profiling library in the environment then exit
-	// early.
-	if( NULL == envProfileLibrary ) {
-		return ;
-	}
-
-		char* envProfileCopy = (char*) malloc(sizeof(char) * (strlen(envProfileLibrary) + 1));
-		sprintf(envProfileCopy, "%s", envProfileLibrary);
-
-		char* profileLibraryName = strtok(envProfileCopy, ";");
-
-        if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
-            firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
-
-            if(NULL == firstProfileLibrary) {
-                std::cerr << "Error: Unable to load KokkosP library: " <<
-                profileLibraryName << std::endl;
-            } else {
-                std::cout << "KokkosP: Library Loaded: " << profileLibraryName << std::endl;
-
-                // dlsym returns a pointer to an object, while we want to assign to pointer to function
-                // A direct cast will give warnings hence, we have to workaround the issue by casting pointer to pointers.
-                auto p1 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for");
-                beginForCallee = *((beginFunction*) &p1);
-                auto p2 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan");
-                beginScanCallee = *((beginFunction*) &p2);
-                auto p3 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce");
-                beginReduceCallee = *((beginFunction*) &p3);
-
-                auto p4 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan");
-                endScanCallee = *((endFunction*) &p4);
-                auto p5 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_for");
-                endForCallee = *((endFunction*) &p5);
-                auto p6 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce");
-                endReduceCallee = *((endFunction*) &p6);
-
-                auto p7 = dlsym(firstProfileLibrary, "kokkosp_init_library");
-                initProfileLibrary = *((initFunction*) &p7);
-                auto p8 = dlsym(firstProfileLibrary, "kokkosp_finalize_library");
-                finalizeProfileLibrary = *((finalizeFunction*) &p8);
-
-                auto p9 = dlsym(firstProfileLibrary, "kokkosp_push_profile_region");
-                pushRegionCallee = *((pushFunction*) &p9);
-                auto p10 = dlsym(firstProfileLibrary, "kokkosp_pop_profile_region");
-                popRegionCallee = *((popFunction*) &p10);
-
-                auto p11 = dlsym(firstProfileLibrary, "kokkosp_allocate_data");
-                allocateDataCallee = *((allocateDataFunction*) &p11);
-                auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data");
-                deallocateDataCallee = *((deallocateDataFunction*) &p12);
-
-            }
-        }
-
-        if(NULL != initProfileLibrary) {
-            (*initProfileLibrary)(0,
-			(uint64_t) KOKKOSP_INTERFACE_VERSION,
-			(uint32_t) 0,
-			NULL);
-        }
-
-		free(envProfileCopy);
-    }
-
-    void finalize() {
-      // Make sure finalize calls happens only once
-      static int is_finalized = 0;
-      if(is_finalized) return;
-      is_finalized = 1;
-
-      if(NULL != finalizeProfileLibrary) {
-        (*finalizeProfileLibrary)();
-
-        // Set all profile hooks to NULL to prevent
-        // any additional calls. Once we are told to
-        // finalize, we mean it
-        initProfileLibrary = NULL;
-        finalizeProfileLibrary = NULL;
-
-        beginForCallee = NULL;
-        beginScanCallee = NULL;
-        beginReduceCallee = NULL;
-        endScanCallee = NULL;
-        endForCallee = NULL;
-        endReduceCallee = NULL;
-
-        pushRegionCallee = NULL;
-        popRegionCallee = NULL;
-
-        allocateDataCallee = NULL;
-        deallocateDataCallee = NULL;
-
-      }
-    }
+// Forwards to the begin-parallel-for profiling hook (beginForCallee).
+// Does nothing when the hook is NULL; otherwise Kokkos::fence() runs first and
+// the hook receives the kernel name, the device id and the kernelID pointer.
+void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
+  if(beginForCallee != NULL) {
+    Kokkos::fence();
+    beginForCallee(kernelPrefix.c_str(), devID, kernelID);
  }
 }

+// Forwards kernelID to the end-parallel-for profiling hook (endForCallee).
+// No-op when the hook is NULL; otherwise Kokkos::fence() runs before the hook.
+void endParallelFor(const uint64_t kernelID) {
+  if(NULL == endForCallee) return;
+
+  Kokkos::fence();
+  endForCallee(kernelID);
+}
+
+// Forwards to the begin-parallel-scan profiling hook (beginScanCallee).
+// No-op when the hook is NULL; otherwise Kokkos::fence() runs first and the
+// hook receives the kernel name, the device id and the kernelID pointer.
+void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
+  if(NULL == beginScanCallee) return;
+
+  Kokkos::fence();
+  beginScanCallee(kernelPrefix.c_str(), devID, kernelID);
+}
+
+// Forwards kernelID to the end-parallel-scan profiling hook (endScanCallee).
+// No-op when the hook is NULL; otherwise Kokkos::fence() runs before the hook.
+void endParallelScan(const uint64_t kernelID) {
+  if(NULL == endScanCallee) return;
+
+  Kokkos::fence();
+  endScanCallee(kernelID);
+}
+
+// Forwards to the begin-parallel-reduce profiling hook (beginReduceCallee).
+// No-op when the hook is NULL; otherwise Kokkos::fence() runs first and the
+// hook receives the kernel name, the device id and the kernelID pointer.
+void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
+  if(NULL == beginReduceCallee) return;
+
+  Kokkos::fence();
+  beginReduceCallee(kernelPrefix.c_str(), devID, kernelID);
+}
+
+// Forwards kernelID to the end-parallel-reduce profiling hook
+// (endReduceCallee). No-op when the hook is NULL; otherwise Kokkos::fence()
+// runs before the hook.
+void endParallelReduce(const uint64_t kernelID) {
+  if(NULL == endReduceCallee) return;
+
+  Kokkos::fence();
+  endReduceCallee(kernelID);
+}
+
+
+// Passes the region name kName to the push-region profiling hook
+// (pushRegionCallee). No-op when the hook is NULL; otherwise Kokkos::fence()
+// runs before the hook.
+void pushRegion(const std::string& kName) {
+  if( pushRegionCallee == NULL ) return;
+
+  Kokkos::fence();
+  pushRegionCallee(kName.c_str());
+}
+
+// Invokes the pop-region profiling hook (popRegionCallee). No-op when the hook
+// is NULL; otherwise Kokkos::fence() runs before the hook.
+void popRegion() {
+  if( popRegionCallee == NULL ) return;
+
+  Kokkos::fence();
+  popRegionCallee();
+}
+
+// Forwards an allocation event (space handle, label, pointer, size) to the
+// allocate-data profiling hook (allocateDataCallee). No-op when the hook is
+// NULL. Unlike the kernel and region hooks, no fence is issued here.
+void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
+  if(allocateDataCallee == NULL) return;
+
+  allocateDataCallee(space,label.c_str(),ptr,size);
+}
+
+// Forwards a deallocation event (space handle, label, pointer, size) to the
+// deallocate-data profiling hook (deallocateDataCallee). No-op when that hook
+// is NULL.
+void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
+  // Test the hook that is actually called: a profiling library may export
+  // kokkosp_allocate_data without kokkosp_deallocate_data, and guarding on
+  // allocateDataCallee would then call through a NULL pointer.
+  if(NULL != deallocateDataCallee) {
+    (*deallocateDataCallee)(space,label.c_str(),ptr,size);
+  }
+}
+
+// Loads the profiling library named by the KOKKOS_PROFILE_LIBRARY environment
+// variable and resolves its kokkosp_* hooks.
+//
+// Only the first call does any work. If the variable is unset nothing is
+// loaded. Otherwise the first non-empty ';'-separated entry is passed to
+// dlopen and every hook pointer is assigned the result of the matching dlsym
+// lookup. If an init hook was found it is then called with
+// KOKKOSP_INTERFACE_VERSION.
+void initialize() {
+
+  // Make sure initialize calls happens only once
+  static int is_initialized = 0;
+  if(is_initialized) return;
+  is_initialized = 1;
+
+  const char* envProfileLibrary = getenv("KOKKOS_PROFILE_LIBRARY");
+
+  // If we do not find a profiling library in the environment then exit
+  // early.
+  if( NULL == envProfileLibrary ) {
+    return ;
+  }
+
+  // Pick the first non-empty ';'-separated entry. The copy is owned by a
+  // std::string, so there is no unchecked malloc and nothing to free, and
+  // the non-reentrant strtok is no longer needed.
+  const std::string envProfileCopy(envProfileLibrary);
+  std::string profileLibraryName;
+  const std::string::size_type nameBegin = envProfileCopy.find_first_not_of(';');
+  if( std::string::npos != nameBegin ) {
+    // When no further ';' exists nameEnd is npos and substr takes the rest.
+    const std::string::size_type nameEnd = envProfileCopy.find(';', nameBegin);
+    profileLibraryName = envProfileCopy.substr(nameBegin, nameEnd - nameBegin);
+  }
+
+  if( !profileLibraryName.empty() ) {
+    void* firstProfileLibrary = dlopen(profileLibraryName.c_str(), RTLD_NOW | RTLD_GLOBAL);
+
+    if(NULL == firstProfileLibrary) {
+      std::cerr << "Error: Unable to load KokkosP library: " <<
+        profileLibraryName << std::endl;
+    } else {
+#ifdef KOKKOS_ENABLE_PROFILING_LOAD_PRINT
+      std::cout << "KokkosP: Library Loaded: " << profileLibraryName << std::endl;
 #endif
+
+      // dlsym returns a pointer to an object, while we want to assign to pointer to function
+      // A direct cast will give warnings hence, we have to workaround the issue by casting pointer to pointers.
+      auto p1 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for");
+      beginForCallee = *((beginFunction*) &p1);
+      auto p2 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan");
+      beginScanCallee = *((beginFunction*) &p2);
+      auto p3 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce");
+      beginReduceCallee = *((beginFunction*) &p3);
+
+      auto p4 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan");
+      endScanCallee = *((endFunction*) &p4);
+      auto p5 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_for");
+      endForCallee = *((endFunction*) &p5);
+      auto p6 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce");
+      endReduceCallee = *((endFunction*) &p6);
+
+      auto p7 = dlsym(firstProfileLibrary, "kokkosp_init_library");
+      initProfileLibrary = *((initFunction*) &p7);
+      auto p8 = dlsym(firstProfileLibrary, "kokkosp_finalize_library");
+      finalizeProfileLibrary = *((finalizeFunction*) &p8);
+
+      auto p9 = dlsym(firstProfileLibrary, "kokkosp_push_profile_region");
+      pushRegionCallee = *((pushFunction*) &p9);
+      auto p10 = dlsym(firstProfileLibrary, "kokkosp_pop_profile_region");
+      popRegionCallee = *((popFunction*) &p10);
+
+      auto p11 = dlsym(firstProfileLibrary, "kokkosp_allocate_data");
+      allocateDataCallee = *((allocateDataFunction*) &p11);
+      auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data");
+      deallocateDataCallee = *((deallocateDataFunction*) &p12);
+
+    }
+  }
+
+  // Hand the interface version to the library; no device info is supplied.
+  if(NULL != initProfileLibrary) {
+    (*initProfileLibrary)(0,
+        (uint64_t) KOKKOSP_INTERFACE_VERSION,
+        (uint32_t) 0,
+        NULL);
+  }
+}
+
+// Shuts profiling down. Only the first call does any work: if a finalize hook
+// is set it is invoked, after which every hook pointer is reset to NULL. When
+// no finalize hook is set the pointers are left as they are.
+void finalize() {
+  // Make sure finalize calls happens only once
+  static int is_finalized = 0;
+  if(is_finalized) return;
+  is_finalized = 1;
+
+  if(finalizeProfileLibrary == NULL) return;
+
+  finalizeProfileLibrary();
+
+  // Set all profile hooks to NULL to prevent
+  // any additional calls. Once we are told to
+  // finalize, we mean it
+  beginForCallee = NULL;
+  endForCallee = NULL;
+  beginScanCallee = NULL;
+  endScanCallee = NULL;
+  beginReduceCallee = NULL;
+  endReduceCallee = NULL;
+
+  pushRegionCallee = NULL;
+  popRegionCallee = NULL;
+
+  allocateDataCallee = NULL;
+  deallocateDataCallee = NULL;
+
+  initProfileLibrary = NULL;
+  finalizeProfileLibrary = NULL;
+}
+}
+}
+
+#else
+// Empty function compiled only when KOKKOS_ENABLE_PROFILING is not defined.
+// NOTE(review): judging by its name it exists so this file still defines a
+// symbol and does not trigger a link error - confirm against the build setup.
+void KOKKOS_CORE_SRC_IMPL_PROFILING_INTERFACE_PREVENT_LINK_ERROR() {}
+#endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@ -44,108 +44,108 @@
 #ifndef KOKKOSP_INTERFACE_HPP
 #define KOKKOSP_INTERFACE_HPP

+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ENABLE_PROFILING)
+
 #include <cstddef>
 #include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_Macros.hpp>
 #include <string>
 #include <cinttypes>

-#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_DeviceInfo.hpp>
 #include <dlfcn.h>
 #include <iostream>
-#include <stdlib.h>
-#endif
+#include <cstdlib>

 #define KOKKOSP_INTERFACE_VERSION 20150628

-#if defined(KOKKOS_ENABLE_PROFILING)
 namespace Kokkos {
-  namespace Profiling {
+namespace Profiling {

-    struct SpaceHandle {
-      SpaceHandle(const char* space_name);
-      char name[64];
-    };
+// Fixed-size label for a memory space; passed by value to the allocate and
+// deallocate profiling hooks.
+struct SpaceHandle {
+  // Fills name from space_name (defined in Kokkos_Profiling_Interface.cpp).
+  SpaceHandle(const char* space_name);
+  char name[64];
+};

-    typedef void (*initFunction)(const int,
-	const uint64_t,
-	const uint32_t,
-	KokkosPDeviceInfo*);
-    typedef void (*finalizeFunction)();
-    typedef void (*beginFunction)(const char*, const uint32_t, uint64_t*);
-    typedef void (*endFunction)(uint64_t);
+typedef void (*initFunction)(const int,
+                             const uint64_t,
+                             const uint32_t,
+                             KokkosPDeviceInfo*);
+typedef void (*finalizeFunction)();
+typedef void (*beginFunction)(const char*, const uint32_t, uint64_t*);
+typedef void (*endFunction)(uint64_t);

-    typedef void (*pushFunction)(const char*);
-    typedef void (*popFunction)();
+typedef void (*pushFunction)(const char*);
+typedef void (*popFunction)();

-    typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
-    typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
+typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
+typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);


-    static initFunction initProfileLibrary = NULL;
-    static finalizeFunction finalizeProfileLibrary = NULL;
+static initFunction initProfileLibrary = NULL;
+static finalizeFunction finalizeProfileLibrary = NULL;

-    static beginFunction beginForCallee = NULL;
-    static beginFunction beginScanCallee = NULL;
-    static beginFunction beginReduceCallee = NULL;
-    static endFunction endForCallee = NULL;
-    static endFunction endScanCallee = NULL;
-    static endFunction endReduceCallee = NULL;
+static beginFunction beginForCallee = NULL;
+static beginFunction beginScanCallee = NULL;
+static beginFunction beginReduceCallee = NULL;
+static endFunction endForCallee = NULL;
+static endFunction endScanCallee = NULL;
+static endFunction endReduceCallee = NULL;

-    static pushFunction pushRegionCallee = NULL;
-    static popFunction popRegionCallee = NULL;
+static pushFunction pushRegionCallee = NULL;
+static popFunction popRegionCallee = NULL;

-    static allocateDataFunction allocateDataCallee = NULL;
-    static deallocateDataFunction deallocateDataCallee = NULL;
+static allocateDataFunction allocateDataCallee = NULL;
+static deallocateDataFunction deallocateDataCallee = NULL;


-    bool profileLibraryLoaded();
+bool profileLibraryLoaded();

-    void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
-    void endParallelFor(const uint64_t kernelID);
-    void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
-    void endParallelScan(const uint64_t kernelID);
-    void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
-    void endParallelReduce(const uint64_t kernelID);
+void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
+void endParallelFor(const uint64_t kernelID);
+void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
+void endParallelScan(const uint64_t kernelID);
+void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
+void endParallelReduce(const uint64_t kernelID);

-    void pushRegion(const std::string& kName);
-    void popRegion();
+void pushRegion(const std::string& kName);
+void popRegion();

-    void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
-    void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
+void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
+void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);

-    void initialize();
-    void finalize();
+void initialize();
+void finalize();

-    //Define finalize_fake inline to get rid of warnings for unused static variables
-    inline void finalize_fake() {
-      if(NULL != finalizeProfileLibrary) {
-        (*finalizeProfileLibrary)();
+//Define finalize_fake inline to get rid of warnings for unused static variables
+inline void finalize_fake() {
+  if(NULL != finalizeProfileLibrary) {
+    (*finalizeProfileLibrary)();

-        // Set all profile hooks to NULL to prevent
-        // any additional calls. Once we are told to
-        // finalize, we mean it
-        beginForCallee = NULL;
-        beginScanCallee = NULL;
-        beginReduceCallee = NULL;
-        endScanCallee = NULL;
-        endForCallee = NULL;
-        endReduceCallee = NULL;
-
-        allocateDataCallee = NULL;
-        deallocateDataCallee = NULL;
-
-        initProfileLibrary = NULL;
-        finalizeProfileLibrary = NULL;
-        pushRegionCallee = NULL;
-        popRegionCallee = NULL;
-      }
-    }
+    // Set all profile hooks to NULL to prevent
+    // any additional calls. Once we are told to
+    // finalize, we mean it
+    beginForCallee = NULL;
+    beginScanCallee = NULL;
+    beginReduceCallee = NULL;
+    endScanCallee = NULL;
+    endForCallee = NULL;
+    endReduceCallee = NULL;

+    allocateDataCallee = NULL;
+    deallocateDataCallee = NULL;

+    initProfileLibrary = NULL;
+    finalizeProfileLibrary = NULL;
+    pushRegionCallee = NULL;
+    popRegionCallee = NULL;
  }
 }

+
+}
+}
+
 #endif
 #endif
+
--- a/Show More
+++ b/Show More
				`@ -195,3 +195,4 @@ struct PolicyTraits`


				`#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP`