git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12370 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2014-08-27 17:08:01 +00:00
parent 222d9ee151
commit 048a267b9f
150 changed files with 57924 additions and 0 deletions
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@ -0,0 +1,235 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDAEXEC_HPP
+#define KOKKOS_CUDAEXEC_HPP
+
+#include <string>
+#include <Kokkos_Parallel.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Cuda/Kokkos_Cuda_abort.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Compile-time constants and warp arithmetic helpers
+ *          used by the Cuda back-end.
+ */
+struct CudaTraits {
+  enum { WarpSize       = 32      /* 0x0020 */ };
+  enum { WarpIndexMask  = 0x001f  /* Mask for warpindex */ };
+  enum { WarpIndexShift = 5       /* WarpSize == 1 << WarpShift */ };
+
+  enum { SharedMemoryBanks    = 32      /* Compute device 2.0 */ };
+  enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
+  enum { SharedMemoryUsage    = 0x04000 /* 16k shared / 48k L1 Cache */ };
+
+  enum { UpperBoundGridCount    = 65535 /* Hard upper bound */ };
+  enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ };
+  enum { ConstantMemoryUsage    = 0x008000 /* 32k bytes */ };
+  enum { ConstantMemoryCache    = 0x002000 /*  8k bytes */ };
+
+  /** Array type spanning ConstantMemoryUsage bytes, in 'unsigned long' words. */
+  typedef unsigned long
+    ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];
+
+  /** Functor size above which CudaParallelLaunch selects its 'Large' variant. */
+  enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
+
+  /** \brief  Number of whole warps needed to hold \c i threads (rounds up). */
+  KOKKOS_INLINE_FUNCTION static
+  CudaSpace::size_type warp_count( CudaSpace::size_type i )
+    {
+      const CudaSpace::size_type padded = i + WarpIndexMask ;
+      return padded >> WarpIndexShift ;
+    }
+
+  /** \brief  Round \c i up to the next multiple of WarpSize. */
+  KOKKOS_INLINE_FUNCTION static
+  CudaSpace::size_type warp_align( CudaSpace::size_type i )
+    {
+      // Clearing the low WarpIndexShift bits after padding rounds up.
+      const CudaSpace::size_type keep_mask = ~CudaSpace::size_type( WarpIndexMask );
+      const CudaSpace::size_type padded    = i + WarpIndexMask ;
+      return padded & keep_mask ;
+    }
+};
+
+//----------------------------------------------------------------------------
+
+// Limits of the Cuda back-end.
+// NOTE(review): the definitions are not visible in this header; presumably
+// they report values established when the device is initialized - confirm.
+CudaSpace::size_type cuda_internal_maximum_warp_count();
+CudaSpace::size_type cuda_internal_maximum_grid_count();
+CudaSpace::size_type cuda_internal_maximum_shared_words();
+
+// Accessors for scratch buffers, each taking a requested size.
+// NOTE(review): the unit of 'size' and the ownership of the returned pointer
+// cannot be determined from these declarations - confirm against the definitions.
+CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
+CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
+CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( __CUDACC__ )
+
+/** \brief  Access to constant memory on the device
+ *
+ *  Buffer of type CudaTraits::ConstantGlobalBufferType used to stage
+ *  functors for cuda_parallel_launch_constant_memory.
+ *
+ *  When KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE is defined this is only an
+ *  'extern' declaration; without the macro this statement is itself the
+ *  definition in every translation unit that includes this header.
+ */
+#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
+extern
+#endif
+__device__ __constant__
+Kokkos::Impl::CudaTraits::ConstantGlobalBufferType
+kokkos_impl_cuda_constant_memory_buffer ;
+
+/** \brief  Pointer to the block's dynamically sized shared memory,
+ *          reinterpreted as an array of \c T.
+ *
+ *  The underlying 'extern __shared__' array is declared with element type
+ *  Kokkos::CudaSpace::size_type; its extent is whatever the kernel launch
+ *  supplied as the dynamic shared memory size.
+ */
+template< typename T >
+inline
+__device__
+T * kokkos_impl_cuda_shared_memory()
+{
+  extern __shared__ Kokkos::CudaSpace::size_type sh[];
+
+  T * const typed_base = (T*) sh ;
+
+  return typed_base ;
+}
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+// See section B.17 of Cuda C Programming Guide Version 3.2
+// for discussion of
+//   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
+// function qualifier which could be used to improve performance.
+//----------------------------------------------------------------------------
+// Maximize L1 cache and minimize shared memory:
+//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
+// For 2.0 capability: 48 KB L1 and 16 KB shared
+//----------------------------------------------------------------------------
+
+/** \brief  Kernel entry point that runs a functor previously copied into
+ *          kokkos_impl_cuda_constant_memory_buffer.
+ *
+ *  The buffer is reinterpreted as a \c DriverType and invoked through
+ *  its function-call operator; the kernel itself takes no arguments.
+ */
+template< class DriverType >
+__global__
+static void cuda_parallel_launch_constant_memory()
+{
+  const DriverType * const staged_driver =
+    (const DriverType *) kokkos_impl_cuda_constant_memory_buffer ;
+
+  (*staged_driver)();
+}
+
+/** \brief  Kernel entry point that receives the functor by value as its
+ *          kernel argument and invokes its function-call operator.
+ */
+template< class DriverType >
+__global__
+static void cuda_parallel_launch_local_memory( const DriverType driver )
+{
+  driver();
+}
+
+/** \brief  Launch helper selected by functor size.
+ *
+ *  \c Large defaults to true when sizeof(DriverType) exceeds
+ *  CudaTraits::ConstantMemoryUseThreshold; the two specializations
+ *  below implement the launch for each case.
+ */
+template < class DriverType ,
+           bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
+struct CudaParallelLaunch ;
+
+/** \brief  Launch for functors larger than CudaTraits::ConstantMemoryUseThreshold.
+ *
+ *  The functor is copied into kokkos_impl_cuda_constant_memory_buffer and the
+ *  kernel cuda_parallel_launch_constant_memory reads it from there.
+ */
+template < class DriverType >
+struct CudaParallelLaunch< DriverType , true > {
+
+  /** \brief  Stage \c driver in constant memory and launch the kernel.
+   *
+   *  Nothing is done when \c grid.x or \c block.x is zero.
+   *
+   *  \param driver  functor to execute on the device
+   *  \param grid    grid dimensions of the launch
+   *  \param block   block dimensions of the launch
+   *  \param shmem   dynamic shared memory request passed to the launch
+   *
+   *  Throws (via throw_runtime_exception) when the functor does not fit in
+   *  the constant memory buffer, when \c shmem exceeds
+   *  CudaTraits::SharedMemoryCapacity, or when the copy to constant
+   *  memory fails.
+   */
+  inline
+  CudaParallelLaunch( const DriverType & driver ,
+                      const dim3       & grid ,
+                      const dim3       & block ,
+                      const int          shmem )
+  {
+    if ( grid.x && block.x ) {
+
+      if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
+           sizeof( DriverType ) ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
+      }
+
+      if ( CudaTraits::SharedMemoryCapacity < shmem ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
+      }
+      else if ( shmem ) {
+        cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared );
+      } else {
+        cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 );
+      }
+
+      // Copy functor to constant memory on the device.
+      // The kernel reads the functor from this buffer, so a failed copy must
+      // not be followed by a launch on whatever the buffer held before.
+      const cudaError_t copy_status =
+        cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
+
+      if ( cudaSuccess != copy_status ) {
+        Kokkos::Impl::throw_runtime_exception(
+          std::string("CudaParallelLaunch FAILED: copy of functor to constant memory: ") +
+          std::string( cudaGetErrorString( copy_status ) ) );
+      }
+
+      // Invoke the driver function on the device
+      cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem >>>();
+
+#if defined( KOKKOS_EXPRESSION_CHECK )
+      Kokkos::Cuda::fence();
+#endif
+    }
+  }
+};
+
+/** \brief  Launch for functors no larger than CudaTraits::ConstantMemoryUseThreshold.
+ *
+ *  The functor is passed by value as the argument of the kernel
+ *  cuda_parallel_launch_local_memory.
+ */
+template < class DriverType >
+struct CudaParallelLaunch< DriverType , false > {
+
+  /** \brief  Launch the kernel with \c driver as its argument.
+   *
+   *  Nothing is done when \c grid.x or \c block.x is zero.
+   *
+   *  \param driver  functor to execute on the device
+   *  \param grid    grid dimensions of the launch
+   *  \param block   block dimensions of the launch
+   *  \param shmem   dynamic shared memory request passed to the launch
+   *
+   *  Throws (via throw_runtime_exception) when \c shmem exceeds
+   *  CudaTraits::SharedMemoryCapacity.
+   */
+  inline
+  CudaParallelLaunch( const DriverType & driver ,
+                      const dim3       & grid ,
+                      const dim3       & block ,
+                      const int          shmem )
+  {
+    if ( grid.x && block.x ) {
+
+      // The cache preference must be set on the kernel that is launched
+      // below, i.e. the local-memory kernel, not the constant-memory one.
+      if ( CudaTraits::SharedMemoryCapacity < shmem ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
+      }
+      else if ( shmem ) {
+        cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared );
+      } else {
+        cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 );
+      }
+
+      cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem >>>( driver );
+
+#if defined( KOKKOS_EXPRESSION_CHECK )
+      Kokkos::Cuda::fence();
+#endif
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* defined( __CUDACC__ ) */
+
+#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cu
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cu
@ -0,0 +1,343 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+#include <Kokkos_Cuda.hpp>
+#include <Kokkos_CudaSpace.hpp>
+
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <impl/Kokkos_MemoryTracking.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+// Copy 'n' bytes from 'src' to 'dst' with a blocking cudaMemcpy.
+// All three specializations pass cudaMemcpyDefault, leaving the runtime to
+// work out the transfer direction from the pointer values.
+// NOTE(review): presumably relies on unified virtual addressing - confirm
+// behavior on devices that report unifiedAddressing == 0.
+DeepCopy<HostSpace,CudaSpace>
+  ::DeepCopy( void * dst , const void * src , size_t n )
+{
+  CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
+}
+
+// Same copy for the <CudaSpace,HostSpace> pairing.
+DeepCopy<CudaSpace,HostSpace>
+  ::DeepCopy( void * dst , const void * src , size_t n )
+{
+  CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
+}
+
+// Same copy for the <CudaSpace,CudaSpace> pairing.
+DeepCopy<CudaSpace,CudaSpace>
+  ::DeepCopy( void * dst , const void * src , size_t n )
+{
+  CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace {
+
+/** \brief  Tracking record for one CudaSpace allocation.
+ *
+ *  Extends Impl::MemoryTrackingEntry with the allocated pointer, the
+ *  element size and count, and a texture object slot.  The destructor
+ *  releases the allocation with cudaFree.
+ */
+class CudaMemoryTrackingEntry : public Impl::MemoryTrackingEntry
+{
+public:
+
+  void * const                    ptr_alloc ; // pointer passed at construction; freed by the destructor
+  const size_t                    size ;      // size of one element, in bytes
+  const size_t                    count ;     // number of elements
+  Impl::cuda_texture_object_type  tex_obj ;   // starts at 0; set by cuda_texture_object_attach
+
+  // The base class is given the total byte count arg_size * arg_count.
+  CudaMemoryTrackingEntry( const std::string & arg_label ,
+                           const std::type_info & arg_info ,
+                           void * const           arg_ptr ,
+                           const size_t           arg_size ,
+                           const size_t           arg_count )
+    : Impl::MemoryTrackingEntry( arg_label , arg_info , arg_ptr , arg_size * arg_count )
+    , ptr_alloc( arg_ptr )
+    , size( arg_size )
+    , count( arg_count )
+    , tex_obj( 0 )
+    {}
+
+  ~CudaMemoryTrackingEntry();
+};
+
+/** \brief  Synchronize the device, then free the tracked allocation.
+ *
+ *  Runtime errors from either step are caught rather than propagated out
+ *  of the destructor; their messages are collected and reported on
+ *  std::cerr together with the entry's description.
+ */
+CudaMemoryTrackingEntry::~CudaMemoryTrackingEntry()
+{
+  std::ostringstream failures ;
+  bool failed = false ;
+
+  try {
+    Kokkos::Impl::cuda_device_synchronize();
+  }
+  catch( std::runtime_error & sync_err ) {
+    failed = true ;
+    failures << sync_err.what() << std::endl ;
+  }
+
+  // Placeholder: no action is taken for an attached texture object.
+  if ( tex_obj ) {
+
+  }
+
+  try {
+    CUDA_SAFE_CALL( cudaFree( ptr_alloc ) );
+  }
+  catch( std::runtime_error & free_err ) {
+    failed = true ;
+    failures << free_err.what() << std::endl ;
+  }
+
+  if ( ! failed ) return ;
+
+  std::cerr << "cudaFree( " << ptr_alloc << " ) FAILED for " ;
+  Impl::MemoryTrackingEntry::print( std::cerr );
+  std::cerr << failures.str() << std::endl ;
+}
+
+/** \brief  The single Impl::MemoryTracking instance for CudaSpace,
+ *          a function-local static constructed on first use.
+ */
+Impl::MemoryTracking & cuda_space_singleton()
+{
+  static Impl::MemoryTracking self("Kokkos::CudaSpace");
+  return self ;
+}
+
+/** \brief  Check that the CudaSpace tracking state may be modified now.
+ *
+ *  Modification is refused while HostSpace::in_parallel() is true or when
+ *  the tracking singleton no longer exists.  A refusal is reported on
+ *  std::cerr, prefixed with \c label.
+ *
+ *  \return true when modification is allowed.
+ */
+bool cuda_space_verify_modifiable( const char * const label )
+{
+  const char * reason = 0 ;
+
+  // in_parallel() is tested first; exists() is only consulted otherwise.
+  if ( HostSpace::in_parallel() ) {
+    reason = "Called with HostSpace::in_parallel()" ;
+  }
+  else if ( ! cuda_space_singleton().exists() ) {
+    reason = "Called after return from main()" ;
+  }
+
+  if ( 0 != reason ) {
+    std::cerr << "Kokkos::CudaSpace::" << label << " ERROR : " << reason << std::endl ;
+  }
+
+  return 0 == reason ;
+}
+
+}
+
+/*--------------------------------------------------------------------------*/
+
+/*--------------------------------------------------------------------------*/
+
+/** \brief  Allocate and track scalar_size * scalar_count bytes of Cuda memory.
+ *
+ *  \param label         label recorded with the allocation
+ *  \param scalar_type   type information recorded with the allocation
+ *  \param scalar_size   size of one element, in bytes
+ *  \param scalar_count  number of elements
+ *
+ *  \return the allocated pointer, or 0 when the request is for zero bytes
+ *          or when cuda_space_verify_modifiable refuses the call.
+ *
+ *  Throws (via throw_runtime_exception) when the byte count overflows
+ *  size_t or when the allocation fails.
+ */
+void * CudaSpace::allocate(
+  const std::string    & label ,
+  const std::type_info & scalar_type ,
+  const size_t           scalar_size ,
+  const size_t           scalar_count )
+{
+  void * ptr = 0 ;
+
+  // A wrapped product would silently allocate fewer bytes than the caller
+  // believes it owns, so reject such a request up front.
+  if ( scalar_count && ( ~size_t(0) / scalar_count ) < scalar_size ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::Impl::CudaSpace::allocate( "
+        << label
+        << " , " << scalar_type.name()
+        << " , " << scalar_size
+        << " , " << scalar_count
+        << " ) FAILED: requested size overflows size_t" ;
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  const size_t size = scalar_size * scalar_count ;
+
+  if ( cuda_space_verify_modifiable("allocate") && size ) {
+
+    try {
+      Kokkos::Impl::cuda_device_synchronize();
+
+#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_CUDA_UVM)
+      CUDA_SAFE_CALL( cudaMallocManaged( (void**) &ptr, size, cudaMemAttachGlobal) );
+#else
+      CUDA_SAFE_CALL( cudaMalloc( (void**) &ptr, size) );
+#endif
+
+      Kokkos::Impl::cuda_device_synchronize();
+    }
+    catch( std::runtime_error & err ) {
+      // Re-throw with the request parameters added to the message.
+      std::ostringstream msg ;
+      msg << "Kokkos::Impl::CudaSpace::allocate( "
+          << label
+          << " , " << scalar_type.name()
+          << " , " << scalar_size
+          << " , " << scalar_count
+          << " ) FAILED memory allocation\n" 
+          << err.what();
+      Kokkos::Impl::throw_runtime_exception( msg.str() );
+    } 
+
+    // The tracking entry frees the memory in its destructor.
+    cuda_space_singleton().insert(
+      new CudaMemoryTrackingEntry( label , scalar_type , ptr , scalar_size , scalar_count ) );
+  }
+
+  return ptr ;
+}
+
+/** \brief  Forward an increment for \c ptr to the tracking singleton,
+ *          unless modification is currently refused.
+ */
+void CudaSpace::increment( const void * ptr )
+{
+  if ( ! cuda_space_verify_modifiable("increment") ) return ;
+
+  cuda_space_singleton().increment( ptr );
+}
+
+/** \brief  Forward a decrement for \c ptr to the tracking singleton,
+ *          unless modification is currently refused.
+ */
+void CudaSpace::decrement( const void * ptr )
+{
+  if ( ! cuda_space_verify_modifiable("decrement") ) return ;
+
+  cuda_space_singleton().decrement( ptr );
+}
+
+/** \brief  Print the tracked CudaSpace allocations to \c o,
+ *          passing a two-space string as the second argument of print.
+ */
+void CudaSpace::print_memory_view( std::ostream & o )
+{
+  const std::string lead("  ");
+
+  cuda_space_singleton().print( o , lead );
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Label of the tracked allocation found for \c p.
+ *
+ *  \return the entry's label, or "ERROR NOT FOUND" when the tracking
+ *          singleton has no entry for \c p.
+ */
+std::string CudaSpace::query_label( const void * p )
+{
+  const Impl::MemoryTrackingEntry * const found =
+    cuda_space_singleton().query( p );
+
+  if ( 0 == found ) {
+    return std::string("ERROR NOT FOUND");
+  }
+
+  return found->label ;
+}
+
+/** \brief  Raise a runtime exception reporting an attempt to execute a
+ *          Cuda function from non-Cuda space.
+ */
+void CudaSpace::access_error()
+{
+  Kokkos::Impl::throw_runtime_exception(
+    std::string("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" ) );
+}
+
+/** \brief  Raise a runtime exception reporting an attempt to access
+ *          Cuda data from non-Cuda execution; the message includes the
+ *          label that query_label returns for \c ptr.
+ */
+void CudaSpace::access_error( const void * const ptr )
+{
+  std::ostringstream report ;
+
+  report << "Kokkos::CudaSpace::access_error:"
+         << " attempt to access Cuda-data labeled("
+         << query_label( ptr )
+         << ") from non-Cuda execution" ;
+
+  Kokkos::Impl::throw_runtime_exception( report.str() );
+}
+
+/*--------------------------------------------------------------------------*/
+
+} // namespace Kokkos
+
+#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Return the texture object of the tracked allocation at \c ptr,
+ *          creating it on first use.
+ *
+ *  \param desc  channel format used when the texture object is created
+ *  \param ptr   pointer that must equal the originally allocated pointer
+ *               of a tracked CudaSpace allocation
+ *
+ *  \return 0 when \c ptr is null or modification is currently refused;
+ *          otherwise the entry's texture object, which stays 0 when
+ *          Cuda::device_arch() is below 300.
+ *
+ *  Throws (via throw_runtime_exception) when \c ptr is not tracked, is not
+ *  the allocation's original pointer, or the element count reaches the
+ *  limit; throws via CUDA_SAFE_CALL when texture creation fails.
+ */
+::cudaTextureObject_t
+cuda_texture_object_attach(
+  const cudaChannelFormatDesc & desc ,
+  const void * const            ptr )
+{
+  if ( 0 == ptr || ! cuda_space_verify_modifiable("texture_object_attach") ) return 0 ;
+
+  const unsigned max_count = 1 << 28 ;
+
+  CudaMemoryTrackingEntry * entry =
+    dynamic_cast<CudaMemoryTrackingEntry *>( cuda_space_singleton().query( ptr ) );
+
+  const bool ok_found  = 0 != entry ;
+  const bool ok_ptr    = ok_found && ptr == entry->ptr_alloc ;
+  const bool ok_count  = ok_found && entry->count < max_count ;
+
+  if ( ok_found && ok_ptr && ok_count ) {
+
+    // Can only create texture object on device architecture 3.0 or better
+
+    if ( 0 == entry->tex_obj && 300 <= Cuda::device_arch() ) {
+
+      struct cudaResourceDesc resDesc ;
+      struct cudaTextureDesc  texDesc ;
+
+      memset( & resDesc , 0 , sizeof(resDesc) );
+      memset( & texDesc , 0 , sizeof(texDesc) );
+
+      resDesc.resType                = cudaResourceTypeLinear ;
+      resDesc.res.linear.desc        = desc ;
+      resDesc.res.linear.sizeInBytes = entry->size * entry->count ;
+      resDesc.res.linear.devPtr      = entry->ptr_alloc ;
+
+      // Report a failed creation instead of silently handing back
+      // whatever value the texture object slot is left holding.
+      CUDA_SAFE_CALL( cudaCreateTextureObject( & entry->tex_obj, & resDesc, & texDesc, NULL) );
+    }
+  }
+  else {
+    std::ostringstream msg ;
+    msg << "CudaSpace::texture_object_attach( " << ptr << " ) FAILED: " ;
+
+    if ( ! ok_found ) {
+      msg << "Not View allocated" ;
+    }
+    else if ( ! ok_ptr ) {
+      msg << "Not the originally allocated View \"" << entry->label << "\"" ;
+    }
+    else if ( ! ok_count ) {
+      msg << "Cuda texture object limit exceeded "
+          << max_count << " <= " << entry->count ;
+    }
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  return entry->tex_obj ;
+}
+
+// Currently a no-op that always returns 0: the call that would destroy
+// the texture object is commented out and 'tex' is not used.
+int cuda_texture_object_release(::cudaTextureObject_t tex) {
+  return 0;//cudaDestroyTextureObject(tex);
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+// Overload taking a plain pointer; does nothing and always returns 0.
+// It is compiled regardless of CUDA_VERSION, unlike the overload above.
+int cuda_texture_object_release(const void* tex) {
+  return 0;
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cu
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cu
@ -0,0 +1,665 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/*--------------------------------------------------------------------------*/
+/* Kokkos interfaces */
+
+#include <Kokkos_Cuda.hpp>
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+/*--------------------------------------------------------------------------*/
+/* Standard 'C' libraries */
+#include <stdlib.h>
+
+/* Standard 'C++' libraries */
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+// Definition of the constant memory buffer for relocatable device code
+// builds, where Kokkos_CudaExec.hpp only declares it 'extern'.
+#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
+__device__ __constant__
+Kokkos::Impl::CudaTraits::ConstantGlobalBufferType
+kokkos_impl_cuda_constant_memory_buffer ;
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+namespace {
+
+// Kernel that stores the value of __CUDA_ARCH__ it was compiled with
+// into *d_arch, or 0 where that macro is not defined.
+__global__
+void query_cuda_kernel_arch( int * d_arch )
+{
+#if defined( __CUDA_ARCH__ )
+  *d_arch = __CUDA_ARCH__ ;
+#else
+  *d_arch = 0 ;
+#endif
+}
+
+/** Query what compute capability is actually launched to the device.
+ *
+ *  Runs query_cuda_kernel_arch with a single thread and copies the value
+ *  it stored back to the host.  The allocation, the copy and the free are
+ *  checked with CUDA_SAFE_CALL so that a failure is reported instead of
+ *  being returned as an architecture of 0.
+ *
+ *  \return the __CUDA_ARCH__ value written by the kernel.
+ */
+int cuda_kernel_arch()
+{
+  int * d_arch = 0 ;
+  CUDA_SAFE_CALL( cudaMalloc( (void **) & d_arch , sizeof(int) ) );
+
+  query_cuda_kernel_arch<<<1,1>>>( d_arch );
+
+  int arch = 0 ;
+  // The copy is issued after the kernel on the same default stream.
+  CUDA_SAFE_CALL( cudaMemcpy( & arch , d_arch , sizeof(int) , cudaMemcpyDefault ) );
+  CUDA_SAFE_CALL( cudaFree( d_arch ) );
+  return arch ;
+}
+
+/** \brief  Whether the environment variable CUDA_LAUNCH_BLOCKING is set
+ *          to a value that atoi converts to non-zero.
+ */
+bool cuda_launch_blocking()
+{
+  const char * const setting = getenv("CUDA_LAUNCH_BLOCKING");
+
+  // Unset variable: not blocking.
+  return 0 != setting && 0 != atoi( setting );
+}
+
+}
+
+/** \brief  Synchronize the device, unless CUDA_LAUNCH_BLOCKING is in effect.
+ *
+ *  The environment is consulted once, on the first call; the result is
+ *  kept in a function-local static.  Errors from cudaDeviceSynchronize
+ *  are reported through CUDA_SAFE_CALL.
+ */
+void cuda_device_synchronize()
+{
+  static const bool launch_blocking = cuda_launch_blocking();
+
+  // NOTE(review): presumably skipped because launches already block
+  // when CUDA_LAUNCH_BLOCKING is set - confirm.
+  if ( launch_blocking ) return ;
+
+  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+}
+
+/** \brief  Throw a runtime exception describing a Cuda error.
+ *
+ *  \param e     error code, rendered with cudaGetErrorString
+ *  \param name  text placed in front of " error: "
+ *  \param file  source file name, or null to omit the location
+ *  \param line  source line, only used when \c file is non-null
+ */
+void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
+{
+  std::ostringstream report ;
+
+  report << name << " error: " << cudaGetErrorString(e);
+
+  if ( 0 != file ) {
+    report << " " << file << ":" << line ;
+  }
+
+  const std::string text = report.str();
+
+  throw_runtime_exception( text );
+}
+
+//----------------------------------------------------------------------------
+// Some significant cuda device properties:
+//
+// cudaDeviceProp::name                : Text label for device
+// cudaDeviceProp::major               : Device major number
+// cudaDeviceProp::minor               : Device minor number
+// cudaDeviceProp::warpSize            : number of threads per warp
+// cudaDeviceProp::multiProcessorCount : number of multiprocessors
+// cudaDeviceProp::sharedMemPerBlock   : capacity of shared memory per block
+// cudaDeviceProp::totalConstMem       : capacity of constant memory
+// cudaDeviceProp::totalGlobalMem      : capacity of global memory
+// cudaDeviceProp::maxGridSize[3]      : maximum grid size
+
+//
+//  Section 4.4.2.4 of the CUDA Toolkit Reference Manual
+//
+// struct cudaDeviceProp {
+//   char name[256];
+//   size_t totalGlobalMem;
+//   size_t sharedMemPerBlock;
+//   int regsPerBlock;
+//   int warpSize;
+//   size_t memPitch;
+//   int maxThreadsPerBlock;
+//   int maxThreadsDim[3];
+//   int maxGridSize[3];
+//   size_t totalConstMem;
+//   int major;
+//   int minor;
+//   int clockRate;
+//   size_t textureAlignment;
+//   int deviceOverlap;
+//   int multiProcessorCount;
+//   int kernelExecTimeoutEnabled;
+//   int integrated;
+//   int canMapHostMemory;
+//   int computeMode;
+//   int concurrentKernels;
+//   int ECCEnabled;
+//   int pciBusID;
+//   int pciDeviceID;
+//   int tccDriver;
+//   int asyncEngineCount;
+//   int unifiedAddressing;
+//   int memoryClockRate;
+//   int memoryBusWidth;
+//   int l2CacheSize;
+//   int maxThreadsPerMultiProcessor;
+// };
+
+
+namespace {
+
+
+
+/** \brief  Properties of the Cuda devices reported by the runtime,
+ *          queried once in the constructor.
+ */
+class CudaInternalDevices {
+public:
+  enum { MAXIMUM_DEVICE_COUNT = 8 };  // capacity of m_cudaProp
+  struct cudaDeviceProp  m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ;  // filled by cudaGetDeviceProperties, indexed by device
+  int                    m_cudaDevCount ;  // value obtained from cudaGetDeviceCount
+
+  CudaInternalDevices();
+
+  // Access to the single, lazily constructed instance.
+  static const CudaInternalDevices & singleton();
+};
+
+/** \brief  Query the device count and the properties of each device.
+ *
+ *  m_cudaProp holds at most MAXIMUM_DEVICE_COUNT entries, so a larger
+ *  device count is clamped to that capacity (with a warning on std::cerr)
+ *  before the property array is filled.  Runtime errors are reported
+ *  through CUDA_SAFE_CALL.
+ */
+CudaInternalDevices::CudaInternalDevices()
+{
+  // See 'cudaSetDeviceFlags' for host-device thread interaction
+  // Section 4.4.2.6 of the CUDA Toolkit Reference Manual
+
+  CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
+
+  // Never index m_cudaProp beyond its fixed capacity.
+  if ( MAXIMUM_DEVICE_COUNT < m_cudaDevCount ) {
+    std::cerr << "Kokkos::Cuda WARNING: " << m_cudaDevCount
+              << " devices detected, only the first "
+              << int( MAXIMUM_DEVICE_COUNT )
+              << " are available"
+              << std::endl ;
+    m_cudaDevCount = MAXIMUM_DEVICE_COUNT ;
+  }
+
+  for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
+    CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
+  }
+}
+
+// Function-local static instance; the device query in the constructor
+// therefore runs on the first call.
+const CudaInternalDevices & CudaInternalDevices::singleton()
+{
+  static CudaInternalDevices self ; return self ;
+}
+
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Internal state of the Cuda back-end: the selected device,
+ *          its limits and the scratch buffers.
+ */
+class CudaInternal {
+private:
+
+  // Copy construction and assignment are declared private to prevent copying.
+  CudaInternal( const CudaInternal & );
+  CudaInternal & operator = ( const CudaInternal & );
+
+public:
+
+  typedef Cuda::size_type size_type ;
+
+  int         m_cudaDev ;                 // selected device id, -1 until initialize()
+  int         m_cudaArch ;                // value returned by cuda_kernel_arch(), -1 until initialize()
+  unsigned    m_maxWarpCount ;            // warp limit set in initialize()
+  unsigned    m_maxBlock ;                // block limit set in initialize()
+  unsigned    m_maxSharedWords ;          // sharedMemPerBlock in units of sizeof(size_type)
+  size_type   m_scratchSpaceCount ;       // NOTE(review): presumably the current extent of m_scratchSpace - confirm
+  size_type   m_scratchFlagsCount ;       // NOTE(review): presumably the current extent of m_scratchFlags - confirm
+  size_type   m_scratchUnifiedCount ;     // NOTE(review): presumably the current extent of m_scratchUnified - confirm
+  size_type   m_scratchUnifiedSupported ; // copy of cudaDeviceProp::unifiedAddressing
+  size_type * m_scratchSpace ;            // scratch buffers, 0 until allocated
+  size_type * m_scratchFlags ;
+  size_type * m_scratchUnified ;
+
+  static CudaInternal & singleton();
+
+  // Returns non-zero when a device has been selected; otherwise reports on std::cerr.
+  int verify_is_initialized( const char * const label ) const ;
+
+  // Initialized means both the scratch space and scratch flags buffers exist.
+  int is_initialized() const
+    { return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
+
+  void initialize( int cuda_device_id );
+  void finalize();
+
+  void print_configuration( std::ostream & ) const ;
+
+  ~CudaInternal();
+
+  // All members start in the "not initialized" state.
+  CudaInternal()
+    : m_cudaDev( -1 )
+    , m_cudaArch( -1 )
+    , m_maxWarpCount( 0 )
+    , m_maxBlock( 0 ) 
+    , m_maxSharedWords( 0 )
+    , m_scratchSpaceCount( 0 )
+    , m_scratchFlagsCount( 0 )
+    , m_scratchUnifiedCount( 0 )
+    , m_scratchUnifiedSupported( 0 )
+    , m_scratchSpace( 0 )
+    , m_scratchFlags( 0 )
+    , m_scratchUnified( 0 )
+    {}
+
+  // Accessors for the scratch buffers, each taking a requested size.
+  size_type * scratch_space( const size_type size );
+  size_type * scratch_flags( const size_type size );
+  size_type * scratch_unified( const size_type size );
+};
+
+//----------------------------------------------------------------------------
+
+
+/** \brief  Write the build macros and one line per detected device to \c s.
+ *
+ *  Each device line shows name, compute capability, total global memory
+ *  and shared memory per block; the selected device is marked.
+ */
+void CudaInternal::print_configuration( std::ostream & s ) const
+{
+  const CudaInternalDevices & devices = CudaInternalDevices::singleton();
+
+#if defined( KOKKOS_HAVE_CUDA )
+    s << "macro  KOKKOS_HAVE_CUDA      : defined" << std::endl ;
+#endif
+#if defined( CUDA_VERSION )
+    s << "macro  CUDA_VERSION          = " << CUDA_VERSION
+      << " = version " << CUDA_VERSION / 1000
+      << "." << ( CUDA_VERSION % 1000 ) / 10
+      << std::endl ;
+#endif
+
+  const int device_count = devices.m_cudaDevCount ;
+
+  for ( int dev = 0 ; dev < device_count ; ++dev ) {
+    const struct cudaDeviceProp & prop = devices.m_cudaProp[dev] ;
+
+    s << "Kokkos::Cuda[ " << dev << " ] " << prop.name ;
+    s << " capability " << prop.major << "." << prop.minor ;
+    s << ", Total Global Memory: " << human_memory_size(prop.totalGlobalMem) ;
+    s << ", Shared Memory per Block: " << human_memory_size(prop.sharedMemPerBlock);
+
+    if ( dev == m_cudaDev ) {
+      s << " : Selected" ;
+    }
+
+    s << std::endl ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Report a missing finalize() call and reset all members.
+ *
+ *  Any scratch pointer still being non-null is taken as evidence that
+ *  Kokkos::Cuda::finalize() was not called; this is reported on std::cerr.
+ *  The destructor itself does not free the scratch buffers.
+ */
+CudaInternal::~CudaInternal()
+{
+  const bool not_finalized =
+    0 != m_scratchSpace || 0 != m_scratchFlags || 0 != m_scratchUnified ;
+
+  if ( not_finalized ) {
+    std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
+              << std::endl ;
+    std::cerr.flush();
+  }
+
+  // Return every member to its default-constructed value.
+  m_scratchUnified = 0 ;
+  m_scratchFlags   = 0 ;
+  m_scratchSpace   = 0 ;
+
+  m_scratchUnifiedSupported = 0 ;
+  m_scratchUnifiedCount     = 0 ;
+  m_scratchFlagsCount       = 0 ;
+  m_scratchSpaceCount       = 0 ;
+
+  m_maxSharedWords = 0 ;
+  m_maxBlock       = 0 ;
+  m_maxWarpCount   = 0 ;
+
+  m_cudaArch = -1 ;
+  m_cudaDev  = -1 ;
+}
+
+/** \brief  Check that a device has been selected.
+ *
+ *  \param label  name of the calling operation, used in the error message
+ *  \return 1 when m_cudaDev is non-negative, otherwise 0 after reporting
+ *          the problem on std::cerr.
+ */
+int CudaInternal::verify_is_initialized( const char * const label ) const
+{
+  const bool device_selected = 0 <= m_cudaDev ;
+
+  if ( ! device_selected ) {
+    std::cerr << "Kokkos::Cuda::" << label << " : ERROR device not initialized" << std::endl ;
+  }
+
+  return device_selected ;
+}
+
+// The single CudaInternal instance, a function-local static
+// constructed on first use.
+CudaInternal & CudaInternal::singleton()
+{
+  static CudaInternal self ;
+  return self ;
+}
+
+void CudaInternal::initialize( int cuda_device_id )
+{
+  enum { WordSize = sizeof(size_type) };
+
+  if ( ! Cuda::host_mirror_device_type::is_initialized() ) {
+    const std::string msg("Cuda::initialize ERROR : Cuda::host_mirror_device_type is not initialized");
+    throw_runtime_exception( msg );
+  }
+
+  const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
+
+  const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ;
+
+  const bool ok_id   = 0 <= cuda_device_id &&
+                            cuda_device_id < dev_info.m_cudaDevCount ;
+
+  // Need device capability 2.0 or better
+
+  const bool ok_dev = ok_id &&
+    ( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major &&
+      0 <= dev_info.m_cudaProp[ cuda_device_id ].minor );
+
+  if ( ok_init && ok_dev ) {
+
+    const struct cudaDeviceProp & cudaProp =
+      dev_info.m_cudaProp[ cuda_device_id ];
+
+    m_cudaDev = cuda_device_id ;
+
+    CUDA_SAFE_CALL( cudaSetDevice( m_cudaDev ) );
+    CUDA_SAFE_CALL( cudaDeviceReset() );
+    Kokkos::Impl::cuda_device_synchronize();
+
+    // Query what compute capability architecture a kernel executes:
+    m_cudaArch = cuda_kernel_arch();
+
+    if ( m_cudaArch != cudaProp.major * 100 + cudaProp.minor * 10 ) {
+      std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
+                << ( m_cudaArch / 100 ) << "." << ( ( m_cudaArch % 100 ) / 10 )
+                << " on device with compute capability "
+                << cudaProp.major << "." << cudaProp.minor
+                << " , this will likely reduce potential performance."
+                << std::endl ;
+    }
+
+    //----------------------------------
+    // Maximum number of warps,
+    // at most one warp per thread in a warp for reduction.
+
+    // HCE 2012-February :
+    // Found bug in CUDA 4.1 that sometimes a kernel launch would fail
+    // if the thread count == 1024 and a functor is passed to the kernel.
+    // Copying the kernel to constant memory and then launching with
+    // thread count == 1024 would work fine.
+    //
+    // HCE 2012-October :
+    // All compute capabilities support at least 16 warps (512 threads).
+    // However, we have found that 8 warps typically gives better performance.
+
+    m_maxWarpCount = 8 ;
+
+    // m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize ;
+
+    if ( Impl::CudaTraits::WarpSize < m_maxWarpCount ) {
+      m_maxWarpCount = Impl::CudaTraits::WarpSize ;
+    }
+
+    m_maxSharedWords = cudaProp.sharedMemPerBlock / WordSize ;
+
+    //----------------------------------
+    // Maximum number of blocks:
+
+    m_maxBlock = m_cudaArch < 300 ? 65535 : cudaProp.maxGridSize[0] ;
+
+    //----------------------------------
+
+    m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
+
+    if ( ! m_scratchUnifiedSupported ) {
+      std::cout << "Kokkos::Cuda device "
+                << cudaProp.name << " capability "
+                << cudaProp.major << "." << cudaProp.minor
+                << " does not support unified virtual address space"
+                << std::endl ;
+    }
+
+    //----------------------------------
+    // Multiblock reduction uses scratch flags for counters
+    // and scratch space for partial reduction values.
+    // Allocate some initial space.  This will grow as needed.
+
+    {
+      const unsigned reduce_block_count = m_maxWarpCount * Impl::CudaTraits::WarpSize ;
+
+      (void) scratch_unified( 16 * sizeof(size_type) );
+      (void) scratch_flags( reduce_block_count * 2  * sizeof(size_type) );
+      (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
+    }
+  }
+  else {
+
+    std::ostringstream msg ;
+    msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED" ;
+
+    if ( ! ok_init ) {
+      msg << " : Already initialized" ;
+    }
+    if ( ! ok_id ) {
+      msg << " : Device identifier out of range "
+          << "[0.." << dev_info.m_cudaDevCount << "]" ;
+    }
+    else if ( ! ok_dev ) {
+      msg << " : Device " ;
+      msg << dev_info.m_cudaProp[ cuda_device_id ].major ;
+      msg << "." ;
+      msg << dev_info.m_cudaProp[ cuda_device_id ].minor ;
+      msg << " has insufficient capability, required 2.0 or better" ;
+    }
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  } 
+}
+
+//----------------------------------------------------------------------------
+
+// Unit of allocation for the internal scratch buffers:
+// an array of CudaTraits::WarpSize words of type Cuda::size_type.
+typedef Cuda::size_type ScratchGrain[ Impl::CudaTraits::WarpSize ] ;
+// Size of one ScratchGrain in bytes, available as a compile-time constant.
+enum { sizeScratchGrain = sizeof(ScratchGrain) };
+
+
+/** \brief  Return the scratch-flags buffer, enlarging it when it is too small.
+ *
+ *  \param size  Requested capacity in bytes.
+ *  \return      Pointer to the (possibly re-allocated) flags buffer.
+ *
+ *  When the current capacity is below 'size' the old buffer is handed to
+ *  Cuda::memory_space::decrement, a new buffer holding a whole number of
+ *  ScratchGrain units is allocated, and the new buffer is zero-filled.
+ *  The previous contents are not copied.
+ */
+Cuda::size_type *
+CudaInternal::scratch_flags( const Cuda::size_type size )
+{
+  const bool must_grow =
+    verify_is_initialized("scratch_flags") &&
+    m_scratchFlagsCount * sizeScratchGrain < size ;
+
+  if ( ! must_grow ) { return m_scratchFlags ; }
+
+  // Give up the undersized buffer before replacing it.
+  Cuda::memory_space::decrement( m_scratchFlags );
+
+  // Round the byte request up to whole grains.
+  m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+  m_scratchFlags = (size_type *)
+    Cuda::memory_space::allocate( std::string("InternalScratchFlags") ,
+                                  typeid( ScratchGrain ),
+                                  sizeof( ScratchGrain ),
+                                  m_scratchFlagsCount );
+
+  // Flags start out cleared.
+  CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
+
+  return m_scratchFlags ;
+}
+
+/** \brief  Return the scratch-space buffer, enlarging it when it is too small.
+ *
+ *  \param size  Requested capacity in bytes.
+ *  \return      Pointer to the (possibly re-allocated) scratch buffer.
+ *
+ *  When the current capacity is below 'size' the old buffer is handed to
+ *  Cuda::memory_space::decrement and a new buffer holding a whole number
+ *  of ScratchGrain units is allocated.  The new buffer is not initialized
+ *  here and the previous contents are not copied.
+ */
+Cuda::size_type *
+CudaInternal::scratch_space( const Cuda::size_type size )
+{
+  const bool must_grow =
+    verify_is_initialized("scratch_space") &&
+    m_scratchSpaceCount * sizeScratchGrain < size ;
+
+  if ( ! must_grow ) { return m_scratchSpace ; }
+
+  // Give up the undersized buffer before replacing it.
+  Cuda::memory_space::decrement( m_scratchSpace );
+
+  // Round the byte request up to whole grains.
+  m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+  m_scratchSpace = (size_type *)
+    Cuda::memory_space::allocate( std::string("InternalScratchSpace") ,
+                                  typeid( ScratchGrain ),
+                                  sizeof( ScratchGrain ),
+                                  m_scratchSpaceCount );
+
+  return m_scratchSpace ;
+}
+
+/** \brief  Return the host-allocated scratch buffer obtained with cudaHostAlloc.
+ *
+ *  \param size  Requested capacity in bytes; zero releases the buffer.
+ *  \return      Current buffer pointer; may be null.
+ *
+ *  Nothing is changed unless the singleton is initialized and the device
+ *  reported unified addressing support.  The device is synchronized
+ *  before the buffer is freed or replaced.
+ */
+Cuda::size_type *
+CudaInternal::scratch_unified( const Cuda::size_type size )
+{
+  if ( ! verify_is_initialized("scratch_unified") || ! m_scratchUnifiedSupported ) {
+    return m_scratchUnified ;
+  }
+
+  // Grow when the current capacity is below the request;
+  // release when growing or when the caller asks for zero bytes.
+  const bool grow    = m_scratchUnifiedCount * sizeScratchGrain < size ;
+  const bool release = m_scratchUnified && ( 0 == size || grow );
+
+  if ( grow || release ) {
+    Kokkos::Impl::cuda_device_synchronize();
+  }
+
+  if ( release ) {
+    CUDA_SAFE_CALL( cudaFreeHost( m_scratchUnified ) );
+
+    m_scratchUnifiedCount = 0 ;
+    m_scratchUnified      = 0 ;
+  }
+
+  if ( grow ) {
+    // Round the byte request up to whole grains.
+    m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+    CUDA_SAFE_CALL( cudaHostAlloc( (void **)( & m_scratchUnified ) ,
+                    m_scratchUnifiedCount * sizeScratchGrain ,
+                    cudaHostAllocDefault ) );
+  }
+
+  return m_scratchUnified ;
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Release the internal scratch buffers and reset the cached limits.
+ *
+ *  Does nothing when neither the scratch-space nor the scratch-flags
+ *  pointer is set.
+ */
+void CudaInternal::finalize()
+{
+  const bool nothing_allocated = 0 == m_scratchSpace && 0 == m_scratchFlags ;
+
+  if ( nothing_allocated ) { return ; }
+
+  // Release the buffers while the pointers are still set:
+  // scratch_unified() verifies the initialized state before acting.
+  Cuda::memory_space::decrement( m_scratchSpace );
+  Cuda::memory_space::decrement( m_scratchFlags );
+  (void) scratch_unified( 0 );
+
+  m_scratchSpace       = 0 ;
+  m_scratchFlags       = 0 ;
+  m_scratchSpaceCount  = 0 ;
+  m_scratchFlagsCount  = 0 ;
+  m_maxSharedWords     = 0 ;
+  m_maxBlock           = 0 ;
+  m_maxWarpCount       = 0 ;
+  m_cudaDev            = -1 ;
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Maximum warp count recorded by the CudaInternal singleton. */
+Cuda::size_type cuda_internal_maximum_warp_count()
+{
+  return CudaInternal::singleton().m_maxWarpCount ;
+}
+
+/** \brief  Maximum block count recorded by the CudaInternal singleton. */
+Cuda::size_type cuda_internal_maximum_grid_count()
+{
+  return CudaInternal::singleton().m_maxBlock ;
+}
+
+/** \brief  Maximum shared-memory word count recorded by the CudaInternal singleton. */
+Cuda::size_type cuda_internal_maximum_shared_words()
+{
+  return CudaInternal::singleton().m_maxSharedWords ;
+}
+
+/** \brief  Forward to CudaInternal::scratch_space on the singleton.
+ *
+ *  \param size  Requested capacity in bytes.
+ */
+Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size )
+{
+  return CudaInternal::singleton().scratch_space( size );
+}
+
+/** \brief  Forward to CudaInternal::scratch_flags on the singleton.
+ *
+ *  \param size  Requested capacity in bytes.
+ */
+Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size )
+{
+  return CudaInternal::singleton().scratch_flags( size );
+}
+
+/** \brief  Forward to CudaInternal::scratch_unified on the singleton.
+ *
+ *  \param size  Requested capacity in bytes; zero releases the buffer.
+ */
+Cuda::size_type * cuda_internal_scratch_unified( const Cuda::size_type size )
+{
+  return CudaInternal::singleton().scratch_unified( size );
+}
+
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Device count held by the CudaInternalDevices singleton. */
+Cuda::size_type Cuda::detect_device_count()
+{
+  return Impl::CudaInternalDevices::singleton().m_cudaDevCount ;
+}
+
+/** \brief  Report the initialization state of the CudaInternal singleton. */
+int Cuda::is_initialized()
+{
+  return Impl::CudaInternal::singleton().is_initialized();
+}
+
+/** \brief  Initialize the CudaInternal singleton on the selected device.
+ *
+ *  \param config  Carries the device id to initialize.
+ */
+void Cuda::initialize( const Cuda::SelectDevice config )
+{
+  Impl::CudaInternal::singleton().initialize( config.cuda_device_id );
+}
+
+/** \brief  Initialize on the device with the given integer id.
+ *
+ *  Wraps the id in a SelectDevice and delegates to that overload.
+ */
+void Cuda::initialize( const int device )
+{
+  const SelectDevice selection( device );
+
+  initialize( selection );
+}
+
+/** \brief  Two-argument overload; the second argument is ignored.
+ *
+ *  Delegates to initialize( device ).
+ */
+void Cuda::initialize( const int device , const int )
+{
+  initialize( device );
+}
+
+/** \brief  Architecture code of every detected device.
+ *
+ *  \return  One entry per device, encoded as  major * 100 + minor .
+ *
+ *  NOTE(review): CudaInternal::initialize compares the kernel architecture
+ *  against  major * 100 + minor * 10 ; the encoding here differs.
+ */
+std::vector<unsigned>
+Cuda::detect_device_arch()
+{
+  const Impl::CudaInternalDevices & devices = Impl::CudaInternalDevices::singleton();
+
+  std::vector<unsigned> arch_codes( devices.m_cudaDevCount );
+
+  for ( int dev = 0 ; dev < devices.m_cudaDevCount ; ++dev ) {
+    const struct cudaDeviceProp & prop = devices.m_cudaProp[dev] ;
+
+    arch_codes[dev] = prop.major * 100 + prop.minor ;
+  }
+
+  return arch_codes ;
+}
+
+/** \brief  Architecture code of the currently selected device.
+ *
+ *  \return  major * 100 + minor  of the device, or zero when the stored
+ *           device id is negative.
+ *
+ *  NOTE(review): CudaInternal::initialize compares the kernel architecture
+ *  against  major * 100 + minor * 10 ; the encoding here differs.
+ */
+Cuda::size_type Cuda::device_arch()
+{
+  const int selected = Impl::CudaInternal::singleton().m_cudaDev ;
+
+  if ( selected < 0 ) { return 0 ; }
+
+  const struct cudaDeviceProp & prop =
+    Impl::CudaInternalDevices::singleton().m_cudaProp[ selected ] ;
+
+  const int arch_code = prop.major * 100 + prop.minor ;
+
+  return arch_code ;
+}
+
+/** \brief  Finalize the CudaInternal singleton. */
+void Cuda::finalize()
+{
+  Impl::CudaInternal::singleton().finalize();
+}
+
+/** \brief  Print the configuration of the CudaInternal singleton.
+ *
+ *  \param s  Output stream; the boolean argument is ignored.
+ */
+void Cuda::print_configuration( std::ostream & s , const bool )
+{
+  Impl::CudaInternal::singleton().print_configuration( s );
+}
+
+/** \brief  Always returns false. */
+bool Cuda::sleep()
+{
+  return false ;
+}
+
+/** \brief  Always returns true. */
+bool Cuda::wake()
+{
+  return true ;
+}
+
+/** \brief  Synchronize via Kokkos::Impl::cuda_device_synchronize. */
+void Cuda::fence()
+{
+  Kokkos::Impl::cuda_device_synchronize();
+}
+
+/** \brief  Maximum team size: the stored maximum warp count shifted left
+ *          by CudaTraits::WarpIndexShift.
+ */
+unsigned Cuda::team_max()
+{
+  return Impl::CudaInternal::singleton().m_maxWarpCount
+           << Impl::CudaTraits::WarpIndexShift ;
+}
+
+/** \brief  Recommended team size: eight warps, chosen to hide latency
+ *          and to be a power of two.
+ */
+unsigned Cuda::team_recommended()
+{
+  const unsigned recommended_warps = 8 ;
+
+  return recommended_warps << Impl::CudaTraits::WarpIndexShift ;
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
@ -0,0 +1,69 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_INTERNAL_HPP
+#define KOKKOS_CUDA_INTERNAL_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Error handler invoked by cuda_internal_safe_call for a failed call.
+ *
+ *  Declared here only; judging by the name it presumably throws
+ *  (confirm against the definition).
+ *
+ *  \param e     Status returned by the CUDA call.
+ *  \param name  Text of the call.
+ *  \param file  Source file of the call site, may be NULL.
+ *  \param line  Source line of the call site.
+ */
+void cuda_internal_error_throw( cudaError e , const char * name, const char * file = NULL, const int line = 0 );
+
+/** \brief  Device synchronization helper; declared here only. */
+void cuda_device_synchronize();
+
+/** \brief  Pass a non-success CUDA status on to cuda_internal_error_throw.
+ *
+ *  A status equal to cudaSuccess is ignored.
+ */
+inline
+void cuda_internal_safe_call( cudaError e , const char * name, const char * file = NULL, const int line = 0)
+{
+  if ( cudaSuccess == e ) { return ; }
+
+  cuda_internal_error_throw( e , name, file, line );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+// Evaluate a CUDA runtime call and hand its status, the text of the call,
+// and the source location to Kokkos::Impl::cuda_internal_safe_call.
+#define CUDA_SAFE_CALL( call ) \
+  Kokkos::Impl::cuda_internal_safe_call( call , #call , __FILE__ , __LINE__ )
+
+#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */
+
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@ -0,0 +1,790 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_PARALLEL_HPP
+#define KOKKOS_CUDA_PARALLEL_HPP
+
+#include <iostream>
+#include <stdio.h>
+
+#if defined( __CUDACC__ )
+
+#include <utility>
+#include <Kokkos_Parallel.hpp>
+
+#include <Cuda/Kokkos_CudaExec.hpp>
+#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Join functor that accumulates values with operator += .
+ *
+ *  Passed to cuda_intra_block_reduce_scan by CudaTeamMember::team_scan.
+ */
+template< typename Type >
+struct CudaJoinFunctor {
+
+  typedef Type value_type ;
+
+  /** \brief  Fold 'input' into 'update'. */
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    volatile const value_type & input )
+  {
+    update += input ;
+  }
+};
+
+/** \brief  Handle given to a functor for one team (thread block) of a league.
+ *
+ *  Device compilation passes ( __CUDA_ARCH__ defined ) see inline
+ *  __device__ definitions; host passes see declarations only.
+ */
+class CudaTeamMember {
+private:
+
+  typedef Kokkos::Cuda                           execution_space ;
+  typedef execution_space::scratch_memory_space  scratch_memory_space ;
+
+  // Working storage used by team_scan().
+  void                * m_team_reduce ;
+  // Team scratch memory handle returned by team_shmem().
+  scratch_memory_space  m_team_shared ;
+  int                   m_league_rank ;
+  int                   m_league_size ;
+
+public:
+
+#if defined( __CUDA_ARCH__ )
+
+  /** \brief  Team scratch memory handle. */
+  __device__ inline
+  const execution_space::scratch_memory_space & team_shmem() const
+    { return m_team_shared ; }
+
+  // Rank and size of the league as passed to the constructor;
+  // rank and size of the team are threadIdx.x and blockDim.x.
+  __device__ inline int league_rank() const { return m_league_rank ; }
+  __device__ inline int league_size() const { return m_league_size ; }
+  __device__ inline int team_rank() const { return threadIdx.x ; }
+  __device__ inline int team_size() const { return blockDim.x ; }
+
+  /** \brief  Block-wide barrier ( __syncthreads ). */
+  __device__ inline void team_barrier() const { __syncthreads(); }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   *
+   *  Uses entries [0 .. blockDim.x] of the m_team_reduce storage as 'Type'.
+   *  Contains __syncthreads() calls, so it must be called by every thread
+   *  of the block, with 'global_accum' either null for all or non-null for all.
+   */
+  template< typename Type >
+  __device__ inline Type team_scan( const Type & value , Type * const global_accum ) const
+    {
+      Type * const base_data = (Type *) m_team_reduce ;
+
+      __syncthreads(); // Don't write in to shared data until all threads have entered this function
+
+      // Entry 0 is the zero seed; each thread's input goes one slot higher.
+      if ( 0 == threadIdx.x ) { base_data[0] = 0 ; }
+
+      base_data[ threadIdx.x + 1 ] = value ;
+
+      // Scan the inputs in place.
+      // NOTE(review): presumably leaves inclusive sums in base_data[1..blockDim.x],
+      // so that base_data[threadIdx.x] is this thread's exclusive sum - confirm.
+      Impl::cuda_intra_block_reduce_scan<true>( Impl::CudaJoinFunctor<Type>() , base_data + 1 );
+
+      if ( global_accum ) {
+        // The last thread adds the last scan entry to the global accumulator
+        // and overwrites that entry with the accumulator's previous value.
+        if ( blockDim.x == threadIdx.x + 1 ) {
+          base_data[ blockDim.x ] = atomic_fetch_add( global_accum , base_data[ blockDim.x ] );
+        }
+        __syncthreads(); // Wait for atomic
+        // Offset this thread's result by the fetched base value.
+        base_data[ threadIdx.x ] += base_data[ blockDim.x ] ;
+      }
+
+      return base_data[ threadIdx.x ];
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename Type >
+  __device__ inline Type team_scan( const Type & value ) const
+    { return this->template team_scan<Type>( value , 0 ); }
+
+  //----------------------------------------
+  // Private for the driver
+
+  // 'shared' is the start of the team_scan storage; the team scratch
+  // memory handle is built from 'shared + shared_begin' and 'shared_size'.
+  __device__ inline
+  CudaTeamMember( void * shared
+                , const int shared_begin
+                , const int shared_size
+                , const int arg_league_rank
+                , const int arg_league_size )
+    : m_team_reduce( shared )
+    , m_team_shared( ((char *)shared) + shared_begin , shared_size )
+    , m_league_rank( arg_league_rank ) 
+    , m_league_size( arg_league_size ) 
+    {}
+
+#else
+
+  // Host compilation pass: declarations only, mirroring the device interface.
+
+  const execution_space::scratch_memory_space & team_shmem() const ;
+
+  int league_rank() const ;
+  int league_size() const ;
+  int team_rank() const ;
+  int team_size() const ;
+
+  void team_barrier() const ;
+
+  template< typename Type >
+  Type team_scan( const Type & value , Type * const global_accum ) const ;
+
+  template< typename Type >
+  Type team_scan( const Type & value ) const ;
+
+  //----------------------------------------
+  // Private for the driver
+
+  // NOTE(review): the third parameter is named 'shared_end' here but
+  // 'shared_size' in the device definition, which uses it as a size.
+  CudaTeamMember( void * shared
+                , const int shared_begin
+                , const int shared_end
+                , const int arg_league_rank
+                , const int arg_league_size );
+
+#endif /* #if defined( __CUDA_ARCH__ ) */
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+
+namespace Kokkos {
+
+/** \brief  Team execution policy for the Cuda execution space.
+ *
+ *  Stores the league size as given and the requested team size capped
+ *  at  MAX_WARP * CudaTraits::WarpSize  threads.
+ */
+template< class WorkArgTag >
+class TeamPolicy< Kokkos::Cuda , WorkArgTag > {
+private:
+
+  enum { MAX_WARP = 8 };
+
+  const int m_league_size ;
+  const int m_team_size ;
+
+  // Cap a requested team size at the maximum thread count of a team.
+  static int clamp_team_size( const int requested )
+    {
+      const int limit = int( MAX_WARP * Impl::CudaTraits::WarpSize );
+      return limit < requested ? limit : requested ;
+    }
+
+public:
+
+  typedef Impl::ExecutionPolicyTag   kokkos_tag ;      ///< Concept tag
+  typedef Kokkos::Cuda               execution_space ; ///< Execution space
+
+  inline int team_size()   const { return m_team_size ; }
+  inline int league_size() const { return m_league_size ; }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicy( execution_space & , int league_size , int team_size_request )
+    : m_league_size( league_size )
+    , m_team_size( clamp_team_size( team_size_request ) )
+    { }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicy( int league_size , int team_size_request )
+    : m_league_size( league_size )
+    , m_team_size( clamp_team_size( team_size_request ) )
+    { }
+
+  /** \brief  Largest team size whose shared memory demand is below
+   *          CudaTraits::SharedMemoryCapacity.
+   *
+   *  Starts at  MAX_WARP * WarpSize  and halves the candidate until the
+   *  sum of global-reduce, team-reduce, and functor team-shared memory
+   *  fits.  Returns zero when no candidate fits.
+   */
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & functor )
+    {
+      int candidate = MAX_WARP * Impl::CudaTraits::WarpSize ;
+
+      while ( candidate ) {
+        const int shmem_size =
+          /* for global reduce */ Impl::cuda_single_inter_block_reduce_scan_shmem<false>( functor , candidate )
+          /* for team   reduce */ + ( candidate + 2 ) * sizeof(double)
+          /* for team   shared */ + Impl::FunctorTeamShmemSize< FunctorType >::value( functor , candidate );
+
+        if ( shmem_size < Impl::CudaTraits::SharedMemoryCapacity ) { return candidate ; }
+
+        candidate >>= 1 ;
+      }
+
+      return candidate ;
+    }
+
+  typedef Kokkos::Impl::CudaTeamMember member_type ;
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Cuda parallel_for over a RangePolicy.
+ *
+ *  The constructor copies functor and policy, chooses the launch
+ *  dimensions, and passes this object to CudaParallelLaunch.
+ *  operator() is the device-side body.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Kokkos::Cuda , void , IntType , P >
+                 , Kokkos::Cuda >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Kokkos::Cuda , void , IntType , P > Policy ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+
+  // Declared private and not defined here.
+  ParallelFor();
+  ParallelFor & operator = ( const ParallelFor & );
+
+public:
+
+  // Each thread starts at its global thread index within the range and
+  // advances by the total number of threads in the grid.
+  inline
+  __device__
+  void operator()(void) const
+  {
+    typedef typename Policy::member_type index_type ;
+
+    const index_type step  = blockDim.x * gridDim.x ;
+    const index_type limit = m_policy.end();
+
+    index_type i = m_policy.begin() + threadIdx.x + blockDim.x * blockIdx.x ;
+
+    while ( i < limit ) {
+      m_functor( i );
+      i += step ;
+    }
+  }
+
+  /** \brief  Copy the arguments and launch the kernel.
+   *
+   *  Block size is  WarpSize * cuda_internal_maximum_warp_count() .
+   *  Grid size is the number of blocks needed to cover the range,
+   *  capped at cuda_internal_maximum_grid_count().
+   */
+  ParallelFor( const FunctorType  & functor ,
+               const Policy       & policy )
+    : m_functor( functor )
+    , m_policy(  policy )
+    {
+      const dim3 block( CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1, 1);
+
+      const int work_count = int( policy.end() - policy.begin() );
+
+      const dim3 grid( std::min( ( work_count + block.x - 1 ) / block.x
+                               , cuda_internal_maximum_grid_count() )
+                     , 1 , 1 );
+
+      CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
+    }
+};
+
+/** \brief  Cuda parallel_for over a TeamPolicy.
+ *
+ *  The constructor copies the functor, sizes the dynamic shared memory,
+ *  and passes this object to CudaParallelLaunch.
+ *  operator() is the device-side body.
+ */
+template< class FunctorType >
+class ParallelFor< FunctorType , Kokkos::TeamPolicy< Kokkos::Cuda , void > , Kokkos::Cuda >
+{
+public:
+  typedef Kokkos::TeamPolicy< Kokkos::Cuda , void >   Policy ;
+  typedef typename Policy::member_type                team_member ;
+  typedef Cuda::size_type                             size_type ;
+
+  // Algorithmic constraints: blockDim.x is a power of two AND blockDim.y == blockDim.z == 1
+  // shared memory utilization:
+  //
+  //  [ team   reduce space ]
+  //  [ team   shared space ]
+  //
+
+  const FunctorType m_functor ;
+  size_type         m_shmem_begin ;  // byte offset of the team shared space
+  size_type         m_shmem_size ;   // byte size   of the team shared space
+  size_type         m_league_size ;
+
+  // Each block takes league ranks  blockIdx.x , blockIdx.x + gridDim.x , ...
+  __device__ inline
+  void operator()(void) const
+  {
+    int league_rank = blockIdx.x ;
+
+    while ( league_rank < m_league_size ) {
+
+      const team_member member( kokkos_impl_cuda_shared_memory<void>()
+                              , m_shmem_begin
+                              , m_shmem_size
+                              , league_rank
+                              , m_league_size );
+
+      m_functor( member );
+
+      league_rank += gridDim.x ;
+    }
+  }
+
+
+  /** \brief  Copy the functor, size shared memory, and launch the kernel.
+   *
+   *  Throws a runtime exception when the total shared memory demand
+   *  exceeds CudaTraits::SharedMemoryCapacity.
+   */
+  ParallelFor( const FunctorType  & functor
+             , const Policy       & policy
+             )
+  : m_functor( functor )
+  , m_shmem_begin( sizeof(double) * ( policy.team_size() + 2 ) )
+  , m_shmem_size( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
+  , m_league_size( policy.league_size() )
+  {
+    // Team scan memory and team shared memory depend upon team size.
+
+    const int shmem_bytes = m_shmem_begin + m_shmem_size ;
+
+    if ( shmem_bytes > CudaTraits::SharedMemoryCapacity ) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
+    }
+
+    // One block per team, capped at the maximum grid count.
+    const int league    = int(policy.league_size());
+    const int grid_max  = int(cuda_internal_maximum_grid_count());
+
+    const dim3 grid( std::min( league , grid_max ) , 1 , 1 );
+    const dim3 block( policy.team_size() , 1 , 1 );
+
+    CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_bytes ); // copy to device and execute
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Cuda parallel_reduce over a RangePolicy.
+ *
+ *  The constructor sizes the launch, obtains the internal scratch buffers,
+ *  launches the kernel, fences, and copies the result into 'result'.
+ *  operator() is the device-side body.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelReduce< FunctorType 
+                    , Kokkos::RangePolicy< Kokkos::Cuda , void , IntType , P >
+                    , Kokkos::Cuda
+                    >
+{
+public:
+  typedef ReduceAdapter< FunctorType >        Reduce ;
+  typedef typename Reduce::pointer_type       pointer_type ;
+  typedef typename Reduce::reference_type     reference_type ;
+  typedef Kokkos::RangePolicy< Kokkos::Cuda , void , IntType , P > Policy ;
+  typedef Cuda::size_type                     size_type ;
+
+  // Algorithmic constraints: blockSize is a power of two AND blockDim.y == blockDim.z == 1
+
+  const FunctorType m_functor ;
+  const Policy      m_policy ;
+  size_type *       m_scratch_space ;  // from cuda_internal_scratch_space
+  size_type *       m_scratch_flags ;  // from cuda_internal_scratch_flags
+  size_type *       m_unified_space ;  // from cuda_internal_scratch_unified, may be null
+
+  // Determine block size constrained by shared memory:
+  // start at eight warps and halve until the reduction's shared memory
+  // demand no longer exceeds the capacity.  May return zero.
+  static inline
+  unsigned local_block_size( const FunctorType & f )
+    {
+      unsigned n = CudaTraits::WarpSize * 8 ;
+      while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false>( f , n ) ) { n >>= 1 ; }
+      return n ;
+    }
+
+  __device__ inline
+  void operator()(void) const
+  {
+    // Size of one reduction value in size_type words.
+    const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
+      word_count( Reduce::value_size( m_functor ) / sizeof(size_type) );
+
+    {
+      // Each thread initializes its own value slot in shared memory.
+      reference_type value =
+        Reduce::init( m_functor , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.x * word_count.value );
+
+      // Number of blocks is bounded so that the reduction can be limited to two passes.
+      // Each thread block is given an approximately equal amount of work to perform.
+      // Accumulate the values for this block.
+      // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+      const Policy range( m_policy , blockIdx.x , gridDim.x );
+
+      for ( typename Policy::member_type iwork = range.begin() + threadIdx.x , iwork_end = range.end() ;
+            iwork < iwork_end ; iwork += blockDim.x ) {
+        m_functor( iwork , value );
+      }
+    }
+
+    // Reduce with final value at blockDim.x - 1 location.
+    if ( cuda_single_inter_block_reduce_scan<false>(
+           m_functor , blockIdx.x , gridDim.x ,
+           kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
+
+      // This is the final block with the final result at the final threads' location
+
+      size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.x - 1 ) * word_count.value ;
+      // Write to the unified space when available, otherwise to the scratch space.
+      size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
+
+      if ( threadIdx.x == 0 ) { Reduce::final( m_functor , shared ); }
+
+      // Barrier only when the value spans more words than one warp has threads.
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
+
+      // The block's threads copy the result out word by word.
+      for ( unsigned i = threadIdx.x ; i < word_count.value ; i += blockDim.x ) { global[i] = shared[i]; }
+    }
+  }
+
+  /** \brief  Execute the reduction and store the result.
+   *
+   *  \param functor  Reduction functor, copied into this object.
+   *  \param policy   Range of work, copied into this object.
+   *  \param result   Destination; written only when result.ptr_on_device() is non-null.
+   *
+   *  NOTE(review): an empty range yields block_count == 0, and a zero
+   *  local_block_size() divides by zero below; neither case is guarded here.
+   */
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & functor 
+                , const Policy       & policy 
+                , const HostViewType & result
+                )
+  : m_functor( functor )
+  , m_policy(  policy )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  {
+    // At most block_size blocks, and no more blocks than needed to cover the range.
+    const int block_size  = local_block_size( functor );
+    const int block_count = std::min( int(block_size)
+                                    , ( int(policy.end() - policy.begin()) + block_size - 1 ) / block_size
+                                    );
+
+    m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * block_count );
+    m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
+    m_unified_space = cuda_internal_scratch_unified( Reduce::value_size( functor ) );
+
+    const dim3 grid( block_count , 1 , 1 );
+    const dim3 block( block_size , 1 , 1 );
+    const int shmem = cuda_single_inter_block_reduce_scan_shmem<false>( m_functor , block.x );
+
+    CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
+
+    // Synchronize before reading the result.
+    Cuda::fence();
+
+    if ( result.ptr_on_device() ) {
+      if ( m_unified_space ) {
+        // Read the result directly through the unified-space pointer.
+        const int count = Reduce::value_count( m_functor );
+        for ( int i = 0 ; i < count ; ++i ) { result.ptr_on_device()[i] = pointer_type(m_unified_space)[i] ; }
+      }
+      else {
+        // Otherwise copy the result out of the device scratch space.
+        const int size = Reduce::value_size( m_functor );
+        DeepCopy<HostSpace,CudaSpace>( result.ptr_on_device() , m_scratch_space , size );
+      }
+    }
+  }
+};
+
+/** \brief  Cuda parallel_reduce over a TeamPolicy.
+ *
+ *  The constructor validates the team size, obtains the internal scratch
+ *  buffers, launches the kernel, fences, and copies the result into
+ *  'result'.  operator() is the device-side body.
+ */
+template< class FunctorType >
+class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Kokkos::Cuda , void > , Kokkos::Cuda >
+{
+public:
+  typedef Kokkos::TeamPolicy< Kokkos::Cuda , void >   Policy ;
+  typedef typename Policy::member_type                team_member ;
+  typedef ReduceAdapter< FunctorType >                Reduce ;
+  typedef typename Reduce::pointer_type               pointer_type ;
+  typedef typename Reduce::reference_type             reference_type ;
+  typedef Cuda::size_type                             size_type ;
+
+  // Algorithmic constraints: blockDim.x is a power of two AND blockDim.y == blockDim.z == 1
+  // shared memory utilization:
+  //
+  //  [ global reduce space ]
+  //  [ team   reduce space ]
+  //  [ team   shared space ]
+  //
+
+  const FunctorType m_functor ;
+  size_type *       m_scratch_space ;  // from cuda_internal_scratch_space
+  size_type *       m_scratch_flags ;  // from cuda_internal_scratch_flags
+  size_type *       m_unified_space ;  // from cuda_internal_scratch_unified, may be null
+  size_type         m_team_begin ;     // byte offset of the team reduce space
+  size_type         m_shmem_begin ;    // byte offset of the team shared space, relative to m_team_begin
+  size_type         m_shmem_size ;     // byte size   of the team shared space
+  size_type         m_league_size ;
+
+
+  __device__ inline
+  void operator()(void) const
+  {
+    // Size of one reduction value in size_type words.
+    const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
+      word_count( Reduce::value_size( m_functor ) / sizeof(size_type) );
+
+    // Each thread initializes its own value slot in shared memory.
+    reference_type value =
+      Reduce::init( m_functor , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.x * word_count.value );
+
+    // Iterate this block through the league.
+    // The stride is the number of blocks in the grid ( gridDim.x ),
+    // matching ParallelFor< TeamPolicy >; striding by blockDim.x was
+    // only correct because the launch uses min( league_size , team_size ) blocks.
+    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+
+      const team_member member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
+                              , m_shmem_begin
+                              , m_shmem_size
+                              , league_rank
+                              , m_league_size );
+
+      m_functor( member , value );
+    }
+
+    // Reduce with final value at blockDim.x - 1 location.
+    if ( cuda_single_inter_block_reduce_scan<false>(
+           m_functor , blockIdx.x , gridDim.x ,
+           kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
+
+      // This is the final block with the final result at the final threads' location
+
+      size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.x - 1 ) * word_count.value ;
+      // Write to the unified space when available, otherwise to the scratch space.
+      size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
+
+      if ( threadIdx.x == 0 ) { Reduce::final( m_functor , shared ); }
+
+      // Barrier only when the value spans more words than one warp has threads.
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
+
+      // The block's threads copy the result out word by word.
+      for ( unsigned i = threadIdx.x ; i < word_count.value ; i += blockDim.x ) { global[i] = shared[i]; }
+    }
+  }
+
+
+  /** \brief  Execute the reduction and store the result.
+   *
+   *  \param functor  Reduction functor, copied into this object.
+   *  \param policy   Supplies league size and team size.
+   *  \param result   Destination; written only when result.ptr_on_device() is non-null.
+   *
+   *  Throws a runtime exception when the team size is not a power of two
+   *  or the total shared memory demand exceeds CudaTraits::SharedMemoryCapacity.
+   */
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & functor 
+                , const Policy       & policy 
+                , const HostViewType & result
+                )
+  : m_functor( functor )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  , m_team_begin( cuda_single_inter_block_reduce_scan_shmem<false>( functor , policy.team_size() ) )
+  , m_shmem_begin( sizeof(double) * ( policy.team_size() + 2 ) )
+  , m_shmem_size( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
+  , m_league_size( policy.league_size() )
+  {
+    // Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
+
+    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
+    const int not_power_of_two = 0 != ( policy.team_size() & ( policy.team_size() - 1 ) );
+
+    if ( not_power_of_two ||  CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
+    }
+
+    // One block per team, but never more blocks than threads per block.
+    const int block_count = std::min( policy.league_size() , policy.team_size() );
+
+    m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * block_count );
+    m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
+    m_unified_space = cuda_internal_scratch_unified( Reduce::value_size( functor ) );
+
+    const dim3 grid( block_count , 1 , 1 );
+    const dim3 block( policy.team_size() , 1 , 1 );
+
+    CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
+
+    // Synchronize before reading the result.
+    Cuda::fence();
+
+    if ( result.ptr_on_device() ) {
+      if ( m_unified_space ) {
+        // Read the result directly through the unified-space pointer.
+        const int count = Reduce::value_count( m_functor );
+        for ( int i = 0 ; i < count ; ++i ) { result.ptr_on_device()[i] = pointer_type(m_unified_space)[i] ; }
+      }
+      else {
+        // Otherwise copy the result out of the device scratch space.
+        const int size = Reduce::value_size( m_functor );
+        DeepCopy<HostSpace,CudaSpace>( result.ptr_on_device() , m_scratch_space , size );
+      }
+    }
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Parallel scan over a Kokkos::RangePolicy executed on Cuda.
+ *
+ *  Construction launches the same kernel object twice:
+ *    - first with m_final == false, running initial(): every block
+ *      accumulates its share of the range and the blocks' totals are
+ *      reduced and scanned into m_scratch_space;
+ *    - then with m_final == true, running final(): every block walks its
+ *      share again, starting from the preceding block's scanned total, and
+ *      calls the functor with the exclusive prefix and 'true'.
+ *
+ *  An empty range launches nothing.  The constructor does not synchronize
+ *  with the device; call wait() to fence.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Kokkos::Cuda , void , IntType , P >
+                  , Kokkos::Cuda
+                  >
+{
+public:
+  typedef ReduceAdapter< FunctorType >        Reduce ;
+  typedef typename Reduce::pointer_type       pointer_type ;
+  typedef typename Reduce::reference_type     reference_type ;
+  typedef Kokkos::RangePolicy< Kokkos::Cuda , void , IntType , P > Policy ;
+  typedef Cuda::size_type                     size_type ;
+
+  // Algorithmic constraints:
+  //  (a) blockDim.x is a power of two
+  //  (b) blockDim.y == blockDim.z == 1
+  //  (c) gridDim.x  <= blockDim.x * blockDim.x
+  //  (d) gridDim.y  == gridDim.z == 1
+
+  /** \brief  Determine block size constrained by shared memory.
+   *
+   *  Starts at four warps and halves until the scan workspace for that many
+   *  threads fits into CudaTraits::SharedMemoryCapacity.
+   *
+   *  \return  The chosen thread count, or zero when no thread count fits.
+   */
+  static inline
+  unsigned local_block_size( const FunctorType & f )
+    {
+      // blockDim.x must be power of two = 128 (4 warps) or 256 (8 warps) or 512 (16 warps)
+      // gridDim.x <= blockDim.x * blockDim.x
+      //
+      // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit testing
+
+      unsigned n = CudaTraits::WarpSize * 4 ;
+      while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false>( f , n ) ) { n >>= 1 ; }
+      return n ;
+    }
+
+  const FunctorType m_functor ;
+  const Policy      m_policy ;
+  size_type *       m_scratch_space ;  // per-block totals, later their scan
+  size_type *       m_scratch_flags ;  // counter used to detect the last block
+        size_type   m_final ;          // selects initial() (zero) or final() (non-zero)
+
+  //----------------------------------------
+
+  /** \brief  First pass: accumulate this block's share of the range and
+   *          scan the blocks' totals into m_scratch_space.
+   */
+  __device__ inline
+  void initial(void) const
+  {
+    const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
+      word_count( Reduce::value_size( m_functor ) / sizeof(size_type) );
+
+    // Each thread owns one value, 'word_count' words wide, in shared memory.
+    size_type * const shared_value = kokkos_impl_cuda_shared_memory<size_type>() + word_count.value * threadIdx.x ;
+
+    Reduce::init( m_functor , shared_value );
+
+    // Number of blocks is bounded so that the reduction can be limited to two passes.
+    // Each thread block is given an approximately equal amount of work to perform.
+    // Accumulate the values for this block.
+    // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+    const Policy range( m_policy , blockIdx.x , gridDim.x );
+
+    for ( typename Policy::member_type iwork = range.begin() + threadIdx.x , iwork_end = range.end() ;
+          iwork < iwork_end ; iwork += blockDim.x ) {
+      m_functor( iwork , Reduce::reference( shared_value ) , false );
+    }
+
+    // Reduce and scan, writing out scan of blocks' totals and block-groups' totals.
+    // Blocks' scan values are written to 'blockIdx.x' location.
+    // Block-groups' scan values are at: i = ( j * blockDim.x - 1 ) for i < gridDim.x
+    cuda_single_inter_block_reduce_scan<true>( m_functor , blockIdx.x , gridDim.x , kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags );
+  }
+
+  //----------------------------------------
+
+  /** \brief  Second pass: walk this block's share of the range in chunks of
+   *          blockDim.x items and hand every item its exclusive scan value.
+   */
+  __device__ inline
+  void final(void) const
+  {
+    const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
+      word_count( Reduce::value_size( m_functor ) / sizeof(size_type) );
+
+    // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , value[2] , ... }
+    // Slot 'threadIdx.x' is this thread's exclusive value, slot 'threadIdx.x + 1' its
+    // inclusive value, and slot 'blockDim.x + 1' carries the running total between chunks.
+    size_type * const shared_data   = kokkos_impl_cuda_shared_memory<size_type>();
+    size_type * const shared_prefix = shared_data + word_count.value * threadIdx.x ;
+    size_type * const shared_accum  = shared_data + word_count.value * ( blockDim.x + 1 );
+
+    // Starting value for this thread block is the previous block's total.
+    if ( blockIdx.x ) {
+      size_type * const block_total = m_scratch_space + word_count.value * ( blockIdx.x - 1 );
+      // Each participating thread copies words [threadIdx.x, word_count);
+      // overlapping stores write identical values.
+      for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i] ; }
+    }
+    else if ( 0 == threadIdx.x ) {
+      Reduce::init( m_functor , shared_accum );
+    }
+
+    const Policy range( m_policy , blockIdx.x , gridDim.x );
+
+    // The loop bounds are the same for every thread of the block, so the
+    // barriers inside the loop are reached by all of them.
+    for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.x ) {
+
+      const typename Policy::member_type iwork = iwork_base + threadIdx.x ;
+
+      __syncthreads(); // Don't overwrite previous iteration values until they are used
+
+      Reduce::init( m_functor , shared_prefix + word_count.value );
+
+      // Copy previous block's accumulation total into thread[0] prefix and inclusive scan value of this block
+      // NOTE(review): thread 0 stores every word after its own init above, so its slot ends
+      // up holding the running total; keep the '++i' form unless that ordering is re-verified.
+      for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) {
+        shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
+
+      // Call functor to accumulate inclusive scan value for this work item
+      if ( iwork < range.end() ) { m_functor( iwork , Reduce::reference( shared_prefix + word_count.value ) , false ); }
+
+      // Scan block values into locations shared_data[1..blockDim.x]
+      cuda_intra_block_reduce_scan<true>( m_functor , Reduce::pointer_type(shared_data+word_count.value) );
+
+      {
+        // The last thread's inclusive value is the running total for the next chunk.
+        size_type * const block_total = shared_data + word_count.value * blockDim.x ;
+        for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i]; }
+      }
+
+      // Call functor with exclusive scan value
+      if ( iwork < range.end() ) { m_functor( iwork , Reduce::reference( shared_prefix ) , true ); }
+    }
+  }
+
+  //----------------------------------------
+
+  /** \brief  Kernel entry point: dispatch to the pass selected by m_final. */
+  __device__ inline
+  void operator()(void) const
+  {
+    if ( ! m_final ) {
+      initial();
+    }
+    else {
+      final();
+    }
+  }
+
+  /** \brief  Configure and launch both passes of the scan.
+   *
+   *  \param functor  Scan functor; copied into m_functor.
+   *  \param policy   Range of work items; copied into m_policy.
+   *
+   *  Returns without launching anything when the range is empty.  Calls
+   *  Kokkos::Impl::throw_runtime_exception when local_block_size() finds no
+   *  block size whose workspace fits into shared memory.
+   */
+  ParallelScan( const FunctorType  & functor ,
+                const Policy       & policy )
+  : m_functor( functor )
+  , m_policy( policy )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_final( false )
+  {
+    enum { GridMaxComputeCapability_2x = 0x0ffff };
+
+    const int nwork = policy.end() - policy.begin();
+
+    // Empty range: nothing to scan.  Without this guard 'max_grid' below is
+    // zero and the work-per-block computation divides by zero.
+    if ( nwork <= 0 ) { return ; }
+
+    const int block_size = local_block_size( functor );
+
+    // local_block_size() halves down to zero when even one thread's
+    // workspace exceeds the shared memory capacity; a zero block size would
+    // divide by zero below and cannot be launched.
+    if ( block_size <= 0 ) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelScan< Cuda > functor's value size is too large for shared memory"));
+    }
+
+    // Constraint (c): gridDim.x <= blockDim.x * blockDim.x, capped by the grid limit.
+    const int grid_max = ( block_size * block_size ) < GridMaxComputeCapability_2x ?
+                         ( block_size * block_size ) : GridMaxComputeCapability_2x ;
+
+    // At most 'max_grid' blocks:
+    const int max_grid = std::min( int(grid_max) , int(( nwork + block_size - 1 ) / block_size ));
+
+    // How much work per block:
+    const int work_per_block = ( nwork + max_grid - 1 ) / max_grid ;
+
+    // How many block are really needed for this much work:
+    const dim3 grid( ( nwork + work_per_block - 1 ) / work_per_block , 1 , 1 );
+    const dim3 block( block_size , 1 , 1 );
+    // One value per thread plus two extra values (see final()).
+    const int shmem = Reduce::value_size( functor ) * ( block_size + 2 );
+
+    m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * grid.x );
+    m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 );
+
+    // The object is copied to the device at each launch, so m_final is
+    // changed on the host between the two launches.
+    m_final = false ;
+    CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
+
+    m_final = true ;
+    CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
+  }
+
+  /** \brief  Block the host until previously launched device work has completed. */
+  void wait() const { Cuda::fence(); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* defined( __CUDACC__ ) */
+
+#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */
+
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@ -0,0 +1,260 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_REDUCESCAN_HPP
+#define KOKKOS_CUDA_REDUCESCAN_HPP
+
+#if defined( __CUDACC__ )
+
+#include <utility>
+
+#include <Kokkos_Parallel.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+// See section B.17 of Cuda C Programming Guide Version 3.2
+// for discussion of
+//   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
+// function qualifier which could be used to improve performance.
+//----------------------------------------------------------------------------
+// Maximize shared memory and minimize L1 cache:
+//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
+// For 2.0 capability: 48 KB shared and 16 KB L1
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/*
+ *  Reduce, and optionally scan, one value per thread across a thread block.
+ *
+ *  'base_data' points at blockDim.x consecutive values; thread 't' owns the
+ *  value at 'base_data + value_count * t'.  The callers in this file pass
+ *  shared memory.  After the call the last thread's value holds the join of
+ *  all values.  When 'DoScan' is true a second sweep pushes partial results
+ *  back down; callers in this file treat the outcome as an inclusive scan.
+ *
+ *  The function contains block-wide barriers, so every thread of the block
+ *  must call it.  It calls cuda_abort when blockDim.x is not a power of two.
+ *
+ *  NOTE(review): the intra-warp steps issue no barrier between dependent
+ *  joins, which appears to rely on the lanes of a warp executing in
+ *  lockstep - confirm before targeting hardware that does not guarantee it.
+ *
+ *  Algorithmic constraints:
+ *   (a) blockDim.x is a power of two
+ *   (b) blockDim.x <= 512
+ *   (c) blockDim.y == blockDim.z == 1
+ */
+template< bool DoScan , class FunctorType >
+__device__
+void cuda_intra_block_reduce_scan( const FunctorType & functor ,
+                                   const typename ReduceAdapter< FunctorType >::pointer_type base_data )
+{
+  typedef ReduceAdapter< FunctorType >   Reduce ;
+  typedef typename Reduce::pointer_type  pointer_type ;
+
+  const unsigned value_count   = Reduce::value_count( functor );
+  const unsigned BlockSizeMask = blockDim.x - 1 ;
+
+  // Must have power of two thread count
+
+  if ( BlockSizeMask & blockDim.x ) { cuda_abort("Cuda::cuda_intra_block_scan requires power-of-two blockDim"); }
+
+  // BLOCK_REDUCE_STEP: when the low (S+1) bits of the reversed index R are zero,
+  // join the value (1<<S) threads below TD with TD (presumably into TD).
+  // BLOCK_SCAN_STEP: when this thread's distance N equals (1<<S),
+  // join the value (1<<S) threads below TD with TD.
+#define BLOCK_REDUCE_STEP( R , TD , S )  \
+  if ( ! ( R & ((1<<(S+1))-1) ) ) { Reduce::join( functor , TD , (TD - (value_count<<S)) ); }
+
+#define BLOCK_SCAN_STEP( TD , N , S )  \
+  if ( N == (1<<S) ) { Reduce::join( functor , TD , (TD - (value_count<<S))); }
+
+  // Reversed thread index: zero for the last thread of the block.
+  const unsigned     rtid_intra = threadIdx.x ^ BlockSizeMask ;
+  const pointer_type tdata_intra = base_data + value_count * threadIdx.x ;
+
+  { // Intra-warp reduction:
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
+  }
+
+  __syncthreads(); // Wait for all warps to reduce
+
+  { // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
+    // Shifting the reversed index selects the threads whose shifted index is
+    // still inside the block; each of them then works on the value at
+    // reversed index 'rtid_inter'.
+    const unsigned rtid_inter = ( threadIdx.x ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ;
+
+    if ( rtid_inter < blockDim.x ) {
+
+      const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
+
+      // Only the steps that the block size actually needs are executed.
+      if ( (1<<5) < BlockSizeMask ) {                        BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
+      if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
+      if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
+      if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
+
+      if ( DoScan ) {
+
+        // 'n' is the lowest set bit of rtid_inter among 32..256, or zero if none is set.
+        int n = ( rtid_inter &  32 ) ?  32 : (
+                ( rtid_inter &  64 ) ?  64 : (
+                ( rtid_inter & 128 ) ? 128 : (
+                ( rtid_inter & 256 ) ? 256 : 0 )));
+
+        // Skip the step when 'rtid_inter + n' is not a valid reversed index for this block.
+        if ( ! ( rtid_inter + n < blockDim.x ) ) n = 0 ;
+
+        BLOCK_SCAN_STEP(tdata_inter,n,8)
+        BLOCK_SCAN_STEP(tdata_inter,n,7)
+        BLOCK_SCAN_STEP(tdata_inter,n,6)
+        BLOCK_SCAN_STEP(tdata_inter,n,5)
+      }
+    }
+  }
+
+  __syncthreads(); // Wait for inter-warp reduce-scan to complete
+
+  if ( DoScan ) {
+    // 'n' is the lowest set bit of rtid_intra among 1..16, or zero if none is set.
+    int n = ( rtid_intra &  1 ) ?  1 : (
+            ( rtid_intra &  2 ) ?  2 : (
+            ( rtid_intra &  4 ) ?  4 : (
+            ( rtid_intra &  8 ) ?  8 : (
+            ( rtid_intra & 16 ) ? 16 : 0 ))));
+
+    // Skip the step when 'rtid_intra + n' is not a valid reversed index for this block.
+    if ( ! ( rtid_intra + n < blockDim.x ) ) n = 0 ;
+
+    BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,0)
+  }
+
+#undef BLOCK_SCAN_STEP
+#undef BLOCK_REDUCE_STEP
+}
+
+//----------------------------------------------------------------------------
+/**\brief  Input value-per-thread starting at 'shared_data'.
+ *         Reduction value at last thread's location.
+ *
+ *  If 'DoScan' then write blocks' scan values and block-groups' scan values.
+ *
+ *  Global reduce result is in the last threads' 'shared_data' location.
+ *
+ *  Every thread of the block must call this function; it contains
+ *  block-wide barriers.
+ *
+ *  \param functor       Supplies init/join/copy through ReduceAdapter.
+ *  \param block_id      Index of the calling block's slot in 'global_data'.
+ *  \param block_count   Number of contributing blocks.
+ *  \param shared_data   One value per thread, 'word_count' words each.
+ *  \param global_data   'block_count' values receiving the blocks' totals.
+ *  \param global_flags  Counter of finished blocks; atomicInc wraps it back
+ *                       to zero once 'block_count' blocks have contributed.
+ *  \return  true only in the block that was the last one to contribute.
+ */
+template< bool DoScan , class FunctorType >
+__device__
+bool cuda_single_inter_block_reduce_scan( const FunctorType     & functor ,
+                                          const Cuda::size_type   block_id ,
+                                          const Cuda::size_type   block_count ,
+                                          Cuda::size_type * const shared_data ,
+                                          Cuda::size_type * const global_data ,
+                                          Cuda::size_type * const global_flags )
+{
+  typedef Cuda::size_type                  size_type ;
+  typedef ReduceAdapter< FunctorType >     Reduce ;
+  typedef typename Reduce::pointer_type    pointer_type ;
+  typedef typename Reduce::reference_type  reference_type ;
+
+  const unsigned BlockSizeMask  = blockDim.x - 1 ;
+  const unsigned BlockSizeShift = power_of_two_if_valid( blockDim.x );
+
+  // Must have power of two thread count
+  if ( BlockSizeMask & blockDim.x ) { cuda_abort("Cuda::cuda_single_inter_block_reduce_scan requires power-of-two blockDim"); }
+
+  const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
+    word_count( Reduce::value_size( functor ) / sizeof(size_type) );
+
+  // Reduce the accumulation for the entire block.
+  cuda_intra_block_reduce_scan<false>( functor , pointer_type(shared_data) );
+
+  {
+    // Write accumulation total to global scratch space.
+    // Accumulation total is the last thread's data.
+    size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
+    size_type * const global = global_data + word_count.value * block_id ;
+
+    for ( size_type i = threadIdx.x ; i < word_count.value ; i += blockDim.x ) { global[i] = shared[i] ; }
+  }
+
+  // Publish this block's total before announcing completion:
+  // the fence orders each thread's stores ahead of the counter update as
+  // seen by other blocks, and the barrier keeps thread 0 from bumping the
+  // counter before every thread of this block has stored and fenced.
+  __threadfence();
+  __syncthreads();
+
+  // Contributing blocks note that their contribution has been completed via an atomic-increment flag
+  // If this block is not the last block to contribute to this group then the block is done.
+  // Only thread 0 touches the counter; __syncthreads_or broadcasts its verdict to the block.
+  const bool is_last_block =
+    ! __syncthreads_or( threadIdx.x ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) );
+
+  if ( is_last_block ) {
+
+    // Thread 't' folds the blocks' totals in [b,e); the ranges of all
+    // threads partition [0,block_count).
+    const size_type b = ( long(block_count) * long(threadIdx.x) ) >> BlockSizeShift ;
+    const size_type e = ( long(block_count) * long( threadIdx.x + 1 ) ) >> BlockSizeShift ;
+
+    {
+      void * const shared_ptr = shared_data + word_count.value * threadIdx.x ;
+      reference_type shared_value = Reduce::init( functor , shared_ptr );
+
+      for ( size_type i = b ; i < e ; ++i ) {
+        Reduce::join( functor , shared_ptr , global_data + word_count.value * i );
+      }
+    }
+
+    cuda_intra_block_reduce_scan<DoScan>( functor , pointer_type(shared_data) );
+
+    if ( DoScan ) {
+
+      // Start from the preceding thread's scan value; thread 0 has no
+      // predecessor and uses the slot past the last thread, set to the initial value.
+      size_type * const shared_value = shared_data + word_count.value * ( threadIdx.x ? threadIdx.x - 1 : blockDim.x );
+
+      if ( ! threadIdx.x ) { Reduce::init( functor , shared_value ); }
+
+      // Join previous inclusive scan value to each member
+      for ( size_type i = b ; i < e ; ++i ) {
+        size_type * const global_value = global_data + word_count.value * i ;
+        Reduce::join( functor , shared_value , global_value );
+        Reduce::copy( functor , global_value , shared_value );
+      }
+    }
+  }
+
+  return is_last_block ;
+}
+
+// Shared memory, in bytes, needed by the inter-block reduce or scan:
+// one functor value per thread of the block plus two additional values.
+template< bool DoScan , class FunctorType >
+inline
+unsigned cuda_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize )
+{
+  typedef ReduceAdapter< FunctorType > Reduce ;
+
+  const unsigned slot_count = BlockSize + 2 ;
+
+  return slot_count * Reduce::value_size( functor );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( __CUDACC__ ) */
+#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */
+
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
@ -0,0 +1,335 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Cuda.hpp>
+
+namespace Kokkos {
+
+
+// Generic fallback for shfl_down.
+//
+// A warp shuffle is only available on Kepler or newer GPUs; it does not
+// exist on CPUs or on older GPUs.  This primary template is therefore a
+// placeholder that hands back its own argument unchanged and performs no
+// shuffle at all.  That is acceptable because the function is only meant to
+// be called on a suitable GPU with a suitable Scalar type.  (For example,
+// in the mat-vec, the "ThreadsPerRow" internal parameter depends both on
+// the Device and the Scalar type, and it controls whether shfl_down() gets
+// called.)
+template<typename Scalar>
+KOKKOS_INLINE_FUNCTION
+Scalar shfl_down(const Scalar &val, const int& delta, const int& width){
+  // The offset and the group width play no role in the fallback.
+  (void) delta ;
+  (void) width ;
+  return val;
+}
+
+// unsigned int: shuffle the value's bit pattern through the int intrinsic.
+// Device code for compute capability 3.0 or newer shuffles; everything else
+// returns 'val' unchanged.
+template<>
+KOKKOS_INLINE_FUNCTION
+unsigned int shfl_down<unsigned int>(const unsigned int &val, const int& delta, const int& width){
+#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ >= 300 )
+  unsigned int bits = val;
+  int shuffled = *reinterpret_cast<int*>(&bits);
+  shuffled = __shfl_down(shuffled,delta,width);
+  return *reinterpret_cast<unsigned int*>(&shuffled);
+#else
+  return val;
+#endif
+}
+
+// int: forwards directly to the __shfl_down intrinsic.
+// Device code for compute capability 3.0 or newer shuffles; everything else
+// returns 'val' unchanged.
+template<>
+KOKKOS_INLINE_FUNCTION
+int shfl_down<int>(const int &val, const int& delta, const int& width){
+#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ >= 300 )
+  const int shuffled = __shfl_down(val,delta,width);
+  return shuffled;
+#else
+  return val;
+#endif
+}
+
+// float: forwards directly to the __shfl_down intrinsic.
+// Device code for compute capability 3.0 or newer shuffles; everything else
+// returns 'val' unchanged.
+template<>
+KOKKOS_INLINE_FUNCTION
+float shfl_down<float>(const float &val, const int& delta, const int& width){
+#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ >= 300 )
+  const float shuffled = __shfl_down(val,delta,width);
+  return shuffled;
+#else
+  return val;
+#endif
+}
+
+// double: split into its low and high 32-bit words, shuffle each word
+// separately, and reassemble.  Device code for compute capability 3.0 or
+// newer shuffles; everything else returns 'val' unchanged.
+template<>
+KOKKOS_INLINE_FUNCTION
+double shfl_down<double>(const double &val, const int& delta, const int& width){
+#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ >= 300 )
+  const int lo_word = __shfl_down(__double2loint(val),delta,width);
+  const int hi_word = __shfl_down(__double2hiint(val),delta,width);
+  return __hiloint2double(hi_word,lo_word);
+#else
+  return val;
+#endif
+}
+
+// long int: the value's storage is viewed as a double so that the hi/lo
+// word intrinsics can split it; each word is shuffled and the pair is
+// reassembled.  The double view assumes long int is as wide as double.
+// Device code for compute capability 3.0 or newer shuffles; everything else
+// returns 'val' unchanged.
+template<>
+KOKKOS_INLINE_FUNCTION
+long int shfl_down<long int>(const long int &val, const int& delta, const int& width){
+#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ >= 300 )
+  const double * const as_double = reinterpret_cast<const double*>(&val);
+  const int lo_word = __shfl_down(__double2loint(*as_double),delta,width);
+  const int hi_word = __shfl_down(__double2hiint(*as_double),delta,width);
+  const double shuffled = __hiloint2double(hi_word,lo_word);
+  return *(reinterpret_cast<const long int*>(&shuffled));
+#else
+  return val;
+#endif
+}
+
+// unsigned long: the value's storage is viewed as a double so that the
+// hi/lo word intrinsics can split it; each word is shuffled and the pair is
+// reassembled.  The double view assumes unsigned long is as wide as double.
+// Device code for compute capability 3.0 or newer shuffles; everything else
+// returns 'val' unchanged.
+template<>
+KOKKOS_INLINE_FUNCTION
+unsigned long shfl_down<unsigned long>(const unsigned long &val, const int& delta, const int& width){
+#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ >= 300 )
+  const double * const as_double = reinterpret_cast<const double*>(&val);
+  const int lo_word = __shfl_down(__double2loint(*as_double),delta,width);
+  const int hi_word = __shfl_down(__double2hiint(*as_double),delta,width);
+  const double shuffled = __hiloint2double(hi_word,lo_word);
+  return *(reinterpret_cast<const unsigned long*>(&shuffled));
+#else
+  return val;
+#endif
+}
+
+/** \brief  Cuda specialization of Vectorization: groups of N consecutive
+ *          threads of a team cooperate as one "vector".
+ *
+ *  The rank and size queries divide the team member's values by N.
+ *  reduce() sums a value over the N lanes of a group; the complete sum is
+ *  produced at the first lane of each group (see is_lane_0()).
+ *
+ *  NOTE(review): the reduction steps handle N up to 32 and presumably
+ *  expect N to be a power of two - confirm against the callers.
+ */
+template<int N>
+struct Vectorization<Cuda,N> {
+  typedef Kokkos::TeamPolicy< Cuda >         team_policy ;
+  typedef typename team_policy::member_type  team_member ;
+  enum {increment = N};
+
+  // Lane index of the calling thread within its group (always 0 in host code).
+#ifdef __CUDA_ARCH__
+  KOKKOS_FORCEINLINE_FUNCTION
+  static int begin() { return threadIdx.x%N;}
+#else
+  KOKKOS_FORCEINLINE_FUNCTION
+  static int begin() { return 0;}
+#endif
+
+  // Index of the calling thread's group within its team.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static int thread_rank(const team_member &dev) {
+    return dev.team_rank()/increment;
+  }
+
+  // Same value as thread_rank().
+  KOKKOS_FORCEINLINE_FUNCTION
+  static int team_rank(const team_member &dev) {
+    return dev.team_rank()/increment;
+  }
+
+  // Number of groups per team.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static int team_size(const team_member &dev) {
+    return dev.team_size()/increment;
+  }
+
+  // Index of the calling thread's group across the whole league.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static int global_thread_rank(const team_member &dev) {
+    return (dev.league_rank()*dev.team_size()+dev.team_rank())/increment;
+  }
+
+  // True for the first lane of each group.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static bool is_lane_0(const team_member &dev) {
+    return (dev.team_rank()%increment)==0;
+  }
+
+  /** \brief  Sum 'val' over the lanes of a group through shared memory.
+   *
+   *  Used for Scalar types without a shuffle overload and on devices older
+   *  than compute capability 3.0.  The block is processed in slices of 256
+   *  threads that share one 256-entry buffer.  Host code returns 'val'.
+   *
+   *  NOTE(review): the tree steps read neighbouring entries without a
+   *  barrier, which appears to rely on the lanes of a warp executing in
+   *  lockstep - confirm for the targeted architectures.
+   */
+  template<class Scalar>
+  KOKKOS_INLINE_FUNCTION
+  static Scalar reduce(const Scalar& val) {
+    #ifdef __CUDA_ARCH__
+    __shared__ Scalar result[256];
+    Scalar myresult;
+    for(int k=0;k<blockDim.x;k+=256) {
+      const int tid = threadIdx.x - k;
+      // 'tid >= 0' so that the first thread of each slice takes part too:
+      // it is lane 0 of its group and must not return an unset 'myresult'.
+      if(tid >= 0 && tid<256) {
+        result[tid] = val;
+        if ( (N > 1) && (tid%2==0) )
+          result[tid] += result[tid+1];
+        if ( (N > 2) && (tid%4==0) )
+          result[tid] += result[tid+2];
+        if ( (N > 4) && (tid%8==0) )
+          result[tid] += result[tid+4];
+        if ( (N > 8) && (tid%16==0) )
+          result[tid] += result[tid+8];
+        if ( (N > 16) && (tid%32==0) )
+          result[tid] += result[tid+16];
+        myresult = result[tid];
+      }
+      // The buffer is reused by the next slice, so finish this one first.
+      if(blockDim.x>256)
+        __syncthreads();
+    }
+    return myresult;
+    #else
+    return val;
+    #endif
+  }
+
+#ifdef __CUDA_ARCH__
+  #if (__CUDA_ARCH__ >= 300)
+private:
+  // Sum 'val' over a group of N lanes with shfl_down, doubling the offset each step.
+  template<class Scalar>
+  KOKKOS_INLINE_FUNCTION
+  static Scalar shfl_reduce_sum(const Scalar& val) {
+    Scalar result = val;
+    if (N > 1)
+      result += shfl_down(result, 1,N);
+    if (N > 2)
+      result += shfl_down(result, 2,N);
+    if (N > 4)
+      result += shfl_down(result, 4,N);
+    if (N > 8)
+      result += shfl_down(result, 8,N);
+    if (N > 16)
+      result += shfl_down(result, 16,N);
+    return result;
+  }
+
+public:
+  // Shuffle-based overloads for the types that shfl_down is specialized for.
+  // Being non-template functions they take precedence over the generic
+  // shared-memory reduce() for exactly these argument types.
+  KOKKOS_INLINE_FUNCTION
+  static int reduce(const int& val) { return shfl_reduce_sum(val); }
+
+  KOKKOS_INLINE_FUNCTION
+  static unsigned int reduce(const unsigned int& val) { return shfl_reduce_sum(val); }
+
+  KOKKOS_INLINE_FUNCTION
+  static long int reduce(const long int& val) { return shfl_reduce_sum(val); }
+
+  KOKKOS_INLINE_FUNCTION
+  static unsigned long int reduce(const unsigned long int& val) { return shfl_reduce_sum(val); }
+
+  KOKKOS_INLINE_FUNCTION
+  static float reduce(const float& val) { return shfl_reduce_sum(val); }
+
+  KOKKOS_INLINE_FUNCTION
+  static double reduce(const double& val) { return shfl_reduce_sum(val); }
+  #endif
+#endif
+
+};
+}
+
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
@ -0,0 +1,410 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_VIEW_HPP
+#define KOKKOS_CUDA_VIEW_HPP
+
+#include <cstring>
+
+#if defined( __CUDACC__ )
+#include <cuda_runtime.h>
+#endif
+
+#include <Kokkos_View.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_CudaSpace.hpp>
+#include <Kokkos_CudaTypes.hpp>
+#include <Cuda/Kokkos_Cuda_abort.hpp>
+#include <Kokkos_Atomic.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Bounds-violation handler specialized for CudaSpace.
+ *
+ *  All shape and index arguments are accepted but ignored; the handler
+ *  only forwards a fixed message to Kokkos::cuda_abort.
+ */
+template<>
+struct AssertShapeBoundsAbort< CudaSpace >
+{
+  KOKKOS_INLINE_FUNCTION
+  static void apply( const size_t /* rank */ ,
+                     const size_t /* n0 */ ,
+                     const size_t /* n1 */ ,
+                     const size_t /* n2 */ ,
+                     const size_t /* n3 */ ,
+                     const size_t /* n4 */ ,
+                     const size_t /* n5 */ ,
+                     const size_t /* n6 */ ,
+                     const size_t /* n7 */ ,
+                     const size_t /* arg_rank */ ,
+                     const size_t /* i0 */ ,
+                     const size_t /* i1 */ ,
+                     const size_t /* i2 */ ,
+                     const size_t /* i3 */ ,
+                     const size_t /* i4 */ ,
+                     const size_t /* i5 */ ,
+                     const size_t /* i6 */ ,
+                     const size_t /* i7 */ )
+  {
+    Kokkos::cuda_abort("Kokkos::View array bounds violation");
+  }
+};
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t'
+// to be an 'unsigned long long'.  This could change with a
+// future version of Cuda, and this typedef would then have
+// to change accordingly.
+
+#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
+
+// Texture object handle type used when CUDA_VERSION >= 5000.
+// The enable_if condition requires cudaTextureObject_t to have the same
+// size as a 'const void *'.
+// NOTE(review): enable_if is declared elsewhere; presumably the typedef
+// fails to compile when the sizes differ - confirm.
+typedef enable_if<
+  sizeof(::cudaTextureObject_t) == sizeof(const void *) ,
+  ::cudaTextureObject_t >::type cuda_texture_object_type ;
+
+// Declaration only (defined outside this header): attach a texture object
+// for the given channel format to the given base pointer.
+cuda_texture_object_type
+cuda_texture_object_attach(
+  const cudaChannelFormatDesc & ,
+  const void * const );
+
+// Declaration only: release a texture object given its handle.
+// NOTE(review): meaning of the int return value is not visible here.
+int cuda_texture_object_release(
+    cuda_texture_object_type obj
+    );
+
+// Declaration only: overload taking a pointer instead of a texture handle.
+int cuda_texture_object_release(
+    const void * const
+    );
+
+// Convenience overload: build the channel descriptor for TextureType and
+// forward to the two-argument cuda_texture_object_attach declared above.
+template< typename TextureType >
+inline
+cuda_texture_object_type
+cuda_texture_object_attach( const void * const base_view_ptr )
+{
+  const cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<TextureType>();
+
+  return cuda_texture_object_attach( channel_desc , base_view_ptr );
+}
+
+#else
+
+// Fallback used when CUDA_VERSION is undefined or older than 5000:
+// the texture handle type is just a pointer.
+typedef const void * cuda_texture_object_type ;
+
+// Fallback attach: ignores its argument and always returns a null handle.
+template< typename TextureType >
+inline
+cuda_texture_object_type
+cuda_texture_object_attach( const void * const )
+{ return 0 ; }
+
+// Declaration only (defined outside this header).
+int cuda_texture_object_release(
+    const void * const
+    );
+
+#endif
+
+//----------------------------------------------------------------------------
+
+// Cuda texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4).
+// Via reinterpret_cast this can be used to support all scalar types of those sizes.
+// Any other scalar type falls back to either normal reads out of global memory,
+// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
+
+// Maps a value type to the type used for the texture fetch, selected by
+// sizeof(T).  Sizes other than 4, 8 and 16 map to void.
+template< typename T, size_t size = sizeof(T) >
+struct alias_type { typedef void type; };
+
+// 4-byte types are fetched as int.
+template< typename T >
+struct alias_type<T,4> { typedef int type; };
+
+// 8-byte types are fetched as int2.
+template< typename T >
+struct alias_type<T,8> { typedef int2 type; };
+
+// 16-byte types are fetched as int4.
+template< typename T >
+struct alias_type<T,16> { typedef int4 type; };
+
+/** \brief  Read-only data handle that fetches values through a Cuda texture object.
+ *
+ *  In host code the texture object is shared between copies of the handle
+ *  through a heap-allocated counter: copying increments it, destruction and
+ *  re-assignment decrement it, and the handle that drops the last share
+ *  releases the texture object and frees the counter.  In device code
+ *  (__CUDA_ARCH__ defined) copies are plain member copies without counting,
+ *  and attaching a raw pointer calls cuda_abort.
+ *
+ *  \tparam ValueType  Type returned by operator[].
+ *  \tparam AliasType  Type used for the texture fetch; defaults to
+ *                     alias_type<ValueType>::type.
+ */
+template< typename ValueType, typename AliasType = typename alias_type<ValueType>::type >
+struct CudaTextureFetch {
+  private:
+
+    cuda_texture_object_type  obj ;
+
+    // Host-side share counter (new int[1]); NULL while the handle is empty.
+    int* ref_count ;
+
+    // Put the handle into the empty state: no texture, no counter, no data.
+    KOKKOS_INLINE_FUNCTION
+    void set_empty() {
+      obj = 0 ;
+      ref_count = NULL ;
+      ptr = NULL ;
+    }
+
+    // Host only: give up this handle's share.  atomic_fetch_add returns the
+    // value before the decrement, so 1 means this was the last share.
+    // Members are left untouched; callers overwrite them afterwards.
+    KOKKOS_INLINE_FUNCTION
+    void release_share() {
+#ifndef __CUDA_ARCH__
+      if ( ptr != NULL ) {
+        const int previous = Kokkos::atomic_fetch_add( ref_count , -1 );
+        if ( previous == 1 ) {
+          cuda_texture_object_release( obj );
+          delete [] ref_count ;
+        }
+      }
+#endif
+    }
+
+    // Become a copy of rhs.  Host: take an additional share of rhs's texture
+    // object (or become empty when rhs is empty).  Device: plain member copy.
+    KOKKOS_INLINE_FUNCTION
+    void share_from( const CudaTextureFetch & rhs ) {
+#ifndef __CUDA_ARCH__
+      if ( rhs.ptr != NULL ) {
+        obj = rhs.obj ;
+        ptr = rhs.ptr ;
+        ref_count = rhs.ref_count ;
+        Kokkos::atomic_fetch_add( ref_count , 1 );
+      } else {
+        set_empty();
+      }
+#else
+      obj = rhs.obj ;
+      ref_count = rhs.ref_count ;
+      ptr = rhs.ptr ;
+#endif
+    }
+
+    // Host: create a new texture object and a fresh counter for
+    // base_view_ptr (or become empty for NULL).  Device: abort.
+    KOKKOS_INLINE_FUNCTION
+    void attach_to( const ValueType * const base_view_ptr ) {
+#ifndef __CUDA_ARCH__
+      if ( base_view_ptr != NULL ) {
+        obj = cuda_texture_object_attach<AliasType>( base_view_ptr );
+        ref_count = new int[1] ;
+        ref_count[0] = 1 ;
+        ptr = base_view_ptr ;
+      } else {
+        set_empty();
+      }
+#else
+      cuda_abort("ERROR: Trying to assign a non texture_fetch view to a texture_fetch view in a Device kernel\n.");
+#endif
+    }
+
+  public:
+
+    const ValueType * ptr ;
+
+    //! Construct an empty handle.
+    KOKKOS_INLINE_FUNCTION
+    CudaTextureFetch() : obj( 0 ) , ref_count(0), ptr( 0 ) {}
+
+    //! Give up this handle's share (host only; no-op in device code).
+    KOKKOS_INLINE_FUNCTION
+    ~CudaTextureFetch() {
+      release_share();
+    }
+
+    //! Copy: shares rhs's texture object.
+    KOKKOS_INLINE_FUNCTION
+    CudaTextureFetch( const CudaTextureFetch & rhs ) {
+      share_from( rhs );
+    }
+
+    //! Assignment: drop the current share, then share rhs's texture object.
+    KOKKOS_INLINE_FUNCTION
+    CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) {
+      // Self-assignment must be a no-op: releasing first would destroy the
+      // texture object and free the counter when this handle holds the only
+      // share, and the copy below would then read freed memory.
+      if ( this != &rhs ) {
+        release_share();
+        share_from( rhs );
+      }
+      return *this ;
+    }
+
+    //! Attach to a raw pointer (host only; aborts in device code).
+    explicit KOKKOS_INLINE_FUNCTION
+    CudaTextureFetch( ValueType * const base_view_ptr )
+      : obj( 0 ) , ref_count( 0 ) , ptr( 0 ) // defined state even on the device path
+    {
+      attach_to( base_view_ptr );
+    }
+
+    //! Drop the current share and attach to a raw pointer (host only).
+    KOKKOS_INLINE_FUNCTION
+    CudaTextureFetch & operator = (const ValueType* base_view_ptr) {
+      release_share();
+      attach_to( base_view_ptr );
+      return *this;
+    }
+
+    /** \brief  Read element i.
+     *
+     *  Device code for compute capability >= 3.0 fetches through the texture
+     *  object and reinterprets the fetched AliasType as ValueType.  When
+     *  KOKKOS_USE_LDG_INTRINSIC is defined and the target supports it
+     *  (compute capability >= 3.5) the __ldg intrinsic is used instead.
+     *  All other compilations read ptr[i] directly.
+     */
+    template< typename iType >
+    KOKKOS_INLINE_FUNCTION
+    ValueType operator[]( const iType & i ) const
+    {
+  #if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
+  // Enable the usage of the __ldg intrinsic even in cases where texture fetches work
+  // Currently texture fetches are faster, but that might change in the future
+      return __ldg(&ptr[i]);
+  #elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
+      AliasType v = tex1Dfetch<AliasType>( obj , i );
+
+      return  *(reinterpret_cast<ValueType*> (&v));
+  #else
+      return ptr[ i ];
+  #endif
+    }
+
+};
+
+/** \brief  Specialization for value types without a texture alias type
+ *          (AliasType == void): a plain pointer wrapper with no texture
+ *          object and no reference counting.
+ */
+template< typename ValueType >
+struct CudaTextureFetch< const ValueType, void > {
+
+  const ValueType * ptr ;
+
+  //! Construct an empty handle.
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch() : ptr(0) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~CudaTextureFetch() {
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( const CudaTextureFetch & rhs ) : ptr(rhs.ptr) {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) {
+    ptr = rhs.ptr;
+    return *this ;
+  }
+
+  //! Wrap a raw pointer.
+  explicit KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( ValueType * const base_view_ptr ) : ptr( base_view_ptr ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch & operator = (const ValueType* base_view_ptr) {
+    ptr = base_view_ptr;
+    return *this;
+  }
+
+  /** \brief  Read element i: through the __ldg intrinsic in device code for
+   *          compute capability >= 3.5, otherwise directly from ptr[i].
+   *
+   *  NOTE(review): __ldg is only overloaded for built-in scalar and vector
+   *  types; confirm which ValueTypes reach this specialization.
+   */
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator[]( const iType & i ) const
+  {
+  #if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
+    return __ldg(&ptr[i]);
+  #else
+    return ptr[ i ];
+  #endif
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Replace Default ViewDataHandle with Cuda texture fetch specialization
+ *          if 'const' value type, CudaSpace and random access.
+ */
+template<class ViewTraits>
+class ViewDataHandle<ViewTraits,
+typename enable_if<is_same<typename ViewTraits::memory_space,CudaSpace>::value &&
+                   is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value &&
+                   ViewTraits::memory_traits::RandomAccess
+                  >::type> {
+  typedef ViewDataHandle self_type;
+public:
+  // Element access returns a value (return_type), not a reference.
+  enum {ReferenceAble = 0};
+  typedef Impl::CudaTextureFetch<typename ViewTraits::value_type> type;
+  typedef typename ViewTraits::value_type return_type;
+
+  /** \brief  Allocate 'count' values in the view's memory space and wrap
+   *          the resulting pointer in a texture-fetch handle.
+   *
+   *  'allocate' is a function of the memory space, so it must not be
+   *  prefixed with 'typename' (that would declare it to be a type and
+   *  break instantiation of this member).
+   */
+  static type allocate(std::string label, size_t count) {
+    return type((typename ViewTraits::value_type*)
+                ViewTraits::memory_space::allocate( label ,
+                typeid(typename ViewTraits::value_type) ,
+                sizeof(typename ViewTraits::value_type) ,
+                count ));
+  }
+
+  //! Raw data pointer stored in the handle.
+  KOKKOS_INLINE_FUNCTION
+  static typename ViewTraits::value_type* get_raw_ptr(type handle) {
+    return handle.ptr;
+  }
+};
+
+}
+}
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */
+
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
@ -0,0 +1,103 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_ABORT_HPP
+#define KOKKOS_CUDA_ABORT_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ )
+
+#include <cuda.h>
+
+#if ! defined( CUDA_VERSION ) || ( CUDA_VERSION < 4010 )
+#error "Cuda version 4.1 or greater required"
+#endif
+
+#if ( __CUDA_ARCH__ < 200 )
+#error "Cuda device capability 2.0 or greater required"
+#endif
+
+extern "C" {
+/*  Cuda runtime function, declared in <crt/device_runtime.h>
+ *  Requires capability 2.x or better.
+ *
+ *  Re-declared here so that cuda_abort below can call it.
+ *  NOTE(review): presumably reports message/file/line/function and then
+ *  terminates the kernel; charsize is presumably the character width of
+ *  the strings - confirm against the Cuda headers.
+ */
+extern __device__ void __assertfail(
+  const void  *message,
+  const void  *file,
+  unsigned int line,
+  const void  *function,
+  size_t       charsize);
+}
+
+namespace Kokkos {
+
+/*  Device-side abort: forwards 'message' to __assertfail with an empty
+ *  file name, line 0 and an empty function name.
+ */
+__device__ inline
+void cuda_abort( const char * const message )
+{
+  const char no_text[] = "" ;
+
+  const void * const msg_arg      = (const void *) message ;
+  const void * const file_arg     = (const void *) no_text ;
+  const void * const function_arg = (const void *) no_text ;
+
+  __assertfail( msg_arg , file_arg , (unsigned int) 0 , function_arg , sizeof(char) );
+}
+
+} // namespace Kokkos
+
+#else
+
+namespace Kokkos {
+// Version used when not compiling Cuda device code: accepts the message
+// and does nothing, so calls from host code neither report nor abort.
+KOKKOS_INLINE_FUNCTION
+void cuda_abort( const char * const ) {}
+}
+
+#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_Atomic.hpp
+++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp
@ -0,0 +1,199 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Atomic.hpp
+/// \brief Atomic functions
+///
+/// This header file defines prototypes for the following atomic functions:
+///   - exchange
+///   - compare and exchange
+///   - add
+///
+/// Supported types include:
+///   - signed and unsigned 4 and 8 byte integers
+///   - float
+///   - double
+///
+/// They are implemented through GCC compatible intrinsics, OpenMP
+/// directives and native CUDA intrinsics.
+///
+/// Including this header file requires one of the following
+/// compilers:
+///   - NVCC (for CUDA device code only)
+///   - GCC (for host code only)
+///   - Intel (for host code only)
+///   - A compiler that supports OpenMP 3.1 (for host code only)
+
+#ifndef KOKKOS_ATOMIC_HPP
+#define KOKKOS_ATOMIC_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_Traits.hpp>
+
+//----------------------------------------------------------------------------
+
+#if defined( __CUDA_ARCH__ )
+
+// Compiling NVIDIA device code, must use Cuda atomics:
+
+#define KOKKOS_ATOMICS_USE_CUDA
+
+#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \
+      ! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
+      ! defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+// Compiling for non-Cuda atomic implementation has not been pre-selected.
+// Choose the best implementation for the detected compiler.
+// Preference: GCC, INTEL, OMP31
+
+#if defined( KOKKOS_COMPILER_GNU ) || \
+    defined( KOKKOS_COMPILER_CLANG )
+
+#define KOKKOS_ATOMICS_USE_GCC
+
+#elif defined( KOKKOS_COMPILER_INTEL ) || \
+      defined( KOKKOS_COMPILER_CRAYC )
+
+#define KOKKOS_ATOMICS_USE_INTEL
+
+#elif defined( _OPENMP ) && ( 201107 <= _OPENMP )
+
+#define KOKKOS_ATOMICS_USE_OMP31
+
+#else
+
+#error "KOKKOS_ATOMICS_USE : Unsupported compiler"
+
+#endif
+
+#endif /* Not pre-selected atomic implementation */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+
+/// \brief Name of the atomic implementation selected at compile time.
+///
+/// Returns the name of the KOKKOS_ATOMICS_USE_* macro that is defined,
+/// checked in the order CUDA, GCC, INTEL, OMP31.  If none is defined the
+/// function returns "KOKKOS_ATOMICS_USE_UNKNOWN" instead of flowing off
+/// the end without a return value.
+inline
+const char * atomic_query_version()
+{
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+  return "KOKKOS_ATOMICS_USE_CUDA" ;
+#elif defined( KOKKOS_ATOMICS_USE_GCC )
+  return "KOKKOS_ATOMICS_USE_GCC" ;
+#elif defined( KOKKOS_ATOMICS_USE_INTEL )
+  return "KOKKOS_ATOMICS_USE_INTEL" ;
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+  return "KOKKOS_ATOMICS_USE_OMP31" ;
+#else
+  // Every control path of a non-void function must return a value.
+  return "KOKKOS_ATOMICS_USE_UNKNOWN" ;
+#endif
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+// Atomic exchange
+//
+// template< typename T >
+// T atomic_exchange( volatile T* const dest , const T val )
+// { T tmp = *dest ; *dest = val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Exchange.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic compare-and-exchange
+//
+// template<class T>
+// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
+// { bool equal = compare == *dest ; if ( equal ) { *dest = val ; } return equal ; }
+
+#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and add
+//
+// template<class T>
+// T atomic_fetch_add(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest += val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Add.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and or
+//
+// template<class T>
+// T atomic_fetch_or(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest = tmp | val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Or.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and and
+//
+// template<class T>
+// T atomic_fetch_and(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest = tmp & val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_And.hpp"
+
+//----------------------------------------------------------------------------
+// Memory fence
+//
+// All loads and stores from this thread will be globally consistent before continuing
+//
+// void memory_fence() {...};
+#include "impl/Kokkos_Memory_Fence.hpp"
+
+//----------------------------------------------------------------------------
+// Provide volatile_load and safe_load
+//
+// T volatile_load(T const volatile * const ptr);
+//
+// T const& safe_load(T const * const ptr);
+// XEON PHI
+// T safe_load(T const * const ptr);
+
+#include "impl/Kokkos_Volatile_Load.hpp"
+
+#include "impl/Kokkos_Atomic_Generic.hpp"
+
+#endif /* KOKKOS_ATOMIC_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_Core.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core.hpp
@ -0,0 +1,85 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_HPP
+#define KOKKOS_CORE_HPP
+
+//----------------------------------------------------------------------------
+// Include the execution space header files for the enabled execution spaces.
+
+#include <Kokkos_Macros.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA )
+#include <Kokkos_Cuda.hpp>
+#endif
+
+#if defined( KOKKOS_HAVE_OPENMP )
+#include <Kokkos_OpenMP.hpp>
+#endif
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+#include <Kokkos_Threads.hpp>
+#endif
+
+#include <Kokkos_Serial.hpp>
+
+#include <Kokkos_View.hpp>
+#include <Kokkos_Vectorization.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <Kokkos_hwloc.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+  // Declarations only; the definitions are not part of this header.
+
+  // NOTE(review): presumably initializes the enabled execution spaces
+  // with default settings - confirm against the definition.
+  void initialize();
+
+  // Overload taking an argument count and argument array.
+  // NOTE(review): presumably parses command-line options - confirm.
+  void initialize(int narg, char* arg[]);
+
+  // NOTE(review): presumably the counterpart of initialize() - confirm.
+  void finalize();
+  
+  // NOTE(review): presumably waits for outstanding device work - confirm.
+  void fence();
+}
+
+#endif
--- a/lib/kokkos/core/src/Kokkos_CrsArray.hpp
+++ b/lib/kokkos/core/src/Kokkos_CrsArray.hpp
@ -0,0 +1,170 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CRSARRAY_HPP
+#define KOKKOS_CRSARRAY_HPP
+
+#include <string>
+#include <vector>
+
+#include <Kokkos_View.hpp>
+
+namespace Kokkos {
+
+/// \class CrsArray
+/// \brief Compressed row storage array.
+///
+/// \tparam DataType The type of stored entries.  If a CrsArray is
+///   used as the graph of a sparse matrix, then this is usually an
+///   integer type, the type of the column indices in the sparse
+///   matrix.
+///
+/// \tparam Arg1Type The second template parameter, corresponding
+///   either to the Device type (if there are no more template
+///   parameters) or to the Layout type (if there is at least one more
+///   template parameter).
+///
+/// \tparam Arg2Type The third template parameter, which if provided
+///   corresponds to the Device type.
+///
+/// \tparam SizeType The type of row offsets.  Usually the default
+///   parameter suffices.  However, setting a nondefault value is
+///   necessary in some cases, for example, if you want to have a
+///   sparse matrices with dimensions (and therefore column indices)
+///   that fit in \c int, but want to store more than <tt>INT_MAX</tt>
+///   entries in the sparse matrix.
+///
+/// A row has a range of entries:
+/// <ul>
+/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
+/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
+/// <li> <tt> entries( entry ,            i2 , i3 , ... ); </tt> </li>
+/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
+/// </ul>
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type = void,
+          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type>
+class CrsArray {
+private:
+  // Traits of a rank-1 view of DataType; source of the layout and device typedefs.
+  typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;
+
+public:
+  typedef DataType                       data_type;
+  typedef typename traits::array_layout  array_layout;
+  typedef typename traits::device_type   device_type;
+  typedef SizeType                       size_type;
+
+  typedef CrsArray< DataType , Arg1Type , Arg2Type , SizeType > crsarray_type;
+  typedef CrsArray< DataType , array_layout , typename device_type::host_mirror_device_type , SizeType > HostMirror;
+  typedef View< const size_type* , array_layout, device_type >  row_map_type;
+  typedef View<       DataType*  , array_layout, device_type >  entries_type;
+
+  // Stored entries, addressed through the offsets in row_map.
+  entries_type entries;
+  // Row offsets into entries.
+  row_map_type row_map;
+
+  /// Default constructor: both views are default-constructed.
+  CrsArray ()
+    : entries()
+    , row_map()
+  {}
+
+  /// Copy constructor: copies the two views of \c rhs (shallow copy).
+  CrsArray (const CrsArray& rhs)
+    : entries (rhs.entries)
+    , row_map (rhs.row_map)
+  {}
+
+  /// Assign the two views of \c rhs to this object.
+  /// Any deallocation of previously viewed memory is left to the
+  /// assignment operators of the member views.
+  CrsArray& operator= (const CrsArray& rhs)
+  {
+    entries = rhs.entries;
+    row_map = rhs.row_map;
+    return *this;
+  }
+
+  /// Destructor: nothing beyond destroying the two member views.
+  ~CrsArray() {}
+};
+
+//----------------------------------------------------------------------------
+
+// Declaration only: create a CrsArray from a flat vector.
+// NOTE(review): 'input' presumably holds the number of entries per row;
+// the definition is presumably in impl/Kokkos_CrsArray_factory.hpp - confirm.
+template< class CrsArrayType , class InputSizeType >
+typename CrsArrayType::crsarray_type
+create_crsarray( const std::string & label ,
+                 const std::vector< InputSizeType > & input );
+
+// Declaration only: create a CrsArray from a vector of vectors.
+// NOTE(review): 'input' presumably holds the entries of each row - confirm.
+template< class CrsArrayType , class InputSizeType >
+typename CrsArrayType::crsarray_type
+create_crsarray( const std::string & label ,
+                 const std::vector< std::vector< InputSizeType > > & input );
+
+//----------------------------------------------------------------------------
+
+// Declaration only: returns an object of the input's HostMirror type.
+// NOTE(review): presumably may share the input's views when they are
+// already host accessible - confirm against the definition.
+template< class DataType ,
+          class Arg1Type ,
+          class Arg2Type ,
+          typename SizeType >
+typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror_view( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & input );
+
+// Declaration only: returns an object of the input's HostMirror type.
+// NOTE(review): presumably always creates a new host copy - confirm.
+template< class DataType ,
+          class Arg1Type ,
+          class Arg2Type ,
+          typename SizeType >
+typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & input );
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#include <impl/Kokkos_CrsArray_factory.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_CRSARRAY_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp
@ -0,0 +1,285 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_HPP
+#define KOKKOS_CUDA_HPP
+
+#include <Kokkos_Macros.hpp>
+
+//----------------------------------------------------------------------------
+// If CUDA execution space is enabled then use this header file.
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+#if defined( __CUDACC__ )
+
+#include <cuda.h>
+
+/*  Compiling with a CUDA compiler.
+ *
+ *  Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
+ *    CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
+ *
+ *  When generating device code the __CUDA_ARCH__ macro is defined as:
+ *    __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
+ */
+#if ! defined( CUDA_VERSION )
+#error "#include <cuda.h> did not define CUDA_VERSION"
+#endif
+
+#if ( CUDA_VERSION < 4010 )
+#error "Cuda version 4.1 or greater required"
+#endif
+
+#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 200 )
+/*  Compiling with CUDA compiler for device code. */
+#error "Cuda device capability >= 2.0 is required"
+#endif
+
+#endif /* #if defined( __CUDACC__ ) */
+
+//----------------------------------------------------------------------------
+
+#include <iosfwd>
+#include <vector>
+
+#if defined( KOKKOS_HAVE_OPENMP )
+#include <Kokkos_OpenMP.hpp>
+#elif defined( KOKKOS_HAVE_PTHREAD )
+#include <Kokkos_Threads.hpp>
+#else
+#endif
+
+#include <Kokkos_Serial.hpp>
+
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_CudaSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+// Forward declaration only; the complete type is not needed at this point.
+// NOTE(review): presumably defined in Cuda/Kokkos_CudaExec.hpp, which is
+// included at the bottom of this header -- confirm.
+class CudaExec ;
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/// \class Cuda
+/// \brief Kokkos device that uses CUDA to run on GPUs.
+///
+/// A "device" represents a parallel execution model.  It tells Kokkos
+/// how to parallelize the execution of kernels in a parallel_for or
+/// parallel_reduce.  For example, the Threads device uses Pthreads or
+/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language
+/// extensions, and the Serial device executes "parallel" kernels
+/// sequentially.  The Cuda device uses NVIDIA's CUDA programming
+/// model to execute kernels in parallel on GPUs.
+class Cuda {
+public:
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! The tag (what type of kokkos_object is this).
+  typedef Impl::ExecutionSpaceTag  kokkos_tag ;
+  //! The device type (same as this class).
+  typedef Cuda                  device_type ;
+  //! This device's execution space.
+  typedef Cuda                  execution_space ;
+  //! This device's preferred memory space.
+  typedef CudaSpace             memory_space ;
+  //! The size_type typedef best suited for this device.
+  typedef CudaSpace::size_type  size_type ;
+  //! This device's preferred array layout.
+  typedef LayoutLeft            array_layout ;
+
+  //! Scratch memory space type associated with this execution space.
+  typedef ScratchMemorySpace< Cuda >  scratch_memory_space ;
+
+  //! This device's host mirror type.
+  //  Selected at compile time: OpenMP if KOKKOS_HAVE_OPENMP is defined,
+  //  otherwise Threads if KOKKOS_HAVE_PTHREAD is defined, otherwise Serial.
+#if defined( KOKKOS_HAVE_OPENMP )
+  typedef Kokkos::OpenMP       host_mirror_device_type ;
+#elif defined( KOKKOS_HAVE_PTHREAD )
+  typedef Kokkos::Threads      host_mirror_device_type ;
+#else
+  typedef Kokkos::Serial       host_mirror_device_type ;
+#endif
+
+  //@}
+  //! \name Functions that all Kokkos devices must implement.
+  //@{
+
+  /// \brief True if and only if this method is being called in a
+  ///   thread-parallel function.
+  ///
+  /// The answer is fixed at compile time: the device compilation pass
+  /// (where __CUDA_ARCH__ is defined) yields true and the host pass yields
+  /// false.  The value is returned as an int.
+  KOKKOS_INLINE_FUNCTION static int in_parallel() {
+#if defined( __CUDA_ARCH__ )
+    return true;
+#else
+    return false;
+#endif
+  }
+
+  /** \brief  Set the device in a "sleep" state.
+   *
+   * This function sets the device in a "sleep" state in which it is
+   * not ready for work.  This may consume less resources than if the
+   * device were in an "awake" state, but it may also take time to
+   * bring the device from a sleep state to be ready for work.
+   *
+   * \return True if the device is in the "sleep" state, else false if
+   *   the device is actively working and could not enter the "sleep"
+   *   state.
+   */
+  static bool sleep();
+
+  /// \brief Wake the device from the 'sleep' state so it is ready for work.
+  ///
+  /// \return True if the device is in the "ready" state, else "false"
+  ///  if the device is actively working (which also means that it's
+  ///  awake).
+  static bool wake();
+
+  /// \brief Wait until all dispatched functors complete.
+  ///
+  /// The parallel_for or parallel_reduce dispatch of a functor may
+  /// return asynchronously, before the functor completes.  This
+  /// method does not return until all dispatched functors on this
+  /// device have completed.
+  static void fence();
+
+  //! Free any resources being consumed by the device.
+  static void finalize();
+
+  //! Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //@}
+  //--------------------------------------------------------------------------
+  //! \name Device-specific functions
+  //@{
+
+  /// \brief Wrapper around a CUDA device id used to select a device.
+  ///
+  /// A default-constructed value selects device 0.  The converting
+  /// constructor is explicit, so a plain int is not silently turned into a
+  /// SelectDevice.
+  struct SelectDevice {
+    int cuda_device_id ;
+    SelectDevice() : cuda_device_id(0) {}
+    explicit SelectDevice( int id ) : cuda_device_id( id ) {}
+  };
+
+  //! Initialize, telling the CUDA run-time library which device to use.
+  static void initialize( const SelectDevice = SelectDevice() );
+  //! Overload taking the device as a plain int.
+  static void initialize( int device );
+  //! Overload taking the device and a second, unnamed int.
+  //  NOTE(review): the meaning of the second argument is not visible in this
+  //  header -- confirm against the implementation.
+  static void initialize( int device , int );
+
+  //! Query the initialization state.
+  //  NOTE(review): presumably non-zero between initialize() and finalize()
+  //  -- confirm against the implementation.
+  static int is_initialized();
+
+  /// \brief Cuda device architecture of the selected device.
+  ///
+  /// This matches the __CUDA_ARCH__ specification.
+  static size_type device_arch();
+
+  //! Query device count.
+  static size_type detect_device_count();
+
+  /** \brief  Detect the available devices and their architecture
+   *          as defined by the __CUDA_ARCH__ specification.
+   */
+  static std::vector<unsigned> detect_device_arch();
+
+  //  NOTE(review): presumably the maximum and the recommended number of
+  //  threads per team on this device -- confirm against the implementation.
+  static unsigned team_max();
+  static unsigned team_recommended();
+
+  //@}
+  //--------------------------------------------------------------------------
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+/** Specialization pairing Kokkos::Cuda::memory_space with
+ *  Kokkos::Cuda::scratch_memory_space.  Both verify() overloads are empty,
+ *  so this combination is accepted without any check.
+ */
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::Cuda::memory_space
+  , Kokkos::Cuda::scratch_memory_space
+  >
+{
+  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+/** Specialization pairing Kokkos::HostSpace with
+ *  Kokkos::Cuda::scratch_memory_space.  Both verify() overloads report the
+ *  access through CudaSpace::access_error(); the pointer overload passes
+ *  the offending address along.
+ */
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::HostSpace
+  , Kokkos::Cuda::scratch_memory_space
+  >
+{
+  inline static void verify( void ) { CudaSpace::access_error(); }
+  inline static void verify( const void * p ) { CudaSpace::access_error(p); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <Cuda/Kokkos_CudaExec.hpp>
+#include <Cuda/Kokkos_Cuda_View.hpp>
+#include <Cuda/Kokkos_Cuda_Parallel.hpp>
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+#endif /* #ifndef KOKKOS_CUDA_HPP */
+
+
+
--- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
@ -0,0 +1,181 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDASPACE_HPP
+#define KOKKOS_CUDASPACE_HPP
+
+#if defined( __CUDACC__ )
+#include <cuda_runtime.h>
+#endif
+
+#include <iosfwd>
+#include <typeinfo>
+#include <string>
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Cuda/Kokkos_Cuda_abort.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Cuda memory management */
+
+class CudaSpace {
+public:
+
+  //! Concept tag identifying this class as a memory space.
+  typedef Impl::MemorySpaceTag  kokkos_tag ;
+  //! The memory space type (same as this class).
+  typedef CudaSpace             memory_space ;
+  //! Size type for this space; note that it is unsigned int, not size_t.
+  typedef unsigned int          size_type ;
+
+  //! Execution space paired with this memory space; only declared when
+  //! KOKKOS_HAVE_CUDA is defined.
+#if defined( KOKKOS_HAVE_CUDA )
+  typedef Kokkos::Cuda  execution_space ;
+#endif
+
+  /** \brief  Allocate a contiguous block of memory on the Cuda device
+   *          with size = scalar_size * scalar_count.
+   *
+   *  The input label is associated with the block of memory.
+   *  The block of memory is tracked via reference counting where
+   *  allocation gives it a reference count of one.
+   *
+   *  Allocation may only occur on the master thread of the process.
+   */
+  static void * allocate( const std::string    & label ,
+                          const std::type_info & scalar_type ,
+                          const size_t           scalar_size ,
+                          const size_t           scalar_count );
+
+  /** \brief  Increment the reference count of the block of memory
+   *          in which the input pointer resides.
+   *
+   *          Reference counting only occurs on the master thread.
+   */
+  static void increment( const void * );
+
+  /** \brief  Decrement the reference count of the block of memory
+   *          in which the input pointer resides.  If the reference
+   *          count falls to zero the memory is deallocated.
+   *
+   *          Reference counting only occurs on the master thread.
+   */
+  static void decrement( const void * );
+
+  /** \brief  Print all tracked memory to the output stream. */
+  static void print_memory_view( std::ostream & );
+
+  /** \brief  Retrieve label associated with the input pointer */
+  static std::string query_label( const void * );
+
+  /*--------------------------------*/
+
+  /** \brief  Report an invalid access to Cuda memory.
+   *
+   *  Called by the host-side VerifyExecutionCanAccessMemorySpace
+   *  specializations; the second overload also receives the offending
+   *  pointer.
+   *  NOTE(review): how the error is reported (exception, abort, ...) is
+   *  not visible in this header -- confirm against the implementation.
+   */
+  static void access_error();
+  static void access_error( const void * const );
+
+  /*--------------------------------*/
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Copy between HostSpace and CudaSpace; the work is done by the
+ *          constructor.
+ *
+ *  NOTE(review): presumably the template arguments are <destination, source>
+ *  (making this a device-to-host copy) and the size_t is a byte count;
+ *  neither is visible in this header -- confirm against the implementation.
+ */
+template<>
+struct DeepCopy<HostSpace,CudaSpace> {
+  DeepCopy( void * dst , const void * src , size_t );
+};
+
+/** \brief  Copy between CudaSpace and HostSpace; the work is done by the
+ *          constructor.
+ *
+ *  NOTE(review): presumably the template arguments are <destination, source>
+ *  (making this a host-to-device copy) and the size_t is a byte count;
+ *  neither is visible in this header -- confirm against the implementation.
+ */
+template<>
+struct DeepCopy<CudaSpace,HostSpace> {
+  DeepCopy( void * dst , const void * src , size_t );
+};
+
+/** \brief  Copy within CudaSpace; the work is done by the constructor.
+ *
+ *  NOTE(review): the size_t is presumably a byte count -- confirm against
+ *  the implementation.
+ */
+template<>
+struct DeepCopy<CudaSpace,CudaSpace> {
+  DeepCopy( void * dst , const void * src , size_t );
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** Running in HostSpace attempting to access CudaSpace.
+ *
+ *  When KOKKOS_USE_CUDA_UVM is defined both verify() overloads are empty,
+ *  so the access is accepted.  Otherwise they forward to
+ *  CudaSpace::access_error(), the pointer overload passing the address on.
+ */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaSpace >
+{
+#if defined( KOKKOS_USE_CUDA_UVM )
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+#else
+  inline static void verify( void ) { CudaSpace::access_error(); }
+  inline static void verify( const void * p ) { CudaSpace::access_error(p); }
+#endif
+};
+
+/** Running in CudaSpace attempting to access HostSpace.
+ *
+ *  Both verify() overloads unconditionally call Kokkos::cuda_abort() with a
+ *  fixed message; the pointer argument of the second overload is ignored.
+ */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::HostSpace >
+{
+  KOKKOS_INLINE_FUNCTION static void verify( void )
+    { Kokkos::cuda_abort("Cuda code attempted to access HostSpace memory"); }
+
+  KOKKOS_INLINE_FUNCTION static void verify( const void * )
+    { Kokkos::cuda_abort("Cuda code attempted to access HostSpace memory"); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_CUDASPACE_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_CudaTypes.hpp
+++ b/lib/kokkos/core/src/Kokkos_CudaTypes.hpp
@ -0,0 +1,139 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDATYPES_HPP
+#define KOKKOS_CUDATYPES_HPP
+
+#include <Kokkos_Macros.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( __CUDACC__ )
+
+namespace Kokkos {
+
+// When compiled by a CUDA compiler (__CUDACC__ defined) the Kokkos vector
+// types are aliases of the global-namespace types of the same name.
+// NOTE(review): those are presumably the CUDA built-in vector types.  This
+// header includes no CUDA header itself, so they must already be declared
+// when this point is reached -- confirm.
+
+typedef ::int2 int2 ;
+typedef ::int3 int3 ;
+typedef ::int4 int4 ;
+
+typedef ::float2 float2 ;
+typedef ::float3 float3 ;
+typedef ::float4 float4 ;
+
+typedef ::double2 double2 ;
+typedef ::double3 double3 ;
+typedef ::double4 double4 ;
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#else /* NOT #if defined( __CUDACC__ ) */
+
+namespace Kokkos {
+
+// Fallback definitions used when NOT compiling with a CUDA compiler: plain
+// aggregates with members named x, y, z, w in that order.
+//
+// NOTE(review): CUDA's built-in vector types carry explicit alignment for
+// some of these (e.g. 8 bytes for int2/float2, 16 bytes for
+// int4/float4/double2), whereas the structs below only have natural
+// alignment.  If objects of these types cross between translation units
+// built with and without a CUDA compiler, size/alignment may differ --
+// confirm whether that matters for callers.
+
+struct int2 {
+        int x;
+        int y;
+};
+
+struct int3 {
+        int x;
+        int y;
+        int z;
+};
+
+struct int4 {
+        int x;
+        int y;
+        int z;
+        int w;
+};
+
+struct float2 {
+        float x;
+        float y;
+};
+
+struct float3 {
+        float x;
+        float y;
+        float z;
+};
+
+struct float4 {
+        float x;
+        float y;
+        float z;
+        float w;
+};
+
+struct double2 {
+        double x;
+        double y;
+};
+
+struct double3 {
+        double x;
+        double y;
+        double z;
+};
+
+struct double4 {
+        double x;
+        double y;
+        double z;
+        double w;
+};
+
+} // namespace Kokkos
+
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_CUDATYPES_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
@ -0,0 +1,223 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXECPOLICY_HPP
+#define KOKKOS_EXECPOLICY_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Execution policy for work over a range of an integral type.
+ *
+ *  The range is described by begin() and end().  A request whose begin is
+ *  not strictly less than its end is stored as the empty range (0,0).
+ *
+ *  \tparam ExecSpace              execution space the policy is bound to
+ *  \tparam WorkArgTag             optional tag type; not used inside this class
+ *  \tparam IntType                integral type of the range bounds
+ *  \tparam GranularityPowerOfTwo  log2 of the chunk size that partition
+ *                                 sizes are rounded up to (3 means chunks of 8)
+ */
+template< class ExecSpace  = Kokkos::DefaultExecutionSpace
+        , class WorkArgTag = void
+        , typename IntType = int
+        , unsigned GranularityPowerOfTwo = 3 /* Chunk size 8 */
+        >
+class RangePolicy {
+public:
+
+  typedef Impl::ExecutionPolicyTag   kokkos_tag ;      ///< Concept tag
+  typedef ExecSpace                  execution_space ; ///< Execution type
+  typedef IntType                    member_type ;
+
+private:
+
+  // Chunk size and the bit mask used to round up to a multiple of it.
+  enum { Granularity     = IntType(1) << GranularityPowerOfTwo };
+  enum { GranularityMask = IntType(Granularity) - 1 };
+
+  IntType m_begin ;
+  IntType m_end ;
+
+public:
+
+  /** \brief  Empty range */
+  KOKKOS_INLINE_FUNCTION RangePolicy() : m_begin(0), m_end(0) {}
+
+  /** \brief  Total range; stays empty unless work_begin < work_end */
+  KOKKOS_INLINE_FUNCTION
+  RangePolicy( const member_type work_begin
+             , const member_type work_end
+             )
+    : m_begin(0), m_end(0)
+    {
+      if ( work_begin < work_end ) {
+        m_begin = work_begin ;
+        m_end   = work_end ;
+      }
+    }
+
+  /** \brief  Subrange for a partition's rank and size.
+   *
+   *  Typically used to partition a range over a group of threads.
+   *  A part_size of zero leaves the subrange empty.
+   */
+  KOKKOS_INLINE_FUNCTION
+  RangePolicy( const RangePolicy & range
+             , const int part_rank
+             , const int part_size
+             )
+    : m_begin(0), m_end(0)
+    {
+      if ( ! part_size ) return ;
+
+      // Ceiling-divide the range length by the number of partitions, then
+      // round the share up to a multiple of the granularity.  Kept as one
+      // expression so the integer promotions match the declared types.
+      const member_type chunk =
+        ( ( ( ( range.m_end - range.m_begin ) + ( part_size - 1 ) ) / part_size ) + GranularityMask )
+        & ~member_type(GranularityMask);
+
+      const member_type first = range.m_begin + chunk * part_rank ;
+      const member_type last  = first + chunk ;
+
+      // Clamp both ends so the subrange never extends past the parent range.
+      m_begin = ( range.m_end < first ) ? range.m_end : first ;
+      m_end   = ( range.m_end < last  ) ? range.m_end : last ;
+    }
+
+  KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
+  KOKKOS_INLINE_FUNCTION member_type end()   const { return m_end ; }
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Execution policy for parallel work over a league of teams of threads.
+ *
+ *  The work functor is called for each thread of each team such that
+ *  the team's member threads are guaranteed to be concurrent.
+ *
+ *  The team's threads have access to team shared scratch memory and
+ *  team collective operations.
+ *
+ *  If the WorkArgTag is non-void then the first calling argument of the
+ *  work functor's parentheses operator is 'const WorkArgTag &'.
+ *  This allows a functor to have multiple work member functions.
+ */
+template< class ExecSpace  = DefaultExecutionSpace
+        , class WorkArgTag = void >
+class TeamPolicy {
+public:
+
+  // Every member function below is declared but not defined in this class
+  // template.
+  // NOTE(review): presumably each execution space supplies its own
+  // specialization with the definitions -- confirm.
+
+  typedef Impl::ExecutionPolicyTag   kokkos_tag ;      ///< Concept tag
+  typedef ExecSpace                  execution_space ; ///< Execution space
+
+  /** \brief  Query maximum team size for a given functor.
+   *
+   *  This size takes into account execution space concurrency limitations and
+   *  scratch memory space limitations for reductions, team reduce/scan, and
+   *  team shared memory.
+   */
+  template< class FunctorType >
+  static int team_size_max( const FunctorType & );
+
+  /** \brief  Construct policy with the given instance of the execution space */
+  TeamPolicy( execution_space & , int league_size_request , int team_size_request );
+
+  /** \brief  Construct policy with the default instance of the execution space */
+  TeamPolicy( int league_size_request , int team_size_request );
+
+  /** \brief  The actual league size (number of teams) of the policy.
+   *
+   *  This may be smaller than the requested league size due to limitations
+   *  of the execution space.
+   */
+  KOKKOS_INLINE_FUNCTION int league_size() const ;
+
+  /** \brief  The actual team size (number of threads per team) of the policy.
+   *
+   *  This may be smaller than the requested team size due to limitations
+   *  of the execution space.
+   */
+  KOKKOS_INLINE_FUNCTION int team_size() const ;
+
+  /** \brief  Parallel execution of a functor calls the functor once with
+   *          each member of the execution policy.
+   */
+  struct member_type {
+
+    /** \brief  Handle to the currently executing team shared scratch memory */
+    KOKKOS_INLINE_FUNCTION
+    typename execution_space::scratch_memory_space team_shmem() const ;
+
+    /** \brief  Rank of this team within the league of teams */
+    KOKKOS_INLINE_FUNCTION int league_rank() const ;
+
+    /** \brief  Number of teams in the league */
+    KOKKOS_INLINE_FUNCTION int league_size() const ;
+
+    /** \brief  Rank of this thread within this team */
+    KOKKOS_INLINE_FUNCTION int team_rank() const ;
+
+    /** \brief  Number of threads in this team */
+    KOKKOS_INLINE_FUNCTION int team_size() const ;
+
+    /** \brief  Barrier among the threads of this team */
+    KOKKOS_INLINE_FUNCTION void team_barrier();
+
+    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+     *
+     *  The highest rank thread can compute the reduction total as
+     *    reduction_total = dev.team_scan( value ) + value ;
+     *  where 'dev' is that thread's member_type object.
+     */
+    template< typename Type >
+    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value );
+
+    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+     *          with inter-team non-deterministic ordering accumulation.
+     *
+     *  The global inter-team accumulation value will, at the end of the
+     *  league's parallel execution, be the scan's total.
+     *  Parallel execution ordering of the league's teams is non-deterministic.
+     *  As such the base value for each team's scan operation is similarly
+     *  non-deterministic.
+     */
+    template< typename Type >
+    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum );
+  };
+};
+
+} // namespace Kokkos
+
+#endif /* #define KOKKOS_EXECPOLICY_HPP */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
--- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
@ -0,0 +1,158 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HOSTSPACE_HPP
+#define KOKKOS_HOSTSPACE_HPP
+
+#include <iosfwd>
+#include <typeinfo>
+#include <string>
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_MemoryTracking.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Memory management on the host for devices */
+
+class HostSpace {
+public:
+
+  //! Concept tag identifying this class as a memory space.
+  typedef Impl::MemorySpaceTag  kokkos_tag ;
+  //! The memory space type (same as this class).
+  typedef HostSpace             memory_space ;
+  //! Size type for this space.
+  typedef size_t                size_type ;
+
+  // Default execution space for this memory space.
+  // Precedence: an explicitly configured default device type (OpenMP, then
+  // Threads) wins; otherwise whichever host backend is enabled (OpenMP,
+  // then pthreads); otherwise Serial.
+#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef Kokkos::OpenMP   execution_space ;
+#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Kokkos::Threads  execution_space ;
+#elif defined( KOKKOS_HAVE_OPENMP )
+  typedef Kokkos::OpenMP   execution_space ;
+#elif defined( KOKKOS_HAVE_PTHREAD )
+  typedef Kokkos::Threads  execution_space ;
+#else
+  typedef Kokkos::Serial   execution_space ;
+#endif
+
+  /** \brief  Allocate a contiguous block of memory on the host
+   *          with size = scalar_size * scalar_count.
+   *
+   *  The input label is associated with the block of memory.
+   *  The block of memory is tracked via reference counting where
+   *  allocation gives it a reference count of one.
+   *
+   *  Allocation may only occur on the master thread of the process.
+   */
+  static void * allocate( const std::string    & label ,
+                          const std::type_info & scalar_type ,
+                          const size_t           scalar_size ,
+                          const size_t           scalar_count );
+
+  /** \brief  Increment the reference count of the block of memory
+   *          in which the input pointer resides.
+   *
+   *          Reference counting only occurs on the master thread.
+   */
+  static void increment( const void * );
+
+  /** \brief  Decrement the reference count of the block of memory
+   *          in which the input pointer resides.  If the reference
+   *          count falls to zero the memory is deallocated.
+   *
+   *          Reference counting only occurs on the master thread.
+   */
+  static void decrement( const void * );
+
+  /*--------------------------------*/
+
+  /** \brief  Print all tracked memory to the output stream. */
+  static void print_memory_view( std::ostream & );
+
+  /** \brief  Retrieve label associated with the input pointer */
+  static std::string query_label( const void * );
+
+  /*--------------------------------*/
+  /* Functions unique to the HostSpace */
+
+  /** \brief  Query whether the caller is inside a parallel region.
+   *
+   *  NOTE(review): presumably answered by the callback installed through
+   *  register_in_parallel() -- confirm against the implementation.
+   */
+  static int in_parallel();
+
+  /** \brief  Register a callback of type int(*)().
+   *
+   *  NOTE(review): presumably a host execution space installs its own
+   *  "am I in parallel" query here -- confirm against the implementation.
+   */
+  static void register_in_parallel( int (*)() );
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Check whether code running in ExecutionSpace may access data
+ *          residing in DataSpace.
+ *
+ *  The primary template is only declared, so a combination without a
+ *  specialization cannot be instantiated.
+ *
+ *  NOTE(review): the Cuda headers specialize a differently named template,
+ *  Impl::VerifyExecutionCanAccessMemorySpace.  Check whether this
+ *  Kokkos-namespace template is still referenced anywhere.
+ */
+template< class ExecutionSpace , class DataSpace >
+struct VerifyExecutionSpaceCanAccessDataSpace ;
+
+/** Host code accessing host data: both verify() overloads are no-ops. */
+template<>
+struct VerifyExecutionSpaceCanAccessDataSpace< HostSpace , HostSpace >
+{
+  inline static void verify(void) {}
+  inline static void verify(const void *) {}
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Copy operation between two memory spaces, selected by the two
+ *          template arguments.  Only declared here; each supported pair of
+ *          spaces provides a specialization.
+ *
+ *  NOTE(review): presumably the arguments are <destination, source> --
+ *  confirm against the implementations.
+ */
+template< class , class > struct DeepCopy ;
+
+/** \brief  Host-to-host copy; the work is done by the constructor.
+ *
+ *  NOTE(review): \c n is presumably a byte count -- confirm against the
+ *  implementation.
+ */
+template<>
+struct DeepCopy<HostSpace,HostSpace> {
+  DeepCopy( void * dst , const void * src , size_t n );
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #define KOKKOS_HOSTSPACE_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_Layout.hpp
+++ b/lib/kokkos/core/src/Kokkos_Layout.hpp
@ -0,0 +1,180 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Layout.hpp
+/// \brief Declaration of various \c MemoryLayout options.
+
+#ifndef KOKKOS_LAYOUT_HPP
+#define KOKKOS_LAYOUT_HPP
+
+#include <stddef.h>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+/// \struct LayoutLeft
+/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
+///   striding of multi-indices.
+///
+/// This is an example of a \c MemoryLayout template parameter of
+/// View.  The memory layout describes how View maps from a
+/// multi-index (i0, i1, ..., ik) to a memory location.  
+///
+/// "Layout left" indicates a mapping where the leftmost index i0
+/// refers to contiguous access, and strides increase for dimensions
+/// going right from there (i1, i2, ...).  This layout imitates how
+/// Fortran stores multi-dimensional arrays.  For the special case of
+/// a two-dimensional array, "layout left" is also called "column
+/// major."
+struct LayoutLeft {
+  //! The layout named by this tag is LayoutLeft itself.
+  typedef LayoutLeft        array_layout ;
+  //! Tag type stating what kind of Kokkos object this struct is.
+  typedef Impl::LayoutTag   kokkos_tag ;
+};
+
+//----------------------------------------------------------------------------
+/// \struct LayoutRight
+/// \brief Memory layout tag indicating right-to-left (C or
+///   lexicographical scheme) striding of multi-indices.
+///
+/// This is an example of a \c MemoryLayout template parameter of
+/// View.  The memory layout describes how View maps from a
+/// multi-index (i0, i1, ..., ik) to a memory location.  
+///
+/// "Right layout" indicates a mapping where the rightmost index ik
+/// refers to contiguous access, and strides increase for dimensions
+/// going left from there.  This layout imitates how C stores
+/// multi-dimensional arrays.  For the special case of a
+/// two-dimensional array, "layout right" is also called "row major."
+struct LayoutRight {
+  //! The layout named by this tag is LayoutRight itself.
+  typedef LayoutRight       array_layout ;
+  //! Tag type stating what kind of Kokkos object this struct is.
+  typedef Impl::LayoutTag   kokkos_tag ;
+};
+
+//----------------------------------------------------------------------------
+/// \struct LayoutStride
+/// \brief  Memory layout tag indicating arbitrarily strided
+///         multi-index mapping into contiguous memory.
+struct LayoutStride {
+
+  //! The tag (what type of kokkos_object is this).
+  typedef Impl::LayoutTag       kokkos_tag ;
+
+  typedef LayoutStride array_layout ;
+
+  //! Capacity of the dimension[] and stride[] arrays.
+  enum { MAX_RANK = 8 };
+
+  size_t dimension[ MAX_RANK ] ;
+  size_t stride[ MAX_RANK ] ;
+
+  /** \brief  Compute strides from ordered dimensions.
+   *
+   *  Values of order uniquely form the set [0..rank)
+   *  and specify ordering of the dimensions, from the
+   *  fastest varying (stride 1) to the slowest varying.
+   *  Order = {0,1,2,...} is LayoutLeft
+   *  Order = {...,2,1,0} is LayoutRight
+   *
+   *  \param rank   Number of entries read from order and dimen.
+   *  \param order  order[r] is the dimension index that receives the
+   *                r-th smallest stride.
+   *  \param dimen  dimen[k] is the extent of dimension k.
+   *
+   *  \return  A layout with dimension[k] = dimen[k] and stride[k] set for
+   *           every k in [0..rank); all remaining entries are zero.
+   *           If rank is outside [0..MAX_RANK] or order is not a
+   *           permutation of [0..rank), every entry of the returned
+   *           layout is zero.
+   */
+  template< typename iTypeOrder , typename iTypeDimen >
+  KOKKOS_INLINE_FUNCTION static
+  LayoutStride order_dimensions( int const rank
+                               , iTypeOrder const * const order
+                               , iTypeDimen const * const dimen )
+    {
+      LayoutStride tmp ;
+
+      for ( int r = 0 ; r < MAX_RANK ; ++r ) {
+        tmp.dimension[r] = 0 ;
+        tmp.stride[r]    = 0 ;
+      }
+
+      // A rank that does not fit the fixed-size arrays is invalid input;
+      // it must not reach the loops below, which index by rank.
+      bool valid = ( 0 <= rank ) && ( rank <= int( MAX_RANK ) );
+
+      // Verify valid rank order: one bit per dimension index in [0..rank),
+      // cleared as each index is seen.  Only the first 'rank' entries of
+      // order are read, and each entry is range checked before it is used
+      // as a shift count.
+      int unseen = valid ? int( 1 << rank ) - 1 : 0 ;
+
+      for ( int r = 0 ; valid && r < rank ; ++r ) {
+        const int k = int( order[r] );
+        if ( k < 0 || rank <= k ) {
+          valid = false ;
+        }
+        else {
+          unseen &= ~int( 1 << k );
+        }
+      }
+
+      if ( valid && 0 == unseen ) {
+        size_t n = 1 ;
+        for ( int r = 0 ; r < rank ; ++r ) {
+          const int k = int( order[r] );
+          tmp.stride[k]    = n ;
+          tmp.dimension[k] = dimen[k] ;
+          // The next stride is scaled by the extent of the dimension that
+          // was just placed, not by the extent at position r.
+          n *= tmp.dimension[k] ;
+        }
+      }
+      return tmp ;
+    }
+};
+
+//----------------------------------------------------------------------------
+/// \struct LayoutTileLeft
+/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
+///   striding of multi-indices by tiles.
+///
+/// This is an example of a \c MemoryLayout template parameter of
+/// View.  The memory layout describes how View maps from a
+/// multi-index (i0, i1, ..., ik) to a memory location.  
+///
+/// "Tiled layout" indicates a mapping to contiguously stored
+/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two
+/// dimensions.  Indices are LayoutLeft within each tile, and the
+/// tiles themselves are arranged using LayoutLeft.  Note that the
+/// dimensions <tt>ArgN0</tt> and <tt>ArgN1</tt> of the tiles must be
+/// compile-time constants.  This speeds up index calculations.  If
+/// both tile dimensions are powers of two, Kokkos can optimize
+/// further.
+template < unsigned ArgN0
+         , unsigned ArgN1
+         , bool IsPowerOfTwo =
+             ( Impl::is_power_of_two<ArgN0>::value &&
+               Impl::is_power_of_two<ArgN1>::value )
+         >
+struct LayoutTileLeft {
+  //! Tile extents, taken directly from the template arguments.
+  enum { N0 = ArgN0 };
+  enum { N1 = ArgN1 };
+
+  //! Tag type stating what kind of Kokkos object this struct is.
+  typedef Impl::LayoutTag  kokkos_tag ;
+
+  //! This exact instantiation, spelled with the injected class name.
+  typedef LayoutTileLeft   array_layout ;
+};
+
+} // namespace Kokkos
+
+#endif // #ifndef KOKKOS_LAYOUT_HPP
+
--- a/lib/kokkos/core/src/Kokkos_Macros.hpp
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@ -0,0 +1,231 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MACROS_HPP
+#define KOKKOS_MACROS_HPP
+
+//----------------------------------------------------------------------------
+/** Pick up configure/build options via #define macros:
+ *
+ *  KOKKOS_HAVE_CUDA                Kokkos::Cuda execution and memory spaces
+ *  KOKKOS_HAVE_PTHREAD             Kokkos::Threads execution space
+ *  KOKKOS_HAVE_QTHREAD             Kokkos::Qthread execution space
+ *  KOKKOS_HAVE_OPENMP              Kokkos::OpenMP  execution space
+ *  KOKKOS_HAVE_HWLOC               HWLOC library is available
+ *  KOKKOS_HAVE_EXPRESSION_CHECK    insert array bounds checks, is expensive!
+ *  KOKKOS_HAVE_CXX11               enable C++11 features
+ *
+ *  KOKKOS_HAVE_MPI                 negotiate MPI/execution space interactions
+ *
+ *  KOKKOS_USE_CUDA_UVM             Use CUDA UVM for Cuda memory space
+ */
+
+#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
+#include <KokkosCore_config.h>
+#endif
+
+//----------------------------------------------------------------------------
+/** Pick up compiler specific #define macros:
+ *
+ *  Macros for known compilers evaluate to an integral version value
+ *
+ *  KOKKOS_COMPILER_NVCC
+ *  KOKKOS_COMPILER_GNU
+ *  KOKKOS_COMPILER_INTEL
+ *  KOKKOS_COMPILER_IBM
+ *  KOKKOS_COMPILER_CRAYC
+ *  KOKKOS_COMPILER_APPLECC
+ *  KOKKOS_COMPILER_CLANG
+ *  KOKKOS_COMPILER_PGI
+ *
+ *  Macros for which compiler extension to use for atomics on intrinsic types
+ *
+ *  KOKKOS_ATOMICS_USE_CUDA
+ *  KOKKOS_ATOMICS_USE_GNU
+ *  KOKKOS_ATOMICS_USE_INTEL
+ *  KOKKOS_ATOMICS_USE_OPENMP31
+ *
+ *  A suite of 'KOKKOS_HAVE_PRAGMA_...' are defined for internal use.
+ *
+ *  Macros for marking functions to run in an execution space:
+ *
+ *  KOKKOS_FUNCTION
+ *  KOKKOS_INLINE_FUNCTION        request compiler to inline
+ *  KOKKOS_FORCEINLINE_FUNCTION   force compiler to inline, use with care!
+ */
+
+#include <impl/Kokkos_Compiler_Macros.hpp>
+
+/** Define function marking macros if compiler specific macros are undefined: */
+
+// Fallback: when no earlier definition exists, forced inlining is
+// requested with plain 'inline'.
+#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+#define KOKKOS_FORCEINLINE_FUNCTION  inline
+#endif
+
+// Fallback: plain 'inline'.
+#if ! defined( KOKKOS_INLINE_FUNCTION )
+#define KOKKOS_INLINE_FUNCTION  inline
+#endif
+
+// Fallback: no function marking at all.
+#if ! defined( KOKKOS_FUNCTION )
+#define KOKKOS_FUNCTION /**/
+#endif
+
+/** These should be part of the Atomics API */
+
+// Each prefetch macro has its own guard, so that an earlier definition of
+// only one of them neither leaves the other undefined nor gets redefined.
+// The fallbacks expand to a no-op expression.
+#if ! defined( KOKKOS_NONTEMPORAL_PREFETCH_LOAD )
+    #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0)
+#endif
+
+#if ! defined( KOKKOS_NONTEMPORAL_PREFETCH_STORE )
+    #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0)
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Non-macro forward declaration placement in this file to be reconsidered...
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Forward declarations for enabled execution and memory spaces.
+
+namespace Kokkos {
+
+// Declared unconditionally.
+
+/// Memory space for Serial, Threads, and OpenMP
+class HostSpace ;
+/// Execution space for serial on CPU
+class Serial ;
+/// Pthreads execution space
+class Threads ;
+
+// Declared only when the matching build option is defined.
+
+#ifdef KOKKOS_HAVE_CUDA
+/// Cuda execution space
+class Cuda ;
+/// Cuda memory space
+class CudaSpace ;
+#endif
+
+#ifdef KOKKOS_HAVE_OPENMP
+/// OpenMP execution space
+class OpenMP ;
+#endif
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Set the default execution space.
+
+/// Define Kokkos::DefaultExecutionSpace as per configuration option
+/// or chosen from the enabled execution spaces in the following order:
+/// Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
+
+namespace Kokkos {
+
+// Step 1: at most one default may be requested explicitly.
+#if 1 < ( defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) + \
+          defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) + \
+          defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) + \
+          defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) )
+
+#error "More than one KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* specified" ;
+
+#endif
+
+// Step 2: with no explicit request, define the macro for the first
+// enabled execution space in the order Cuda, OpenMP, Threads, Serial.
+#if ! defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) && \
+    ! defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) && \
+    ! defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) && \
+    ! defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
+  #if   defined( KOKKOS_HAVE_CUDA )
+    #define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA
+  #elif defined( KOKKOS_HAVE_OPENMP )
+    #define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP
+  #elif defined( KOKKOS_HAVE_PTHREAD )
+    #define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS
+  #else
+    #define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL
+  #endif
+#endif
+
+// Step 3: map the selected macro onto the typedef.
+#if   defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA )
+  typedef Kokkos::Cuda    DefaultExecutionSpace ;
+#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef Kokkos::OpenMP  DefaultExecutionSpace ;
+#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Kokkos::Threads DefaultExecutionSpace ;
+#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
+  typedef Kokkos::Serial  DefaultExecutionSpace ;
+#endif
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// CudaSpace is selected only when both __CUDACC__ and __CUDA_ARCH__ are
+// defined; every other compilation selects HostSpace.  A marker macro
+// records which branch was taken.
+#if ! ( defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) )
+#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+typedef Kokkos::HostSpace  ActiveExecutionMemorySpace ;
+#else
+#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
+typedef Kokkos::CudaSpace  ActiveExecutionMemorySpace ;
+#endif
+
+/** \brief  Primary template: an empty struct with no verify() member. */
+template< class ActiveSpace , class MemorySpace >
+struct VerifyExecutionCanAccessMemorySpace {};
+
+/** \brief  Both arguments name the same space: the verify() overloads
+ *          exist and have empty bodies.
+ */
+template< class Space >
+struct VerifyExecutionCanAccessMemorySpace< Space , Space >
+{
+  KOKKOS_INLINE_FUNCTION static void verify() {}
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+// Both macros expand to a call of
+//   VerifyExecutionCanAccessMemorySpace< ActiveExecutionMemorySpace , DATA_SPACE >::verify
+// where ActiveExecutionMemorySpace is the space selected for the current
+// compilation.
+
+// Form that passes DATA_PTR to verify().
+#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
+    < Kokkos::Impl::ActiveExecutionMemorySpace \
+    , DATA_SPACE \
+    >::verify( DATA_PTR )
+
+// Form that calls verify() without an argument.
+#define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
+    < Kokkos::Impl::ActiveExecutionMemorySpace \
+    , DATA_SPACE \
+    >::verify()
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_MACROS_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
@ -0,0 +1,119 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MEMORYTRAITS_HPP
+#define KOKKOS_MEMORYTRAITS_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Memory access traits for views, an extension point.
+ *
+ *  These traits should be orthogonal.  If there are dependencies then
+ *  the MemoryTraits template must detect and enforce dependencies.
+ *
+ *  A zero value is the default for a View, indicating that none of
+ *  these traits are present.
+ */
+enum MemoryTraitsFlags {
+  // One distinct bit per trait, so values can be combined with '|'.
+  Unmanaged    = 1 << 0 ,  // 0x01
+  RandomAccess = 1 << 1 ,  // 0x02
+  Atomic       = 1 << 2    // 0x04
+};
+
+template < unsigned T >
+struct MemoryTraits {
+  //! This instantiation itself.
+  typedef MemoryTraits<T> memory_traits ;
+
+  //! Tag type stating what kind of Kokkos object this struct is.
+  typedef Impl::MemoryTraitsTag  kokkos_tag ;
+
+  // Each member is T masked with the matching namespace-level flag:
+  // zero when the trait is absent, the flag's own bit value when present.
+  enum { Unmanaged    = unsigned(Kokkos::Unmanaged)    & T };
+  enum { RandomAccess = unsigned(Kokkos::RandomAccess) & T };
+  enum { Atomic       = unsigned(Kokkos::Atomic)       & T };
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//! No trait flags set.
+typedef MemoryTraits< 0 > MemoryManaged ;
+//! Only the Unmanaged flag set.
+typedef MemoryTraits< Unmanaged > MemoryUnmanaged ;
+//! Both the Unmanaged and the RandomAccess flags set.
+typedef MemoryTraits< Unmanaged | RandomAccess > MemoryRandomAccess ;
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief Memory alignment settings
+ *
+ *  Sets global value for memory alignment.  Must be a power of two!
+ *  Enable compatibility of views from different devices with static stride.
+ *  Use compiler flag to enable overwrites.
+ */
+enum {
+  // KOKKOS_MEMORY_ALIGNMENT, when defined, replaces the built-in 128.
+  // Either way the value is rebuilt as 1 << power_of_two< N >::value.
+#if defined( KOKKOS_MEMORY_ALIGNMENT )
+  MEMORY_ALIGNMENT =
+    ( 1 << Kokkos::Impl::power_of_two< KOKKOS_MEMORY_ALIGNMENT >::value ) ,
+#else
+  MEMORY_ALIGNMENT =
+    ( 1 << Kokkos::Impl::power_of_two< 128 >::value ) ,
+#endif
+  MEMORY_ALIGNMENT_THRESHOLD = 4
+};
+
+
+} //namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_MEMORYTRAITS_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
@ -0,0 +1,178 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMP_HPP
+#define KOKKOS_OPENMP_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP )
+
+#include <omp.h>
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/// \class OpenMP
+/// \brief Kokkos device for multicore processors in the host memory space.
+class OpenMP {
+public:
+  //------------------------------------
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! Tag type stating what kind of Kokkos object this class is.
+  typedef Impl::ExecutionSpaceTag  kokkos_tag ;
+
+  // Aliases that refer back to this class.
+  typedef OpenMP                   execution_space ;
+  typedef OpenMP                   device_type ;
+  typedef OpenMP                   host_mirror_device_type ;
+
+  // Memory space and the size type it provides.
+  typedef HostSpace                memory_space ;
+  typedef HostSpace::size_type     size_type ;
+
+  typedef LayoutRight              array_layout ;
+
+  typedef ScratchMemorySpace< OpenMP > scratch_memory_space ;
+
+  //@}
+  //------------------------------------
+  //! \name Functions that all Kokkos devices must implement.
+  //@{
+
+  /// \brief True when omp_in_parallel() returns a non-zero value.
+  static inline bool in_parallel() { return 0 != omp_in_parallel() ; }
+
+  /// \brief Put the device into a "sleep" state. A noop for OpenMP.
+  static bool sleep();
+
+  /// \brief Bring the device back from the "sleep" state. A noop for OpenMP.
+  static bool wake();
+
+  /// \brief Wait for all dispatched functors to complete.
+  ///        The body is empty: a noop for OpenMP.
+  static void fence() {}
+
+  /// \brief Write configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  /// \brief Release any resources held by the device.
+  static void finalize();
+
+  /** \brief  Initialize the device.
+   *
+   *  1) When the hardware locality library is enabled and OpenMP has
+   *     not already bound its threads, bind them to maximize core
+   *     utilization and to group for memory hierarchy locality.
+   *
+   *  2) Allocate a HostThread per OpenMP thread to hold its topology
+   *     and fan in/out data.
+   *
+   *  Every argument defaults to zero.
+   */
+  static void initialize( unsigned thread_count = 0 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 );
+
+  static int is_initialized();
+  //@}
+  //------------------------------------
+  /** \brief  Query the topological thread pool of this execution space.
+   *
+   *  All threads within a pool share a memory space for which they are cache coherent.
+   *    depth = 0  number of threads in the whole pool.
+   *    depth = 1  number of threads in a NUMA region, typically sharing L3 cache.
+   *    depth = 2  number of threads at the finest granularity, typically sharing L1 cache.
+   */
+  inline static int thread_pool_size( int depth = 0 );
+
+  /** \brief  Rank of the calling thread within this thread pool. */
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
+
+  //------------------------------------
+  // Forwarders to thread_pool_size() / thread_pool_rank(); the int result
+  // is converted to unsigned.
+
+  static inline unsigned max_hardware_threads()
+    { return unsigned( thread_pool_size( 0 ) ); }
+
+  static inline unsigned team_max()
+    { return unsigned( thread_pool_size( 1 ) ); }
+
+  static inline unsigned team_recommended()
+    { return unsigned( thread_pool_size( 2 ) ); }
+
+  KOKKOS_INLINE_FUNCTION static
+  unsigned hardware_thread_id()
+    { return unsigned( thread_pool_rank() ); }
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Specialization pairing OpenMP's memory space with OpenMP's
+ *          scratch memory space.  Both verify() overloads have empty
+ *          bodies, so calling either one does nothing.
+ */
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::OpenMP::memory_space
+  , Kokkos::OpenMP::scratch_memory_space
+  >
+{
+  static inline void verify() { }
+  static inline void verify( const void * ) { }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <OpenMP/Kokkos_OpenMPexec.hpp>
+#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP ) */
+#endif /* #ifndef KOKKOS_OPENMP_HPP */
+
+
--- a/lib/kokkos/core/src/Kokkos_Pair.hpp
+++ b/lib/kokkos/core/src/Kokkos_Pair.hpp
@ -0,0 +1,457 @@
+/// \file Kokkos_Pair.hpp
+/// \brief Declaration and definition of Kokkos::pair.
+///
+/// This header file declares and defines Kokkos::pair and its related
+/// nonmember functions.
+
+#ifndef KOKKOS_PAIR_HPP
+#define KOKKOS_PAIR_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <utility>
+
+namespace Kokkos {
+/// \struct pair
+/// \brief Replacement for std::pair that works on CUDA devices.
+///
+/// The instance methods of std::pair, including its constructors, are
+/// not marked as <tt>__device__</tt> functions.  Thus, they cannot be
+/// called on a CUDA device, such as an NVIDIA GPU.  This struct
+/// implements the same interface as std::pair, but can be used on a
+/// CUDA device as well as on the host.
+template <class T1, class T2>
+struct pair
+{
+  //! The first template parameter of this class.
+  typedef T1 first_type;
+  //! The second template parameter of this class.
+  typedef T2 second_type;
+
+  //! The first element of the pair.
+  first_type  first;
+  //! The second element of the pair.
+  second_type second;
+
+  /// \brief Default constructor.
+  ///
+  /// Value-initializes both elements (<tt>first()</tt>, <tt>second()</tt>).
+  /// It won't compile if the default constructors of T1 and T2 are
+  /// not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair()
+    : first(), second()
+  {}
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(first_type const& f, second_type const& s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Converting constructor from a Kokkos::pair<U,V>.
+  ///
+  /// Initializes \c first from \c p.first and \c second from
+  /// \c p.second.  It won't compile unless T1 can be constructed
+  /// from U and T2 from V.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Converting assignment from a Kokkos::pair<U,V>.
+  ///
+  /// This calls the assignment operators of T1 and T2.  It won't
+  /// compile if the assignment operators are not defined and public.
+  ///
+  /// \return A reference to this pair.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<T1, T2> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Converting constructor from an std::pair<U,V>.
+  ///
+  /// Unlike the constructors above, this one carries no
+  /// KOKKOS_FORCEINLINE_FUNCTION annotation.
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+/// \brief Specialization of Kokkos::pair whose two elements are both
+///   references.  It declares no default constructor.
+template <class T1, class T2>
+struct pair<T1&, T2&>
+{
+  //! Reference to the first template parameter of this class.
+  typedef T1& first_type;
+  //! Reference to the second template parameter of this class.
+  typedef T2& second_type;
+
+  //! The first element of the pair (a reference).
+  first_type  first;
+  //! The second element of the pair (a reference).
+  second_type second;
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// Binds \c first to the object \c f refers to and \c second to the
+  /// object \c s refers to.  Nothing is copied.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(first_type f, second_type s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Converting constructor from a Kokkos::pair<U,V>.
+  ///
+  /// Binds the reference members to \c p.first and \c p.second.  It
+  /// won't compile unless <tt>T1&</tt> and <tt>T2&</tt> can bind to
+  /// those members of a const <tt>pair<U,V></tt>.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Converting constructor from an std::pair<U,V>.
+  ///
+  /// Binds the reference members to \c p.first and \c p.second.  This
+  /// constructor carries no KOKKOS_FORCEINLINE_FUNCTION annotation.
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Converting assignment from a Kokkos::pair<U,V>.
+  ///
+  /// Assigns \c p.first and \c p.second <i>through</i> the reference
+  /// members, i.e. to the referenced objects; the references are not
+  /// rebound.  It won't compile if the assignment operators of T1 and
+  /// T2 are not defined and public.
+  ///
+  /// \return A reference to this pair.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<first_type, second_type> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// The result is an <tt>std::pair<T1,T2></tt> of values, so it holds
+  /// copies of the referenced objects.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+/// \brief Specialization of Kokkos::pair whose first element is a value
+///   and whose second element is a reference.  It declares no default
+///   constructor.
+template <class T1, class T2>
+struct pair<T1, T2&>
+{
+  //! The first template parameter of this class.
+  typedef T1  first_type;
+  //! Reference to the second template parameter of this class.
+  typedef T2& second_type;
+
+  //! The first element of the pair (a value).
+  first_type  first;
+  //! The second element of the pair (a reference).
+  second_type second;
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// Copy-constructs \c first from \c f and binds \c second to the
+  /// object \c s refers to.  It won't compile if the copy constructor
+  /// of T1 is not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(first_type const& f, second_type s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Converting constructor from a Kokkos::pair<U,V>.
+  ///
+  /// Initializes \c first from \c p.first and binds \c second to
+  /// \c p.second.  It won't compile unless T1 can be constructed from
+  /// U and <tt>T2&</tt> can bind to that member of a const
+  /// <tt>pair<U,V></tt>.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Converting constructor from an std::pair<U,V>.
+  ///
+  /// Initializes \c first from \c p.first and binds \c second to
+  /// \c p.second.  This constructor carries no
+  /// KOKKOS_FORCEINLINE_FUNCTION annotation.
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Converting assignment from a Kokkos::pair<U,V>.
+  ///
+  /// Assigns \c p.first to \c first, and assigns \c p.second
+  /// <i>through</i> the reference \c second (the reference is not
+  /// rebound).  It won't compile if the assignment operators of T1
+  /// and T2 are not defined and public.
+  ///
+  /// \return A reference to this pair.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<first_type, second_type> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// The result is an <tt>std::pair<T1,T2></tt> of values, so it holds
+  /// a copy of the object \c second refers to.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+/// \brief Specialization of Kokkos::pair whose first element is a
+///   reference and whose second element is a value.  It declares no
+///   default constructor.
+template <class T1, class T2>
+struct pair<T1&, T2>
+{
+  //! Reference to the first template parameter of this class.
+  typedef T1&  first_type;
+  //! The second template parameter of this class.
+  typedef T2 second_type;
+
+  //! The first element of the pair (a reference).
+  first_type  first;
+  //! The second element of the pair (a value).
+  second_type second;
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// Binds \c first to the object \c f refers to and copy-constructs
+  /// \c second from \c s.  It won't compile if the copy constructor
+  /// of T2 is not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(first_type f, second_type const& s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Converting constructor from a Kokkos::pair<U,V>.
+  ///
+  /// Binds \c first to \c p.first and initializes \c second from
+  /// \c p.second.  It won't compile unless <tt>T1&</tt> can bind to
+  /// that member of a const <tt>pair<U,V></tt> and T2 can be
+  /// constructed from V.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Converting constructor from an std::pair<U,V>.
+  ///
+  /// Binds \c first to \c p.first and initializes \c second from
+  /// \c p.second.  This constructor carries no
+  /// KOKKOS_FORCEINLINE_FUNCTION annotation.
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Converting assignment from a Kokkos::pair<U,V>.
+  ///
+  /// Assigns \c p.first <i>through</i> the reference \c first (the
+  /// reference is not rebound), and assigns \c p.second to \c second.
+  /// It won't compile if the assignment operators of T1 and T2 are
+  /// not defined and public.
+  ///
+  /// \return A reference to this pair.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<first_type, second_type> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// The result is an <tt>std::pair<T1,T2></tt> of values, so it holds
+  /// a copy of the object \c first refers to.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+/// \brief Equality comparison for Kokkos::pair.
+///
+/// Two pairs are equal when their \c first members compare equal and
+/// their \c second members compare equal.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator== (const pair<T1,T2>& a, const pair<T1,T2>& b)
+{
+  return (a.first == b.first) && (a.second == b.second);
+}
+
+/// \brief Inequality comparison for Kokkos::pair.
+///
+/// Defined as the negation of <tt>operator==</tt>.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator!= (const pair<T1,T2>& a, const pair<T1,T2>& b)
+{
+  const bool same = (a == b);
+  return !same;
+}
+
+/// \brief Less-than comparison for Kokkos::pair.
+///
+/// Lexicographic order: \c first decides unless neither \c first is
+/// less than the other, in which case \c second decides.  Only
+/// <tt>operator<</tt> of the element types is used.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator<  (const pair<T1,T2>& a, const pair<T1,T2>& b)
+{
+  if (a.first < b.first) { return true; }
+  if (b.first < a.first) { return false; }
+  return a.second < b.second;
+}
+
+/// \brief Less-than-or-equal-to comparison for Kokkos::pair.
+///
+/// Defined as "not (rhs < lhs)".
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator<= (const pair<T1,T2>& a, const pair<T1,T2>& b)
+{
+  const bool b_before_a = (b < a);
+  return !b_before_a;
+}
+
+/// \brief Greater-than comparison for Kokkos::pair.
+///
+/// Defined as <tt>operator<</tt> with the arguments swapped.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator>  (const pair<T1,T2>& a, const pair<T1,T2>& b)
+{
+  return (b < a);
+}
+
+/// \brief Greater-than-or-equal-to comparison for Kokkos::pair.
+///
+/// Defined as "not (lhs < rhs)".
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator>= (const pair<T1,T2>& a, const pair<T1,T2>& b)
+{
+  const bool a_before_b = (a < b);
+  return !a_before_b;
+}
+
+/// \brief Build a Kokkos::pair from two values.
+///
+/// A "nonmember constructor" for Kokkos::pair, analogous to
+/// std::make_pair.  Both arguments are taken by value and the result
+/// is a <tt>pair<T1,T2></tt> constructed from them.
+template <class T1,class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+pair<T1,T2> make_pair (T1 x, T2 y)
+{
+  typedef pair<T1,T2> result_type;
+  return result_type(x, y);
+}
+
+/// \brief Return a pair of references to the input arguments.
+///
+/// This compares to std::tie (new in C++11).  You can use it to
+/// assign to two variables at once, from the result of a function
+/// that returns a pair.  For example (<tt>__device__</tt> and
+/// <tt>__host__</tt> attributes omitted for brevity):
+/// \code
+/// // Declaration of the function to call.
+/// // First return value: operation count.
+/// // Second return value: whether all operations succeeded.
+/// Kokkos::pair<int, bool> someFunction ();
+///
+/// // Code that uses Kokkos::tie.
+/// int myFunction () {
+///   int count = 0;
+///   bool success = false;
+///
+///   // This assigns to both count and success.
+///   Kokkos::tie (count, success) = someFunction ();
+///
+///   if (! success) {
+///     // ... Some operation failed;
+///     //     take corrective action ...
+///   }
+///   return count;
+/// }
+/// \endcode
+///
+/// The line that uses tie() could have been written like this:
+/// \code
+///   Kokkos::pair<int, bool> result = someFunction ();
+///   count = result.first;
+///   success = result.second;
+/// \endcode
+///
+/// Using tie() saves two lines of code and avoids a copy of each
+/// element of the pair.  The latter could be significant if one or
+/// both elements of the pair are more substantial objects than \c int
+/// or \c bool.
+template <class T1,class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+pair<T1 &,T2 &> tie (T1 & x, T2 & y)
+{
+  // The returned pair holds references to x and y, not copies.
+  typedef pair<T1 &,T2 &> reference_pair;
+  return reference_pair(x, y);
+}
+
+//
+// Specialization of Kokkos::pair for a \c void second argument.  This
+// is not actually a "pair"; it only contains one element, the first.
+//
+template <class T1>
+struct pair<T1,void>
+{
+  //! The first template parameter of this class.
+  typedef T1 first_type;
+  //! Always \c void for this specialization.
+  typedef void second_type;
+
+  //! The only stored element.
+  first_type  first;
+  //! Placeholder so that the name \c second exists; it is the constant 0.
+  enum { second = 0 };
+
+  /// \brief Default constructor; value-initializes \c first.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair()
+    : first()
+  {}
+
+  /// \brief Construct from the first element only.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(const first_type & f)
+    : first(f)
+  {}
+
+  /// \brief Two-argument form; the unnamed \c int argument is ignored.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(const first_type & f, int)
+    : first(f)
+  {}
+
+  /// \brief Converting constructor from a <tt>pair<U,void></tt>.
+  ///
+  /// Initializes \c first from \c p.first.
+  template <class U>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,void> &p)
+    : first(p.first)
+  {}
+
+  /// \brief Converting assignment from a <tt>pair<U,void></tt>.
+  ///
+  /// Assigns \c p.first to \c first.
+  ///
+  /// \return A reference to this pair.
+  template <class U>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<T1, void> & operator=(const pair<U,void> &p)
+  {
+    first = p.first;
+    return *this;
+  }
+};
+
+//
+// Specialization of relational operators for Kokkos::pair<T1,void>.
+//
+
+//! Equality for pair<T1,void>: compares only the \c first members.
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator== (const pair<T1,void>& a, const pair<T1,void>& b)
+{
+  return (a.first == b.first);
+}
+
+//! Inequality for pair<T1,void>: the negation of operator==.
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator!= (const pair<T1,void>& a, const pair<T1,void>& b)
+{
+  const bool same = (a == b);
+  return !same;
+}
+
+//! Less-than for pair<T1,void>: compares only the \c first members.
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator<  (const pair<T1,void>& a, const pair<T1,void>& b)
+{
+  return (a.first < b.first);
+}
+
+//! Less-than-or-equal-to for pair<T1,void>: "not (rhs < lhs)".
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator<= (const pair<T1,void>& a, const pair<T1,void>& b)
+{
+  const bool b_before_a = (b < a);
+  return !b_before_a;
+}
+
+//! Greater-than for pair<T1,void>: operator< with the arguments swapped.
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator>  (const pair<T1,void>& a, const pair<T1,void>& b)
+{
+  return (b < a);
+}
+
+//! Greater-than-or-equal-to for pair<T1,void>: "not (lhs < rhs)".
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator>= (const pair<T1,void>& a, const pair<T1,void>& b)
+{
+  const bool a_before_b = (a < b);
+  return !a_before_b;
+}
+
+} // namespace Kokkos
+
+
+#endif //KOKKOS_PAIR_HPP
--- a/lib/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp
@ -0,0 +1,842 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Parallel.hpp
+/// \brief Declaration of parallel operators
+
+#ifndef KOKKOS_PARALLEL_HPP
+#define KOKKOS_PARALLEL_HPP
+
+#include <cstddef>
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_View.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  Given a Functor and Execution Policy query an execution space.
+ *
+ *  if       the Policy has an execution space use that
+ *  else if  the Functor has a device_type use that
+ *  else     use the default
+ */
+// Primary template: selected when no partial specialization matches.
+// The two Enable* parameters default to void and exist only so that
+// partial specializations can be chosen by SFINAE.
+template< class Functor
+        , class Policy
+        , class EnableFunctor = void
+        , class EnablePolicy  = void
+        >
+struct FunctorPolicyExecutionSpace {
+  // Fallback: the default execution space.
+  typedef Kokkos::DefaultExecutionSpace execution_space ;
+};
+
+// Specialization requiring both Functor::device_type and
+// Policy::execution_space to name types; the policy's execution space
+// is used.
+// NOTE(review): presumably enable_if_type<T>::type is void whenever T is
+// well-formed, so this matches the defaulted Enable* arguments and is
+// more specialized than the two single-condition specializations -- confirm
+// against impl/Kokkos_Traits.hpp.
+template< class Functor , class Policy >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::device_type     >::type
+  , typename enable_if_type< typename Policy ::execution_space >::type
+  >
+{
+  typedef typename Policy ::execution_space execution_space ;
+};
+
+// Specialization requiring only Policy::execution_space to name a type;
+// EnableFunctor is left unconstrained.  The policy's execution space is
+// used.
+template< class Functor , class Policy , class EnableFunctor >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , EnableFunctor
+  , typename enable_if_type< typename Policy::execution_space >::type
+  >
+{
+  typedef typename Policy ::execution_space execution_space ;
+};
+
+// Specialization requiring only Functor::device_type to name a type;
+// EnablePolicy is left unconstrained.  The functor's device_type is used
+// as the execution space.
+template< class Functor , class Policy , class EnablePolicy >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::device_type >::type
+  , EnablePolicy
+  >
+{
+  typedef typename Functor::device_type execution_space ;
+};
+
+//----------------------------------------------------------------------------
+
+// Maps a functor type to its nested value_type.
+// The primary template is declared but not defined here, so the only
+// definition visible in this header is the specialization below.
+template< class FunctorType , class Enable = void >
+struct ReduceAdapterValueType ;
+
+// Specialization used when FunctorType::value_type names a type:
+// exposes it as the nested typedef 'type'.
+template< class FunctorType >
+struct ReduceAdapterValueType< FunctorType , typename enable_if_type< typename FunctorType::value_type >::type >
+{
+  typedef typename FunctorType::value_type type ;
+};
+
+/// \class ReduceAdapter
+/// \brief Implementation detail of parallel_reduce.
+///
+/// This is an implementation detail of parallel_reduce.  Users should
+/// skip this and go directly to the nonmember function parallel_reduce.
+template< class FunctorType ,
+          class ValueType = typename ReduceAdapterValueType< FunctorType >::type >
+struct ReduceAdapter ;
+
+//----------------------------------------------------------------------------
+/// \class ParallelFor
+/// \brief Implementation of the ParallelFor operator that has a
+///   partial specialization for the device.
+///
+/// This is an implementation detail of parallel_for.  Users should
+/// skip this and go directly to the nonmember function parallel_for.
+template< class FunctorType 
+        , class ExecPolicy 
+        , class ExecSpace = typename FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
+        >
+class ParallelFor ;
+
+/// \class ParallelReduce
+/// \brief Implementation detail of parallel_reduce.
+///
+/// This is an implementation detail of parallel_reduce.  Users should
+/// skip this and go directly to the nonmember function parallel_reduce.
+template< class FunctorType 
+        , class ExecPolicy 
+        , class ExecSpace = typename FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
+        >
+class ParallelReduce ;
+
+/// \class ParallelScan
+/// \brief Implementation detail of parallel_scan.
+///
+/// This is an implementation detail of parallel_scan.  Users should
+/// skip this and go directly to the documentation of the nonmember
+/// template function Kokkos::parallel_scan.
+template< class FunctorType 
+        , class ExecPolicy 
+        , class ExecSpace = typename FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
+        >
+class ParallelScan ;
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief Execute \c functor in parallel according to the execution \c policy.
+ *
+ * A "functor" is a class containing the function to execute in parallel,
+ * data needed for that execution, and an optional \c device_type
+ * typedef.  Here is an example functor for parallel_for:
+ *
+ * \code
+ *  class FunctorType {
+ *  public:
+ *    typedef  ...  device_type ;
+ *    void operator() ( WorkType iwork ) const ;
+ *  };
+ * \endcode
+ *
+ * In the above example, \c WorkType is any integer type for which a
+ * valid conversion from \c size_t to \c WorkType exists.  The functor's
+ * <tt>operator()</tt> method defines the operation to parallelize,
+ * over the range of integer indices <tt>iwork=[0,work_count-1]</tt>.
+ * This compares to a single iteration \c iwork of a \c for loop.
+ * If \c device_type is not defined DefaultExecutionSpace will be used.
+ */
+template< class ExecPolicy , class FunctorType >
+inline
+void parallel_for( const ExecPolicy  & policy
+                 , const FunctorType & functor
+                 , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
+                 )
+{
+  // This overload is disabled for integral ExecPolicy types; the
+  // work-count overload covers those.
+  typedef Impl::ParallelFor< FunctorType , ExecPolicy > closure_type ;
+
+  // A temporary closure object is constructed and immediately discarded.
+  (void) closure_type( functor , policy );
+}
+
+/** \brief Execute \c functor over the index range [0,work_count).
+ *
+ *  No policy is supplied, so the execution space is queried with
+ *  \c void in the policy slot and a RangePolicy over that space is built.
+ */
+template< class FunctorType >
+inline
+void parallel_for( const size_t        work_count ,
+                   const FunctorType & functor )
+{
+  typedef typename
+    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      space_type ;
+
+  typedef RangePolicy< space_type >                       range_type ;
+  typedef Impl::ParallelFor< FunctorType , range_type >   closure_type ;
+
+  (void) closure_type( functor , range_type( 0 , work_count ) );
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Parallel reduction
+ *
+ * Example of a parallel_reduce functor for a POD (plain old data) value type:
+ * \code
+ *  class FunctorType { // For POD value type
+ *  public:
+ *    typedef    ...     device_type ;
+ *    typedef <podType>  value_type ;
+ *    void operator()( <intType> iwork , <podType> & update ) const ;
+ *    void init( <podType> & update ) const ;
+ *    void join( volatile       <podType> & update ,
+ *               volatile const <podType> & input ) const ;
+ *
+ *    typedef true_type has_final ;
+ *    void final( <podType> & update ) const ;
+ *  };
+ * \endcode
+ *
+ * Example of a parallel_reduce functor for an array of POD (plain old data) values:
+ * \code
+ *  class FunctorType { // For array of POD value
+ *  public:
+ *    typedef    ...     device_type ;
+ *    typedef <podType>  value_type[] ;
+ *    void operator()( <intType> , <podType> update[] ) const ;
+ *    void init( <podType> update[] ) const ;
+ *    void join( volatile       <podType> update[] ,
+ *               volatile const <podType> input[] ) const ;
+ *
+ *    typedef true_type has_final ;
+ *    void final( <podType> update[] ) const ;
+ *  };
+ * \endcode
+ */
+// general policy, no result output argument
+template< class ExecPolicy , class FunctorType >
+inline
+void parallel_reduce( const ExecPolicy  & policy
+                    , const FunctorType & functor
+                    , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
+                    )
+{
+  typedef Impl::ParallelReduce< FunctorType , ExecPolicy > closure_type ;
+
+  // A temporary closure object is constructed and immediately discarded.
+  (void) closure_type( functor , policy );
+}
+
+// integral range policy, no result output argument
+template< class FunctorType >
+inline
+void parallel_reduce( const size_t        work_count
+                    , const FunctorType & functor
+                    )
+{
+  typedef Kokkos::Impl::ReduceAdapter< FunctorType >  adapter_type ;
+
+  typedef typename
+    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      space_type ;
+
+  typedef RangePolicy< space_type > range_type ;
+
+  // if_c chooses between the adapter's scalar_type and pointer_type
+  // according to StaticValueSize.
+  typedef typename Kokkos::Impl::if_c< adapter_type::StaticValueSize
+                                     , typename adapter_type::scalar_type
+                                     , typename adapter_type::pointer_type
+                                     >::type result_value_type ;
+
+  typedef Kokkos::View< result_value_type
+                      , typename space_type::host_mirror_device_type
+                      , Kokkos::MemoryUnmanaged
+                      > result_view_type ;
+
+  // Default-constructed view: the caller supplied no output location.
+  result_view_type result_view ;
+
+  typedef Impl::ParallelReduce< FunctorType , range_type > closure_type ;
+
+  (void) closure_type( functor , range_type( 0 , work_count ) , result_view );
+}
+
+// general policy and view output
+template< class ExecPolicy , class FunctorType , class ViewType >
+inline
+void parallel_reduce( const ExecPolicy  & policy
+                    , const FunctorType & functor
+                    , const ViewType    & result_view
+                    , typename Impl::enable_if<
+                      ( Impl::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
+                      )>::type * = 0 )
+{
+  // Enabled only when ViewType is a view and ExecPolicy is not integral.
+  typedef Impl::ParallelReduce< FunctorType , ExecPolicy > closure_type ;
+
+  (void) closure_type( functor , policy , result_view );
+}
+
+// general policy and pod or array of pod output
+template< class ExecPolicy , class FunctorType >
+inline
+void parallel_reduce( const ExecPolicy  & policy
+                    , const FunctorType & functor
+                    , typename Impl::enable_if<
+                      ( ! Impl::is_integral< ExecPolicy >::value )
+                      , typename Kokkos::Impl::ReduceAdapter< FunctorType >::reference_type
+                      >::type result_ref )
+{
+  typedef Kokkos::Impl::ReduceAdapter< FunctorType >  adapter_type ;
+
+  typedef typename
+    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
+      space_type ;
+
+  // if_c chooses between the adapter's scalar_type and pointer_type
+  // according to StaticValueSize.
+  typedef typename Kokkos::Impl::if_c< adapter_type::StaticValueSize
+                                     , typename adapter_type::scalar_type
+                                     , typename adapter_type::pointer_type
+                                     >::type result_value_type ;
+
+  typedef Kokkos::View< result_value_type
+                      , typename space_type::host_mirror_device_type
+                      , Kokkos::MemoryUnmanaged
+                      > result_view_type ;
+
+  // The result request is wrapped in an unmanaged view so that the
+  // implementation learns the output's type and memory space.
+  result_view_type result_view( adapter_type::pointer( result_ref )
+                              , adapter_type::value_count( functor )
+                              );
+
+  typedef Impl::ParallelReduce< FunctorType , ExecPolicy > closure_type ;
+
+  (void) closure_type( functor , policy , result_view );
+}
+
+// integral range policy and view output
+template< class FunctorType , class ViewType >
+inline
+void parallel_reduce( const size_t        work_count
+                    , const FunctorType & functor
+                    , const ViewType    & result_view
+                    , typename Impl::enable_if<( Impl::is_view<ViewType>::value )>::type * = 0 )
+{
+  // No policy is supplied, so the execution space is queried with void
+  // in the policy slot.
+  typedef typename
+    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      space_type ;
+
+  typedef RangePolicy< space_type >                          range_type ;
+  typedef Impl::ParallelReduce< FunctorType , range_type >   closure_type ;
+
+  (void) closure_type( functor , range_type( 0 , work_count ) , result_view );
+}
+
+// integral range policy and pod or array of pod output
+template< class FunctorType >
+inline
+void parallel_reduce( const size_t        work_count ,
+                      const FunctorType & functor ,
+                      typename Kokkos::Impl::ReduceAdapter< FunctorType >::reference_type result )
+{
+  typedef Kokkos::Impl::ReduceAdapter< FunctorType >  adapter_type ;
+
+  typedef typename
+    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      space_type ;
+
+  typedef Kokkos::RangePolicy< space_type > range_type ;
+
+  // if_c chooses between the adapter's scalar_type and pointer_type
+  // according to StaticValueSize.
+  typedef typename Kokkos::Impl::if_c< adapter_type::StaticValueSize
+                                     , typename adapter_type::scalar_type
+                                     , typename adapter_type::pointer_type
+                                     >::type result_value_type ;
+
+  typedef Kokkos::View< result_value_type
+                      , typename space_type::host_mirror_device_type
+                      , Kokkos::MemoryUnmanaged
+                      > result_view_type ;
+
+  // The result request is wrapped in an unmanaged view so that the
+  // implementation learns the output's type and memory space.
+  result_view_type result_view( adapter_type::pointer( result )
+                              , adapter_type::value_count( functor )
+                              );
+
+  typedef Impl::ParallelReduce< FunctorType , range_type > closure_type ;
+
+  (void) closure_type( functor , range_type( 0 , work_count ) , result_view );
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/// \fn parallel_scan
+/// \tparam ExecutionPolicy The execution policy type.
+/// \tparam FunctorType     The scan functor type.
+///
+/// \param policy  [in] The execution policy.
+/// \param functor [in] The scan functor.
+///
+/// This function implements a parallel scan pattern.  The scan can
+/// be either inclusive or exclusive, depending on how you implement
+/// the scan functor.
+///
+/// A scan functor looks almost exactly like a reduce functor, except
+/// that its operator() takes a third \c bool argument, \c final_pass,
+/// which indicates whether this is the last pass of the scan
+/// operation.  We will show below how to use the \c final_pass
+/// argument to control whether the scan is inclusive or exclusive.
+///
+/// Here is the minimum required interface of a scan functor for a POD
+/// (plain old data) value type \c PodType.  That is, the result is a
+/// View of zero or more PodType.  It is also possible for the result
+/// to be an array of (same-sized) arrays of PodType, but we do not
+/// show the required interface for that here.
+/// \code
+/// template< class ExecPolicy , class FunctorType >
+/// class ScanFunctor {
+/// public:
+///   // The Kokkos device type
+///   typedef ... device_type;
+///   // Type of an entry of the array containing the result;
+///   // also the type of each of the entries combined using
+///   // operator() or join().
+///   typedef PodType value_type;
+///
+///   void operator () (const typename ExecPolicy::member_type & i, value_type& update, const bool final_pass) const;
+///   void init (value_type& update) const;
+///   void join (volatile value_type& update, volatile const value_type& input) const;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which computes an inclusive plus-scan
+/// of an array of \c int, in place.  If given an array [1, 2, 3, 4], this
+/// scan will overwrite that array with [1, 3, 6, 10].
+///
+/// \code
+/// template<class DeviceType>
+/// class InclScanFunctor {
+/// public:
+///   typedef DeviceType device_type;
+///   typedef int value_type;
+///   typedef typename DeviceType::size_type size_type;
+///
+///   InclScanFunctor (Kokkos::View<value_type*, device_type> x) : x_ (x) {}
+///
+///   void operator () (const size_type i, value_type& update, const bool final_pass) const {
+///     update += x_(i);
+///     if (final_pass) {
+///       x_(i) = update;
+///     }
+///   }
+///   void init (value_type& update) const {
+///     update = 0;
+///   }
+///   void join (volatile value_type& update, volatile const value_type& input) const {
+///     update += input;
+///   }
+///
+/// private:
+///   Kokkos::View<value_type*, device_type> x_;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which computes an <i>exclusive</i>
+/// scan of an array of \c int, in place.  In operator(), note both
+/// that the final_pass test and the update have switched places, and
+/// the use of a temporary.  If given an array [1, 2, 3, 4], this scan
+/// will overwrite that array with [0, 1, 3, 6].
+///
+/// \code
+/// template<class DeviceType>
+/// class ExclScanFunctor {
+/// public:
+///   typedef DeviceType device_type;
+///   typedef int value_type;
+///   typedef typename DeviceType::size_type size_type;
+///
+///   ExclScanFunctor (Kokkos::View<value_type*, device_type> x) : x_ (x) {}
+///
+///   void operator () (const size_type i, value_type& update, const bool final_pass) const {
+///     const value_type x_i = x_(i);
+///     if (final_pass) {
+///       x_(i) = update;
+///     }
+///     update += x_i;
+///   }
+///   void init (value_type& update) const {
+///     update = 0;
+///   }
+///   void join (volatile value_type& update, volatile const value_type& input) const {
+///     update += input;
+///   }
+///
+/// private:
+///   Kokkos::View<value_type*, device_type> x_;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which builds on the above
+/// exclusive scan example, to compute an offsets array from a
+/// population count array, in place.  We assume that the pop count
+/// array has an extra entry at the end to store the final count.  If
+/// given an array [1, 2, 3, 4, 0], this scan will overwrite that
+/// array with [0, 1, 3, 6, 10].
+///
+/// \code
+/// template<class DeviceType>
+/// class OffsetScanFunctor {
+/// public:
+///   typedef DeviceType device_type;
+///   typedef int value_type;
+///   typedef typename DeviceType::size_type size_type;
+///
+///   // last_index_ is the last valid index (zero-based) of x.
+///   // If x has length zero, then last_index_ won't be used anyway.
+///   OffsetScanFunctor (Kokkos::View<value_type*, device_type> x) :
+///     x_ (x), last_index_ (x.dimension_0 () == 0 ? 0 : x.dimension_0 () - 1)
+///   {}
+///
+///   void operator () (const size_type i, value_type& update, const bool final_pass) const {
+///     const value_type x_i = x_(i);
+///     if (final_pass) {
+///       x_(i) = update;
+///     }
+///     update += x_i;
+///     // The last entry of x_ gets the final sum.
+///     if (final_pass && i == last_index_) {
+///       x_(i) = update;
+///     }
+///   }
+///   void init (value_type& update) const {
+///     update = 0;
+///   }
+///   void join (volatile value_type& update, volatile const value_type& input) const {
+///     update += input;
+///   }
+///
+/// private:
+///   Kokkos::View<value_type*, device_type> x_;
+///   const size_type last_index_;
+/// };
+/// \endcode
+///
+template< class ExecutionPolicy , class FunctorType >
+inline
+void parallel_scan( const ExecutionPolicy & policy
+                  , const FunctorType     & functor
+                  , typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0
+                  )
+{
+  // The trailing defaulted parameter removes this overload from consideration
+  // when the first argument is an integral type, so that the work-count
+  // overload is used for integral arguments instead.
+  typedef Impl::ParallelScan< FunctorType , ExecutionPolicy > scan_type ;
+
+  // Constructing the ParallelScan object is the only action taken here;
+  // the object is destroyed again before this function returns.
+  (void) scan_type( functor , policy );
+}
+
+/// \brief Work-count overload of parallel_scan.
+///
+/// Builds a Kokkos::RangePolicy from (0, work_count) on the execution
+/// space that Impl::FunctorPolicyExecutionSpace reports for the functor,
+/// then constructs the matching Impl::ParallelScan object.
+template< class FunctorType >
+inline
+void parallel_scan( const size_t        work_count ,
+                    const FunctorType & functor )
+{
+  // Execution space chosen for this functor type.
+  typedef typename
+    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      exec_space ;
+
+  typedef Kokkos::RangePolicy< exec_space >                range_policy ;
+  typedef Impl::ParallelScan< FunctorType , range_policy >  scan_type ;
+
+  // Constructing the temporary is the only action taken here.
+  (void) scan_type( functor , range_policy( 0 , work_count ) );
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Trait: derives from true_type when the expression '& F::init' is
+// well-formed for the functor type F, and from false_type otherwise.
+template< class F , class Enable = void >
+struct FunctorHasInit : public false_type {};
+
+// Chosen only when taking the address of F::init compiles; the sizeof
+// comparison is always true and exists purely to trigger SFINAE.
+template< class F >
+struct FunctorHasInit< F , typename enable_if< ( sizeof( & F::init ) > 0 ) >::type >
+  : public true_type {};
+
+// Trait: derives from true_type when the expression '& F::join' is
+// well-formed for the functor type F, and from false_type otherwise.
+template< class F , class Enable = void >
+struct FunctorHasJoin : public false_type {};
+
+// Chosen only when taking the address of F::join compiles; the sizeof
+// comparison is always true and exists purely to trigger SFINAE.
+template< class F >
+struct FunctorHasJoin< F , typename enable_if< ( sizeof( & F::join ) > 0 ) >::type >
+  : public true_type {};
+
+// Trait: derives from true_type when the expression '& F::final' is
+// well-formed for the functor type F, and from false_type otherwise.
+template< class F , class Enable = void >
+struct FunctorHasFinal : public false_type {};
+
+// Chosen only when taking the address of F::final compiles; the sizeof
+// comparison is always true and exists purely to trigger SFINAE.
+template< class F >
+struct FunctorHasFinal< F , typename enable_if< ( sizeof( & F::final ) > 0 ) >::type >
+  : public true_type {};
+
+// Query for the amount of team shared memory a functor requests.
+//
+// Primary template: used when the functor exposes neither
+// 'team_shmem_size' nor 'shmem_size'; reports zero bytes.
+template< class FunctorType , class Enable = void >
+struct FunctorTeamShmemSize
+{
+  static inline size_t value( const FunctorType & , int ) { return 0 ; }
+};
+
+// Selected when '& FunctorType::team_shmem_size' is well-formed;
+// forwards the team size to that member and returns its result.
+//
+// The condition is spelled '0 < sizeof(...)', matching FunctorHasInit /
+// FunctorHasJoin / FunctorHasFinal, so that the argument handed to
+// enable_if is already a bool.  Passing the raw sizeof result relies on a
+// size_t-to-bool conversion inside a template argument, which conforming
+// C++11 compilers reject as a narrowing conversion.
+template< class FunctorType >
+struct FunctorTeamShmemSize< FunctorType , typename enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type >
+{
+  static inline size_t value( const FunctorType & f , int team_size ) { return f.team_shmem_size( team_size ) ; }
+};
+
+// Selected when '& FunctorType::shmem_size' is well-formed;
+// forwards the team size to that member and returns its result.
+//
+// NOTE(review): a functor declaring both 'team_shmem_size' and
+// 'shmem_size' matches both partial specializations, which looks
+// ambiguous - confirm that no functor is expected to provide both.
+template< class FunctorType >
+struct FunctorTeamShmemSize< FunctorType , typename enable_if< 0 < sizeof( & FunctorType::shmem_size ) >::type >
+{
+  static inline size_t value( const FunctorType & f , int team_size ) { return f.shmem_size( team_size ) ; }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+#if defined( KOKKOS_HAVE_CXX11 )
+
+// Extracts the second parameter type from a pointer-to-member-function
+// type.  Only declared here; the single partial specialization below
+// supplies the definition.
+template< class MemberFunctionPtr >
+struct ReduceAdapterFunctorOperatorArgType ;
+
+// Matches 'void (F::*)( FirstArg , SecondArg & ) const' and exposes
+// SecondArg (with the reference removed) as 'type'.
+template< class F , class FirstArg , class SecondArg >
+struct ReduceAdapterFunctorOperatorArgType< void ( F::*)( FirstArg , SecondArg & ) const > {
+  typedef SecondArg type ;
+};
+
+// Used when the functor has no 'typedef ... value_type' and C++11 is
+// available: the value type is deduced from the signature of the
+// functor's operator(), via decltype on its address.
+template< class FunctorType , class Enable >
+struct ReduceAdapterValueType {
+private:
+  typedef decltype( & FunctorType::operator() ) call_operator_ptr ;
+public:
+  typedef typename ReduceAdapterFunctorOperatorArgType< call_operator_ptr >::type type ;
+};
+
+#endif
+
+// Adapter presenting a uniform interface over a functor whose reduction
+// value is a single ScalarType object.  Each of join / init / final comes
+// as a pair of overloads: one forwards to the functor's own member when
+// the matching FunctorHas* trait is true, the other supplies a default.
+template< class FunctorType , class ScalarType >
+struct ReduceAdapter
+{
+  // Byte size of the one value, known at compile time.
+  enum { StaticValueSize = sizeof(ScalarType) };
+
+  typedef ScalarType   scalar_type  ;
+  typedef ScalarType * pointer_type  ;
+  typedef ScalarType & reference_type  ;
+
+  // Interpret untyped storage as the value.
+  KOKKOS_INLINE_FUNCTION static
+  reference_type reference( void * p ) { return * static_cast< ScalarType * >( p ); }
+
+  // Interpret untyped storage as an array of values and select entry i.
+  KOKKOS_INLINE_FUNCTION static
+  reference_type reference( void * p , unsigned i ) { return *( static_cast< ScalarType * >( p ) + i ); }
+
+  // Address of a referenced value.
+  KOKKOS_INLINE_FUNCTION static
+  pointer_type pointer( reference_type p ) { return & p ; }
+
+  // Exactly one value, independent of the functor.
+  KOKKOS_INLINE_FUNCTION static
+  unsigned value_count( const FunctorType & ) { return 1 ; }
+
+  // Byte size of that one value.
+  KOKKOS_INLINE_FUNCTION static
+  unsigned value_size( const FunctorType & ) { return sizeof(ScalarType); }
+
+  // Assign the value stored at src to the value stored at dst.
+  KOKKOS_INLINE_FUNCTION static
+  void copy( const FunctorType & , void * const dst , const void * const src )
+    {
+      const scalar_type & from = * static_cast< const scalar_type * >( src );
+      * static_cast< scalar_type * >( dst ) = from ;
+    }
+
+  // join, functor provides 'join': forward to it.
+  template< class F >
+  KOKKOS_INLINE_FUNCTION static
+  void join( const F & f
+           , volatile void * update
+           , typename enable_if< is_same<F,FunctorType>::value &&
+                                 FunctorHasJoin<F>::value
+                               , volatile const void *
+                               >::type input )
+    {
+      volatile ScalarType       & dst = * static_cast< volatile ScalarType * >( update );
+      volatile const ScalarType & src = * static_cast< volatile const ScalarType * >( input );
+      f.join( dst , src );
+    }
+
+  // join, functor has no 'join': default to operator+=.
+  template< class F >
+  KOKKOS_INLINE_FUNCTION static
+  void join( const F & f
+           , volatile void * update
+           , typename enable_if< is_same<F,FunctorType>::value &&
+                                 ! FunctorHasJoin<F>::value
+                               , volatile const void *
+                               >::type input )
+    {
+      volatile ScalarType       & dst = * static_cast< volatile ScalarType * >( update );
+      volatile const ScalarType & src = * static_cast< volatile const ScalarType * >( input );
+      dst += src ;
+    }
+
+  // init, functor provides 'init': forward to it and return the value.
+  template< class F >
+  KOKKOS_INLINE_FUNCTION static
+  reference_type
+  init( const F & f ,
+        typename enable_if< ( is_same<F,FunctorType>::value &&
+                              FunctorHasInit<F>::value )
+                          >::type * p )
+    {
+      ScalarType & value = *((ScalarType *) p );
+      f.init( value );
+      return value ;
+    }
+
+  // init, functor has no 'init': value-initialize in place with
+  // placement new and return the constructed object.
+  template< class F >
+  KOKKOS_INLINE_FUNCTION static
+  reference_type
+  init( const F & ,
+        typename enable_if< ( is_same<F,FunctorType>::value &&
+                              ! FunctorHasInit<F>::value )
+                          >::type * p )
+    {
+      ScalarType * const constructed = new(p) ScalarType();
+      return * constructed ;
+    }
+
+  // final, functor provides 'final': forward to it.
+  template< class F >
+  KOKKOS_INLINE_FUNCTION static
+  void final( const F & f ,
+              typename enable_if< ( is_same<F,FunctorType>::value &&
+                                    FunctorHasFinal<F>::value )
+                                >::type * p )
+    {
+      ScalarType & value = *((ScalarType *) p );
+      f.final( value );
+    }
+
+  // final, functor has no 'final': nothing to do.
+  template< class F >
+  KOKKOS_INLINE_FUNCTION static
+  void final( const F & ,
+              typename enable_if< ( is_same<F,FunctorType>::value &&
+                                    ! FunctorHasFinal<F>::value )
+                                >::type * )
+    {}
+};
+
+// Specialization for a reduction value that is an array of ScalarType
+// whose length is only known at run time, read from the functor's
+// 'value_count' member.  Values are handled through ScalarType pointers.
+template< class FunctorType , class ScalarType >
+struct ReduceAdapter< FunctorType , ScalarType[] >
+{
+  // Zero: the byte size is not a compile-time constant in this case.
+  enum { StaticValueSize = 0 };
+
+  typedef ScalarType   scalar_type  ;
+  typedef ScalarType * pointer_type  ;
+  typedef ScalarType * reference_type  ;
+
+  // Interpret untyped storage as the start of the value array.
+  KOKKOS_INLINE_FUNCTION static
+  ScalarType * reference( void * p ) { return static_cast< ScalarType * >( p ); }
+
+  // Pointer i scalar entries past the start of the storage.
+  KOKKOS_INLINE_FUNCTION static
+  reference_type reference( void * p , unsigned i ) { return static_cast< ScalarType * >( p ) + i ; }
+
+  // The "reference" already is a pointer.
+  KOKKOS_INLINE_FUNCTION static
+  pointer_type pointer( reference_type p ) { return p ; }
+
+  // Number of scalar entries, as reported by the functor.
+  KOKKOS_INLINE_FUNCTION static
+  unsigned value_count( const FunctorType & f ) { return f.value_count ; }
+
+  // Byte size of the whole array.
+  KOKKOS_INLINE_FUNCTION static
+  unsigned value_size( const FunctorType & f ) { return f.value_count * sizeof(ScalarType); }
+
+  // Element-wise assignment of f.value_count entries from src to dst.
+  KOKKOS_INLINE_FUNCTION static
+  void copy( const FunctorType & f , void * const dst , const void * const src )
+    {
+      scalar_type       * const to   = static_cast< scalar_type * >( dst );
+      const scalar_type * const from = static_cast< const scalar_type * >( src );
+      for ( int k = 0 ; k < int(f.value_count) ; ++k ) {
+        to[k] = from[k] ;
+      }
+    }
+
+  // join, functor provides 'join': forward the two array pointers.
+  template< class F >
+  KOKKOS_INLINE_FUNCTION static
+  void join( const F & f
+           , volatile void * const update
+           , typename enable_if< is_same<F,FunctorType>::value &&
+                                 FunctorHasJoin<F>::value
+                               , volatile const void * const
+                               >::type input )
+    {
+      f.join( static_cast< volatile ScalarType * >( update )
+            , static_cast< volatile const ScalarType * >( input ) );
+    }
+
+  // join, functor has no 'join': element-wise operator+=.
+  template< class F >
+  KOKKOS_INLINE_FUNCTION static
+  void join( const F & f
+           , volatile void * const update
+           , typename enable_if< is_same<F,FunctorType>::value &&
+                                 ! FunctorHasJoin<F>::value
+                               , volatile const void * const
+                               >::type input )
+    {
+      volatile ScalarType       * const dst = static_cast< volatile ScalarType * >( update );
+      volatile const ScalarType * const src = static_cast< volatile const ScalarType * >( input );
+      for ( int k = 0 ; k < int(f.value_count) ; ++k ) {
+        dst[k] += src[k] ;
+      }
+    }
+
+
+  // init, functor provides 'init': forward the array pointer and return it.
+  template< class F >
+  KOKKOS_INLINE_FUNCTION static
+  reference_type
+  init( const F & f ,
+        typename enable_if< ( is_same<F,FunctorType>::value &&
+                              FunctorHasInit<F>::value )
+                          >::type * p )
+    {
+      f.init( ((ScalarType *) p ) );
+      return (ScalarType*) p ;
+    }
+
+  // init, functor has no 'init': value-initialize each of the
+  // f.value_count entries in place with placement new.
+  template< class F >
+  KOKKOS_INLINE_FUNCTION static
+  reference_type
+  init( const F & f ,
+        typename enable_if< ( is_same<F,FunctorType>::value &&
+                              ! FunctorHasInit<F>::value )
+                          >::type * p )
+    {
+      ScalarType * const first = (ScalarType*) p ;
+      for ( int k = 0 ; k < int(f.value_count) ; ++k ) {
+        new( first + k ) ScalarType();
+      }
+      return first ;
+    }
+
+  // final, functor provides 'final': forward the array pointer.
+  template< class F >
+  KOKKOS_INLINE_FUNCTION static
+  void final( const F & f ,
+              typename enable_if< ( is_same<F,FunctorType>::value &&
+                                    FunctorHasFinal<F>::value )
+                                >::type * p )
+    {
+      f.final( ((ScalarType *) p ) );
+    }
+
+  // final, functor has no 'final': nothing to do.
+  template< class F >
+  KOKKOS_INLINE_FUNCTION static
+  void final( const F & ,
+              typename enable_if< ( is_same<F,FunctorType>::value &&
+                                    ! FunctorHasFinal<F>::value )
+                                >::type * )
+    {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_PARALLEL_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_Qthread.hpp
+++ b/lib/kokkos/core/src/Kokkos_Qthread.hpp
@ -0,0 +1,158 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREAD_HPP
+#define KOKKOS_QTHREAD_HPP
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+class QthreadExec ;
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Execution space supported by Qthread
+ *
+ *  Only the interface is declared here; apart from the constructor,
+ *  every member function is defined elsewhere.  An object of this class
+ *  holds a reference to an Impl::QthreadExec.
+ */
+class Qthread {
+public:
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+  //! The tag (what type of kokkos_object is this).
+  typedef Impl::ExecutionSpaceTag  kokkos_tag ;
+
+  //! The device type (same as this class).
+  typedef Qthread                  device_type ;
+  //! The execution space (same as this class).
+  typedef Qthread                  execution_space ;
+  //! Memory space used by this execution space.
+  typedef Kokkos::HostSpace        memory_space ;
+  //! Scratch memory space (this class itself; see get_shmem).
+  typedef Qthread                  scratch_memory_space ;
+  //! Size type, taken from the memory space.
+  typedef memory_space::size_type  size_type ;
+  //! Array layout associated with this execution space.
+  typedef Kokkos::LayoutRight      array_layout ;
+  //! Host mirror type (same as this class).
+  typedef Kokkos::Qthread          host_mirror_device_type ;
+
+  //@}
+  /*------------------------------------------------------------------------*/
+  /** \brief  Initialization will construct one or more instances.
+   *
+   *  The integer argument defaults to 0.
+   *  NOTE(review): presumably it selects which instance is returned -
+   *  confirm against the definition.
+   */
+  static Qthread & instance( int = 0 );
+
+  /** \brief  Set the execution space to a "sleep" state.
+   *
+   * This function sets the "sleep" state in which it is not ready for work.
+   * This may consume fewer resources than the "ready" state,
+   * but it may also take time to transition to the "ready" state.
+   *
+   * NOTE(review): unlike wake() and fence(), this member is not declared
+   * static - confirm whether that is intentional.
+   *
+   * \return True if enters or is in the "sleep" state.
+   *         False if functions are currently executing.
+   */
+  bool sleep();
+
+  /** \brief  Wake from the sleep state.
+   *
+   *  \return True if enters or is in the "ready" state.
+   *          False if functions are currently executing.
+   */
+  static bool wake();
+
+  /** \brief Wait until all dispatched functions complete.
+   *
+   *  The parallel_for or parallel_reduce dispatch of a functor may
+   *  return asynchronously, before the functor completes.  This
+   *  method does not return until all dispatched functors on this
+   *  device have completed.
+   */
+  static void fence();
+
+  /*------------------------------------------------------------------------*/
+
+  /** \brief  Construct a handle that refers to the given QthreadExec.
+   *
+   *  Only the reference is stored; the QthreadExec is not copied.
+   */
+  Qthread( Impl::QthreadExec & e ) : m_exec(e) {}
+
+  /** \brief  Declared here, defined elsewhere.
+   *
+   *  NOTE(review): presumably returns scratch memory of the requested
+   *  size - confirm against the definition.
+   */
+  void * get_shmem( const int ) const ;
+
+  //! Declared here, defined elsewhere.
+  static int team_recommended();
+  //! Declared here, defined elsewhere.
+  static int team_max();
+
+  /*------------------------------------------------------------------------*/
+
+  //! Declared here, defined elsewhere; takes a thread count.
+  static void initialize( int thread_count );
+  //! Declared here, defined elsewhere.
+  static void finalize();
+
+  /** \brief Print configuration information to the given output stream. */
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //! Declared here, defined elsewhere.
+  int shepherd_size() const ;
+  //! Declared here, defined elsewhere.
+  int shepherd_worker_size() const ;
+
+private:
+
+  // QthreadExec is granted access to the private members of this class.
+  friend class Impl::QthreadExec ;
+
+  // The QthreadExec this handle refers to; bound once in the constructor.
+  Impl::QthreadExec & m_exec ;
+
+};
+
+/*--------------------------------------------------------------------------*/
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <Kokkos_Parallel.hpp>
+#include <Qthread/Kokkos_QthreadExec.hpp>
+#include <Qthread/Kokkos_Qthread_Parallel.hpp>
+
+#endif /* #define KOKKOS_QTHREAD_HPP */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
--- a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
@ -0,0 +1,114 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SCRATCHSPACE_HPP
+#define KOKKOS_SCRATCHSPACE_HPP
+
+#include <stdio.h>
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Scratch memory space associated with an execution space.
+ *
+ *  Wraps a caller-supplied byte range and hands out consecutive chunks
+ *  of it through get_shmem().  Every request is rounded up to a multiple
+ *  of ALIGN bytes.  There is no way to return a chunk; the object only
+ *  moves a cursor forward through the range.
+ */
+template< class ExecSpace >
+class ScratchMemorySpace {
+public:
+
+  // Granularity, in bytes, to which every get_shmem request is rounded up.
+  // Must be a power of two because align() relies on bit masking.
+  enum { ALIGN = 8 };
+
+private:
+
+  // Cursor: start of the next chunk to hand out.  Mutable so that the
+  // const member get_shmem() can advance it.
+  mutable char * m_iter ;
+  // One past the last byte of the wrapped range.
+  char *         m_end ;
+
+  // Declared private and not defined here: no default construction and
+  // no assignment.
+  ScratchMemorySpace();
+  ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
+
+  enum { MASK = ALIGN - 1 }; // Alignment used by View::shmem_size
+
+public:
+
+  typedef Impl::MemorySpaceTag              kokkos_tag ;
+  typedef ScratchMemorySpace                memory_space ;
+  typedef ExecSpace                         execution_space ;
+  typedef typename ExecSpace::array_layout  array_layout ;
+
+  /** \brief  Round 'size' up to the next multiple of ALIGN. */
+  template< typename IntType >
+  KOKKOS_INLINE_FUNCTION static
+  IntType align( const IntType & size )
+    { return ( size + MASK ) & ~MASK ; }
+
+  /** \brief  Hand out the next chunk of align(size) bytes.
+   *
+   *  \return Pointer to the start of the chunk, or 0 if the request does
+   *          not fit in the remaining range.  In the failure case a
+   *          diagnostic is printed and the cursor is left where it was,
+   *          so a later, smaller request can still succeed.
+   */
+  template< typename IntType >
+  KOKKOS_INLINE_FUNCTION
+  void * get_shmem( const IntType & size ) const
+    {
+      const IntType chunk = align( size );
+      void * tmp = m_iter ;
+      if ( m_end < ( m_iter += chunk ) ) {
+        // Printed value is the (negative) distance from the advanced
+        // cursor back to the end of the range, i.e. the shortfall.
+        printf("ScratchMemorySpace<...>::get_shmem overflow %ld\n",long(m_end-m_iter));
+        // Undo the advance: a failed request must not consume space,
+        // otherwise every subsequent request would fail as well.
+        m_iter -= chunk ;
+        tmp = 0 ;
+      }
+      return tmp ;
+    }
+
+  /** \brief  Wrap the byte range [ptr, ptr + size).
+   *
+   *  m_end is computed from m_iter, which is safe because m_iter is
+   *  declared (and therefore initialized) first.
+   */
+  template< typename IntType >
+  KOKKOS_INLINE_FUNCTION
+  ScratchMemorySpace( void * ptr , const IntType & size )
+    : m_iter( (char *) ptr )
+    , m_end(  m_iter + size )
+    {}
+};
+
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
--- a/lib/kokkos/core/src/Kokkos_Serial.hpp
+++ b/lib/kokkos/core/src/Kokkos_Serial.hpp
@ -0,0 +1,452 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Serial.hpp
+/// \brief Declaration and definition of Kokkos::Serial device.
+
+#ifndef KOKKOS_SERIAL_HPP
+#define KOKKOS_SERIAL_HPP
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/// \class Serial
+/// \brief Kokkos device for non-parallel execution
+///
+/// A "device" represents a parallel execution model.  It tells Kokkos
+/// how to parallelize the execution of kernels in a parallel_for or
+/// parallel_reduce.  For example, the Threads device uses Pthreads or
+/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language
+/// extensions, and the Cuda device uses NVIDIA's CUDA programming
+/// model.  The Serial device executes "parallel" kernels
+/// sequentially.  This is useful if you really do not want to use
+/// threads, or if you want to explore different combinations of MPI
+/// and shared-memory parallel programming models.
+///
+/// Every member of this class is static; the class carries no data.
+class Serial {
+public:
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! The tag (what type of kokkos_object is this).
+  typedef Impl::ExecutionSpaceTag  kokkos_tag ;
+
+  //! The device type (same as this class).
+  typedef Serial                device_type ;
+  //! The execution space (same as this class).
+  typedef Serial                execution_space ;
+  //! The size_type typedef best suited for this device.
+  typedef HostSpace::size_type  size_type ;
+  //! This device's preferred memory space.
+  typedef HostSpace             memory_space ;
+  //! This device's preferred array layout.
+  typedef LayoutRight           array_layout ;
+  /// \brief This device's host mirror type.
+  ///
+  /// Serial is a host device, so the host mirror type is the same as
+  /// the device type itself.
+  typedef Serial                host_mirror_device_type ;
+
+  /// \brief  Scratch memory space
+  typedef ScratchMemorySpace< Kokkos::Serial >  scratch_memory_space ;
+
+  //@}
+
+  /// \brief True if and only if this method is being called in a
+  ///   thread-parallel function.
+  ///
+  /// For the Serial device, this method <i>always</i> returns false,
+  /// because parallel_for or parallel_reduce with the Serial device
+  /// always execute sequentially.  The return type is int, so the
+  /// value seen by the caller is 0.
+  inline static int in_parallel() { return false ; }
+
+  /** \brief  Set the device in a "sleep" state.
+   *
+   * This function sets the device in a "sleep" state in which it is
+   * not ready for work.  This may consume less resources than if the
+   * device were in an "awake" state, but it may also take time to
+   * bring the device from a sleep state to be ready for work.
+   *
+   * \return True if the device is in the "sleep" state, else false if
+   *   the device is actively working and could not enter the "sleep"
+   *   state.
+   */
+  static bool sleep();
+
+  /// \brief Wake the device from the 'sleep' state so it is ready for work.
+  ///
+  /// \return True if the device is in the "ready" state, else "false"
+  ///  if the device is actively working (which also means that it's
+  ///  awake).
+  static bool wake();
+
+  /// \brief Wait until all dispatched functors complete.
+  ///
+  /// The parallel_for or parallel_reduce dispatch of a functor may
+  /// return asynchronously, before the functor completes.  This
+  /// method does not return until all dispatched functors on this
+  /// device have completed.
+  ///
+  /// For the Serial device the body is empty.
+  static void fence() {}
+
+  /// \brief Initialize the device.
+  ///
+  /// All four arguments are accepted and ignored; the casts to void
+  /// only mark them as deliberately unused.  The function has no other
+  /// effect.
+  static void initialize( unsigned threads_count = 1 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 ,
+                          bool allow_asynchronous_threadpool = false) {
+    (void) threads_count;
+    (void) use_numa_count;
+    (void) use_cores_per_numa;
+    (void) allow_asynchronous_threadpool;
+  }
+
+  //! Always returns 1, whether or not initialize() has been called.
+  static int is_initialized() { return 1 ; }
+
+  //! Free any resources being consumed by the device.  The body is empty.
+  static void finalize() {}
+
+  //! Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //--------------------------------------------------------------------------
+
+  //! Always returns 1; the argument is ignored.
+  inline static int thread_pool_size( int = 0 ) { return 1 ; }
+  //! Always returns 0.
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
+
+  //--------------------------------------------------------------------------
+
+  //! Forwards to thread_pool_rank(), so always 0.
+  inline static unsigned hardware_thread_id() { return thread_pool_rank(); }
+  //! Forwards to thread_pool_size(0), so always 1.
+  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
+
+  //! Forwards to thread_pool_size(1), so always 1.
+  static inline int team_max()         { return thread_pool_size(1) ; }
+  //! Forwards to thread_pool_size(2), so always 1.
+  static inline int team_recommended() { return thread_pool_size(2); }
+
+  //--------------------------------------------------------------------------
+
+  /// Declared here, defined elsewhere.
+  /// NOTE(review): presumably (re)sizes a scratch buffer to hold the
+  /// requested reduce and shared sizes and returns it - confirm against
+  /// the definition.
+  static void * scratch_memory_resize( unsigned reduce_size , unsigned shared_size );
+
+  //--------------------------------------------------------------------------
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+// Full specialization for the pair
+// ( Serial::memory_space , Serial::scratch_memory_space ).
+// Both verify() overloads have empty bodies, so calling them does nothing.
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::Serial::memory_space
+  , Kokkos::Serial::scratch_memory_space
+  >
+{
+  static inline void verify() {}
+  static inline void verify( const void * ) {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+// Team-member handle for the Serial execution space.  A team always has
+// exactly one member: team_rank() is 0 and team_size() is 1.
+class SerialTeamMember {
+private:
+  typedef Kokkos::ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
+  const scratch_memory_space  m_space ;
+  const int                   m_league_rank ;
+  const int                   m_league_size ;
+
+  // Declared private and not defined here: assignment is disabled.
+  SerialTeamMember & operator = ( const SerialTeamMember & );
+
+public:
+
+  /** \brief  Scratch memory space held by this team member. */
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_shmem() const { return m_space ; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
+
+  /** \brief  Team barrier; the body is empty. */
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   *
+   *  Here: with a null accumulator the result is Type(0); otherwise the
+   *  accumulator's value before 'value' was added to it is returned.
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
+    {
+      if ( ! global_accum ) { return Type(0) ; }
+
+      const Type previous = *global_accum ;
+      *global_accum += value ;
+      return previous ;
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   *
+   *  Here the argument is unused and the result is always Type(0).
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & ) const
+    { return Type(0); }
+
+  //----------------------------------------
+  // Execution space specific:
+
+  // Declared here, defined elsewhere.
+  SerialTeamMember( int arg_league_rank
+                  , int arg_league_size
+                  , int arg_shared_size
+                  );
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Team execution policy for Kokkos::Serial.
+ *
+ *  Stores only the league size; team_size() and team_size_max() are
+ *  fixed at 1 and any requested team size is ignored.
+ */
+template < class WorkArgTag >
+class TeamPolicy< Kokkos::Serial , WorkArgTag > {
+public:
+
+  typedef Impl::ExecutionPolicyTag   kokkos_tag ;      ///< Concept tag
+  typedef Kokkos::Serial             execution_space ; ///< Execution space
+  typedef Impl::SerialTeamMember     member_type ;     ///< Team member type
+
+private:
+
+  const int m_league_size ;
+
+public:
+
+  /** \brief  Specify league size, request team size.
+   *          The execution space argument and the team size request
+   *          are both unused.
+   */
+  TeamPolicy( execution_space &
+            , int league_size_request
+            , int /* team_size_request */ )
+    : m_league_size( league_size_request )
+    {}
+
+  /** \brief  Specify league size, request team size (request is unused). */
+  TeamPolicy( int league_size_request
+            , int /* team_size_request */ )
+    : m_league_size( league_size_request )
+    {}
+
+  /** \brief  Number of teams in the league, as given at construction. */
+  inline int league_size() const
+    { return m_league_size ; }
+
+  /** \brief  Always 1. */
+  inline int team_size() const
+    { return 1 ; }
+
+  /** \brief  Largest team size for the given functor: always 1. */
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+    { return 1 ; }
+};
+
+} /* namespace Kokkos */
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  ParallelFor over a RangePolicy on Kokkos::Serial.
+ *
+ *  The constructor performs the whole loop before returning: it calls
+ *  functor( i ) for every i in [ policy.begin() , policy.end() ),
+ *  in increasing order.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Kokkos::Serial , void , IntType , P >
+                 , Kokkos::Serial
+                 >
+{
+public:
+  typedef Kokkos::RangePolicy< Kokkos::Serial , void , IntType , P > Policy ;
+
+  ParallelFor( const FunctorType & functor
+             , const Policy      & policy )
+    {
+      typedef typename Policy::member_type index_type ;
+
+      // Query the end of the range once, before starting the loop.
+      const index_type range_end = policy.end();
+      index_type       iwork     = policy.begin();
+
+      while ( iwork < range_end ) {
+        functor( iwork );
+        ++iwork ;
+      }
+    }
+};
+
+/** \brief  ParallelReduce over a RangePolicy on Kokkos::Serial.
+ *
+ *  The constructor performs the whole reduction before returning:
+ *  initialize the reduction value, call functor( i , update ) for every
+ *  i in [ policy.begin() , policy.end() ) in increasing order, then
+ *  apply the functor's final step.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Kokkos::Serial , void , IntType , P >
+                    , Kokkos::Serial
+                    >
+{
+public:
+  typedef Kokkos::RangePolicy< Kokkos::Serial , void , IntType , P > Policy ;
+
+  typedef ReduceAdapter< FunctorType >  Reduce ;
+  typedef typename Reduce::pointer_type pointer_type ;
+
+  /** \brief  Reduce into 'result'.
+   *
+   *  Only enabled when ViewType is a view whose memory space is HostSpace.
+   *  When the view's pointer is null the reduction is carried out in
+   *  the Serial scratch memory instead.
+   */
+  template< class ViewType >
+  ParallelReduce( const FunctorType  & functor
+                , const Policy       & policy
+                , const ViewType     & result
+                , const typename enable_if<
+                   ( is_view< ViewType >::value &&
+                     is_same< typename ViewType::memory_space , HostSpace >::value
+                   )>::type * = 0
+                )
+    {
+      typedef typename Policy::member_type index_type ;
+
+      pointer_type const view_ptr = result.ptr_on_device();
+
+      // Scratch memory is only requested when the view has no storage.
+      pointer_type const result_ptr = view_ptr
+        ? view_ptr
+        : (pointer_type) Kokkos::Serial::scratch_memory_resize( Reduce::value_size( functor ) , 0 );
+
+      typename Reduce::reference_type update = Reduce::init( functor , result_ptr );
+
+      const index_type range_end = policy.end();
+      index_type       iwork     = policy.begin();
+
+      while ( iwork < range_end ) {
+        functor( iwork , update );
+        ++iwork ;
+      }
+
+      Reduce::final( functor , result_ptr );
+    }
+};
+
+/** \brief  ParallelScan over a RangePolicy on Kokkos::Serial.
+ *
+ *  The constructor performs the whole scan before returning.  The
+ *  running value lives in the Serial scratch memory; the functor is
+ *  called once per index, in increasing order, as
+ *  functor( i , update , true ).
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Kokkos::Serial , void , IntType , P >
+                  , Kokkos::Serial
+                  >
+{
+public:
+  typedef Kokkos::RangePolicy< Kokkos::Serial , void , IntType , P > Policy ;
+
+  typedef ReduceAdapter< FunctorType >  Reduce ;
+  typedef typename Reduce::pointer_type pointer_type ;
+
+  ParallelScan( const FunctorType  & functor
+               , const Policy      & policy
+               )
+    {
+      typedef typename Policy::member_type index_type ;
+
+      void * const scratch =
+        Kokkos::Serial::scratch_memory_resize( Reduce::value_size( functor ) , 0 );
+
+      pointer_type const result_ptr = (pointer_type) scratch ;
+
+      typename Reduce::reference_type update = Reduce::init( functor , result_ptr );
+
+      const index_type range_end = policy.end();
+      index_type       iwork     = policy.begin();
+
+      while ( iwork < range_end ) {
+        // Single pass: the third argument is always 'true'.
+        functor( iwork , update , true );
+        ++iwork ;
+      }
+
+      Reduce::final( functor , result_ptr );
+    }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  ParallelFor over a TeamPolicy on Kokkos::Serial.
+ *
+ *  The constructor sizes the Serial scratch memory for the functor's
+ *  team shared memory, then calls the functor once per league rank,
+ *  in increasing order, with a freshly constructed team member.
+ */
+template< class FunctorType >
+class ParallelFor< FunctorType , Kokkos::TeamPolicy< Kokkos::Serial , void > , Kokkos::Serial >
+{
+public:
+  typedef Kokkos::TeamPolicy< Kokkos::Serial , void > Policy ;
+
+  ParallelFor( const FunctorType & functor
+             , const Policy      & policy )
+    {
+      typedef typename Policy::member_type Member ;
+
+      const int shared_size =
+        FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
+
+      // No reduction storage is needed, only the team shared memory.
+      Kokkos::Serial::scratch_memory_resize( 0 , shared_size );
+
+      int league_rank = 0 ;
+
+      while ( league_rank < policy.league_size() ) {
+        functor( Member( league_rank , policy.league_size() , shared_size ) );
+        ++league_rank ;
+      }
+    }
+};
+
+/** \brief  ParallelReduce over a TeamPolicy on Kokkos::Serial.
+ *
+ *  The constructor sizes the Serial scratch memory for both the
+ *  reduction value and the functor's team shared memory, then calls
+ *  functor( member , update ) once per league rank, in increasing
+ *  order, and finally applies the functor's final step.
+ */
+template< class FunctorType >
+class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Kokkos::Serial , void > , Kokkos::Serial > {
+public:
+
+  typedef Kokkos::TeamPolicy< Kokkos::Serial , void > Policy ;
+  typedef ReduceAdapter< FunctorType >  Reduce ;
+  typedef typename Reduce::pointer_type pointer_type ;
+
+  /** \brief  Reduce into 'result'; when the view's pointer is null the
+   *          reduction is carried out in the Serial scratch memory.
+   */
+  template< class ViewType >
+  ParallelReduce( const FunctorType  & functor
+                , const Policy       & policy
+                , const ViewType     & result
+                )
+    {
+      typedef typename Policy::member_type Member ;
+
+      const int reduce_size = Reduce::value_size( functor );
+      const int shared_size =
+        FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
+
+      // The scratch memory is always resized, even if 'result' has storage.
+      void * const scratch_reduce =
+        Kokkos::Serial::scratch_memory_resize( reduce_size , shared_size );
+
+      const pointer_type result_ptr =
+        result.ptr_on_device() ? result.ptr_on_device()
+                               : (pointer_type) scratch_reduce ;
+
+      typename Reduce::reference_type update = Reduce::init( functor , result_ptr );
+
+      int league_rank = 0 ;
+
+      while ( league_rank < policy.league_size() ) {
+        functor( Member( league_rank , policy.league_size() , shared_size ) , update );
+        ++league_rank ;
+      }
+
+      Reduce::final( functor , result_ptr );
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #define KOKKOS_SERIAL_HPP */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
--- a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
@ -0,0 +1,249 @@
+
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_TASKPOLICY_HPP
+#define KOKKOS_TASKPOLICY_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_StaticAssert.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/// Empty tag type; presumably reported when a Future's value type is
+/// void where a value is required -- its use is not in this header.
+struct FutureValueTypeIsVoidError {};
+
+/// Per-execution-space task manager (declared only).
+template< class ExecSpace > class TaskManager ;
+
+/// Task record; ResultType and Functor default to void (declared only).
+template < class Policy , class ResultType = void , class Functor = void > class TaskMember ;
+
+/// Tag type exposing the execution space of the wrapped policy.
+template< class ExecPolicy >
+struct TaskDepends {
+  typedef typename ExecPolicy::execution_space  execution_space ;
+};
+
+/// Tag type exposing the execution space of the wrapped policy.
+template< class ExecPolicy >
+struct TaskForEach {
+  typedef typename ExecPolicy::execution_space  execution_space ;
+};
+
+/// Tag type exposing the execution space of the wrapped policy.
+template< class ExecPolicy >
+struct TaskReduce {
+  typedef typename ExecPolicy::execution_space  execution_space ;
+};
+
+/// Tag type exposing the execution space of the wrapped policy.
+template< class ExecPolicy >
+struct TaskScan {
+  typedef typename ExecPolicy::execution_space  execution_space ;
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Handle to a task, optionally typed by the task's result value.
+ *
+ *  Accepted template argument forms:
+ *
+ *  Future< space >          // value_type == void
+ *  Future< value >          // space == Kokkos::DefaultExecutionSpace
+ *  Future< value , space >
+ *
+ *  Any other combination fails the StaticAssert below.
+ *
+ *  The handle holds a single task pointer.  Every change of that
+ *  pointer goes through TaskManager::assign(), which is defined
+ *  elsewhere -- presumably it releases the old task and acquires the
+ *  new one; confirm against the TaskManager implementation.
+ */
+template< class Arg1 = void , class Arg2 = void >
+class Future {
+private:
+
+  template< class > friend class Impl::TaskManager ;
+  template< class , class > friend class Future ;
+
+  // Argument #2, if not void, must be the space.
+  enum { Arg1_is_space  = Impl::is_execution_space< Arg1 >::value };
+  enum { Arg2_is_space  = Impl::is_execution_space< Arg2 >::value };
+  enum { Arg2_is_void   = Impl::is_same< Arg2 , void >::value };
+
+  // Tag type handed to StaticAssert for the case that none of the
+  // three accepted argument forms matches.
+  struct ErrorNoExecutionSpace {};
+
+  // Opt1: Future< space >
+  // Opt2: Future< value >
+  // Opt3: Future< value , space >
+  // OptOK: forces the compile-time check that one of the forms applies.
+  enum { Opt1  =   Arg1_is_space && Arg2_is_void
+       , Opt2  = ! Arg1_is_space && Arg2_is_void
+       , Opt3  = ! Arg1_is_space && Arg2_is_space 
+       , OptOK = Impl::StaticAssert< Opt1 || Opt2 || Opt3 , ErrorNoExecutionSpace >::value
+       };
+
+  // Value type: Arg1 for the two 'value' forms, otherwise void.
+  typedef typename
+    Impl::if_c< Opt2 || Opt3 , Arg1 , void >::type
+      ValueType ;
+
+  // Execution space: Arg1 (Opt1), the default space (Opt2), Arg2 (Opt3).
+  typedef typename
+    Impl::if_c< Opt1 , Arg1 , typename
+    Impl::if_c< Opt2 , Kokkos::DefaultExecutionSpace , typename
+    Impl::if_c< Opt3 , Arg2 , void
+    >::type >::type >::type
+      ExecutionSpace ;
+
+  typedef Impl::TaskManager< ExecutionSpace >              TaskManager ;
+  typedef Impl::TaskMember<  ExecutionSpace >              TaskRoot ;   // value-type-erased task
+  typedef Impl::TaskMember<  ExecutionSpace , ValueType >  TaskValue ;  // task typed by its value
+
+  // The referenced task; null for a default-constructed Future.
+  TaskRoot * m_task ;
+
+public:
+
+  typedef ValueType       value_type;
+  typedef ExecutionSpace  execution_space ;
+
+  //----------------------------------------
+
+  /** \brief  Reference an existing task.
+   *
+   *  The pointer is passed through TaskValue::verify_type() before it
+   *  is assigned; that function is defined elsewhere and is presumably
+   *  where a value-type mismatch is detected -- confirm.
+   */
+  explicit
+  Future( TaskRoot * task )
+    : m_task(0)
+    { TaskManager::assign( & m_task , TaskValue::verify_type( task ) ); }
+
+  //----------------------------------------
+
+  /** \brief  Drop the reference by assigning null to the task pointer. */
+  KOKKOS_INLINE_FUNCTION
+  ~Future() { TaskManager::assign( & m_task , 0 ); }
+
+  //----------------------------------------
+
+  /** \brief  Construct a Future that references no task. */
+  KOKKOS_INLINE_FUNCTION
+  Future() : m_task(0) {}
+
+  /** \brief  Reference the same task as 'rhs'. */
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future & rhs )
+    : m_task(0)
+    { TaskManager::assign( & m_task , rhs.m_task ); }
+
+  /** \brief  Re-point this Future at the task referenced by 'rhs'. */
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future & rhs )
+    { TaskManager::assign( & m_task , rhs.m_task ); return *this ; }
+
+  //----------------------------------------
+
+  /** \brief  Converting copy from a Future with other template arguments.
+   *          The source task pointer goes through TaskValue::verify_type().
+   */
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future<A1,A2> & rhs )
+    : m_task(0)
+    { TaskManager::assign( & m_task , TaskValue::verify_type( rhs.m_task ) ); }
+
+  /** \brief  Converting assignment from a Future with other template
+   *          arguments.  The source task pointer goes through
+   *          TaskValue::verify_type().
+   */
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future<A1,A2> & rhs )
+    { TaskManager::assign( & m_task , TaskValue::verify_type( rhs.m_task ) ); return *this ; }
+
+  //----------------------------------------
+
+  typedef typename TaskValue::get_result_type get_result_type ;
+
+  /** \brief  Return the task's result via TaskValue::get().
+   *
+   *  The task pointer is used without a null check, so this must not
+   *  be called on a Future that references no task.
+   */
+  KOKKOS_INLINE_FUNCTION
+  typename TaskValue::get_result_type get() const
+    { return static_cast<TaskValue*>( m_task )->get(); }
+};
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Interface sketch for spawning and waiting on tasks.
+ *
+ *  Every member function is declared only; none is defined in this
+ *  class body.  The 'TaskPolicy< void ... >' return types of depends(),
+ *  foreach(), reduce() and scan() are placeholders, as the elided
+ *  template arguments indicate.
+ *
+ *  \tparam Policy  Type providing an 'execution_space' member typedef;
+ *                  defaults to Kokkos::DefaultExecutionSpace.
+ */
+template< class Policy = Kokkos::DefaultExecutionSpace >
+class TaskPolicy {
+public:
+
+  typedef typename Policy::execution_space  execution_space ;
+
+  /** \brief  Wait on the task referenced by the given Future. */
+  template< class A1 , class A2 >
+  void wait( const Future<A1,A2> & ) const ;
+
+  /** \brief  Spawn a task for the functor; the returned Future is typed
+   *          by FunctorType::value_type and this policy's execution space.
+   */
+  template< class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  spawn( const FunctorType & ) const ;
+
+  /** \brief  Respawn the task of the given functor.
+   *          NOTE(review): exact respawn semantics are not shown here.
+   */
+  template< class FunctorType >
+  void respawn( FunctorType * ) const ;
+
+  /** \brief  Query a dependence of the given functor's task, returned
+   *          as a value-less Future.
+   *          NOTE(review): which dependence is returned is not shown here.
+   */
+  template< class FunctorType >
+  Future< void , execution_space >
+  get_dependence( FunctorType * ) const ;
+
+  /** \brief  Declare dependences from an array of Futures and a count.
+   *          Unlike the members above, this one is not const.
+   */
+  template< class ValueType >
+  TaskPolicy< void /* ... */ >
+  depends( const Future< ValueType , execution_space > * const , const int );
+
+  /** \brief  Attach a data-parallel 'foreach' execution policy. */
+  template< class ExecPolicy >
+  TaskPolicy< void /* ... */ > foreach( const ExecPolicy & );
+
+  /** \brief  Attach a data-parallel 'reduce' execution policy. */
+  template< class ExecPolicy >
+  TaskPolicy< void /* ... */ > reduce( const ExecPolicy & );
+
+  /** \brief  Attach a data-parallel 'scan' execution policy. */
+  template< class ExecPolicy >
+  TaskPolicy< void /* ... */ > scan( const ExecPolicy & );
+};
+
+// spawn( M.depends(n,d).foreach(K) , functor );
+// M.depends(n,d).foreach(K).spawn( functor );
+
+/** \brief  Free-function form of TaskPolicy::spawn.
+ *
+ *  Forwards the functor to policy.spawn() and returns its Future.
+ */
+template< class PolicyType , class FunctorType >
+inline
+Future< typename FunctorType::value_type , typename PolicyType::execution_space >
+spawn( const TaskPolicy< PolicyType > & policy , const FunctorType & functor )
+{
+  return policy.spawn( functor );
+}
+
+/** \brief  Free-function form of TaskPolicy::wait.
+ *
+ *  Only participates in overload resolution when the policy's
+ *  execution space is the same type as the Future's execution space.
+ */
+template< class PolicyType , class A1 , class A2 >
+void wait( const TaskPolicy< PolicyType > & policy
+         , const Future<A1,A2>            & future
+         , typename Impl::enable_if<
+             Impl::is_same< typename PolicyType::execution_space
+                          , typename Future<A1,A2>::execution_space
+                          >::value
+           >::type * = 0 )
+{
+  policy.wait( future );
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_TASKPOLICY_HPP */
+
--- a/lib/kokkos/core/src/Kokkos_Threads.hpp
+++ b/lib/kokkos/core/src/Kokkos_Threads.hpp
@ -0,0 +1,222 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_HPP
+#define KOKKOS_THREADS_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+class ThreadsExec ;
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Device for a pool of Pthreads or C11 threads on a CPU.
+ *
+ *  All member functions are static.  Apart from the inline forwarding
+ *  helpers at the end of the class they are only declared here.
+ */
+class Threads {
+public:
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+  //! The tag (what type of kokkos_object is this).
+  typedef Impl::ExecutionSpaceTag  kokkos_tag ;
+
+  typedef Threads                  device_type ;
+  typedef Threads                  execution_space ;
+  typedef Kokkos::HostSpace        memory_space ;
+  typedef memory_space::size_type  size_type ;
+  typedef Kokkos::LayoutRight      array_layout ;
+  typedef Kokkos::Threads          host_mirror_device_type ;
+
+  //! Scratch memory space associated with this device.
+  typedef ScratchMemorySpace< Threads >  scratch_memory_space ;
+  //@}
+  /*------------------------------------------------------------------------*/
+  //! \name Static functions that all Kokkos devices must implement.
+  //@{
+
+  /// \brief True if and only if this method is being called in a
+  ///   thread-parallel function.
+  static int in_parallel();
+
+  /** \brief  Set the device in a "sleep" state.
+   *
+   * This function sets the device in a "sleep" state in which it is
+   * not ready for work.  This may consume less resources than if the
+   * device were in an "awake" state, but it may also take time to
+   * bring the device from a sleep state to be ready for work.
+   *
+   * \return True if the device is in the "sleep" state, else false if
+   *   the device is actively working and could not enter the "sleep"
+   *   state.
+   */
+  static bool sleep();
+
+  /// \brief Wake the device from the 'sleep' state so it is ready for work.
+  ///
+  /// \return True if the device is in the "ready" state, else "false"
+  ///  if the device is actively working (which also means that it's
+  ///  awake).
+  static bool wake();
+
+  /// \brief Wait until all dispatched functors complete.
+  ///
+  /// The parallel_for or parallel_reduce dispatch of a functor may
+  /// return asynchronously, before the functor completes.  This
+  /// method does not return until all dispatched functors on this
+  /// device have completed.
+  static void fence();
+
+  /// \brief Free any resources being consumed by the device.
+  ///
+  /// For the Threads device, this terminates spawned worker threads.
+  static void finalize();
+
+  /// \brief Print configuration information to the given output stream.
+  ///
+  /// \param detail  Optional flag, false by default; presumably requests
+  ///   more detailed output -- confirm against the implementation.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //@}
+  /*------------------------------------------------------------------------*/
+  /*------------------------------------------------------------------------*/
+  //! \name Device-specific functions
+  //@{
+
+  /** \brief Initialize the device in the "ready to work" state.
+   *
+   *  The device is initialized in a "ready to work" or "awake" state.
+   *  This state reduces latency and thus improves performance when
+   *  dispatching work.  However, the "awake" state consumes resources
+   *  even when no work is being done.  You may call sleep() to put
+   *  the device in a "sleeping" state that does not consume as many
+   *  resources, but it will take time (latency) to awaken the device
+   *  again (via the wake() method) so that it is ready for work.
+   *
+   *  Teams of threads are distributed as evenly as possible across
+   *  the requested number of numa regions and cores per numa region.
+   *  A team will not be split across a numa region.
+   *
+   *  If the 'use_' arguments are not supplied (left at 0) then hwloc
+   *  is queried to use all available cores.
+   */
+  static void initialize( unsigned threads_count = 1 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 ,
+                          bool allow_asynchronous_threadpool = false );
+
+  /** \brief  Query whether the device has been initialized.
+   *          Returns an int; presumably nonzero means initialized -- confirm.
+   */
+  static int is_initialized();
+
+  /** \brief  Access a device instance; the int argument defaults to 0.
+   *          NOTE(review): the meaning of the argument is not shown here.
+   */
+  static Threads & instance( int = 0 );
+
+  //----------------------------------------
+  /** \brief  Size of the thread pool at the given depth.
+   *
+   *  The inline helpers below use depth 0 for max_hardware_threads(),
+   *  depth 1 for team_max() and depth 2 for team_recommended().
+   */
+
+   static int thread_pool_size( int depth = 0 );
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+   /** \brief  Rank of the calling thread within the thread pool. */
+   static int thread_pool_rank();
+#else
+  /** \brief  Stand-in used when the host is not the active execution
+   *          memory space: always returns 0.
+   */
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
+#endif
+
+  /** \brief  Recommended team size: thread_pool_size(2). */
+  inline static unsigned team_recommended() { return thread_pool_size(2); }
+
+  /** \brief  Maximum size of a single thread team: thread_pool_size(1).
+   *
+   *  If a parallel_{for,reduce,scan} operation requests a team_size that
+   *  does not satisfy the condition: 0 == team_max() % team_size
+   *  then some threads will idle.
+   */
+  inline static unsigned team_max()         { return thread_pool_size(1); }
+
+  /** \brief  Total size of the thread pool: thread_pool_size(0). */
+  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
+
+  /** \brief  Identifier of the calling thread: thread_pool_rank(). */
+  KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
+
+  //@}
+  //----------------------------------------
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Access check for the pairing of Kokkos::Threads::memory_space
+ *          with Kokkos::Threads::scratch_memory_space.
+ *
+ *  Both verify() overloads have empty bodies, so this pairing is
+ *  accepted unconditionally.
+ */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::Threads::memory_space
+                                          , Kokkos::Threads::scratch_memory_space >
+{
+  //! Pointer form: the argument is ignored, nothing is checked.
+  static inline void verify( const void * ) {}
+
+  //! Argument-free form: nothing is checked.
+  static inline void verify() {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+#include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Threads/Kokkos_ThreadsExec.hpp>
+#include <Threads/Kokkos_Threads_Parallel.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
+#endif /* #define KOKKOS_THREADS_HPP */
+
+
--- a/lib/kokkos/core/src/Kokkos_Vectorization.hpp
+++ b/lib/kokkos/core/src/Kokkos_Vectorization.hpp
@ -0,0 +1,100 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Vectorization.hpp
+/// \brief Declaration and definition of Kokkos::Vectorization interface.
+#ifndef KOKKOS_VECTORIZATION_HPP
+#define KOKKOS_VECTORIZATION_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+
+namespace Kokkos {
+/// \brief Generic (non-vectorized) implementation of the Vectorization
+///   interface.
+///
+/// In this generic form 'increment' is 1, every thread is lane 0 and
+/// reduce() is the identity.  The template parameter N is not used here.
+///
+/// \tparam Device  Execution space used to select the TeamPolicy.
+/// \tparam N       Requested vector length (unused in the generic form).
+template<class Device, int N>
+struct Vectorization {
+  typedef Kokkos::TeamPolicy< Device >       team_policy ;
+  typedef typename team_policy::member_type  team_member ;
+
+  /// Number of threads per vector; 1 means no vectorization.
+  enum {increment = 1};
+
+  /// \brief First index of a vector loop: always 0.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static int begin() { return 0;}
+
+  /// \brief Rank of the calling thread within its team.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static int thread_rank(const team_member &dev) {
+    return dev.team_rank();
+  }
+
+  /// \brief Team rank in units of vectors (team rank divided by 'increment').
+  KOKKOS_FORCEINLINE_FUNCTION
+  static int team_rank(const team_member &dev) {
+    return dev.team_rank()/increment;
+  }
+
+  /// \brief Team size in units of vectors (team size divided by 'increment').
+  KOKKOS_FORCEINLINE_FUNCTION
+  static int team_size(const team_member &dev) {
+    return dev.team_size()/increment;
+  }
+
+  /// \brief Rank of the calling thread across the whole league:
+  ///   league_rank * team_size + team_rank.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static int global_thread_rank(const team_member &dev) {
+    return (dev.league_rank()*dev.team_size()+dev.team_rank());
+  }
+
+  /// \brief Whether the calling thread is lane 0 of its vector: always
+  ///   true here.  The parameter is intentionally left unnamed because
+  ///   it is not used, which avoids unused-parameter warnings.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static bool is_lane_0(const team_member & /* dev */) {
+    return true;
+  }
+
+  /// \brief Reduction across the lanes of a vector: returns 'val'
+  ///   unchanged.
+  template<class Scalar>
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar reduce(const Scalar& val) {
+    return val;
+  }
+};
+}
+
+#if defined( KOKKOS_HAVE_CUDA )
+#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
+#endif
+
+#endif
--- a/lib/kokkos/core/src/Kokkos_View.hpp
+++ b/lib/kokkos/core/src/Kokkos_View.hpp
--- a/lib/kokkos/core/src/Kokkos_hwloc.hpp
+++ b/lib/kokkos/core/src/Kokkos_hwloc.hpp
@ -0,0 +1,140 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HWLOC_HPP
+#define KOKKOS_HWLOC_HPP
+
+#include <utility>
+
+namespace Kokkos {
+
+/** \brief  Minimal subset of logical 'hwloc' functionality available
+ *          from http://www.open-mpi.org/projects/hwloc/.
+ *
+ *  The calls are NOT thread safe in order to avoid mutexes,
+ *  memory allocations, or other actions which could give the
+ *  runtime system an opportunity to migrate the threads or
+ *  touch allocated memory during the function calls.
+ *
+ *  All calls to these functions should be performed by a thread
+ *  when it has guaranteed exclusive access; e.g., for OpenMP
+ *  within a 'critical' region.
+ */
+namespace hwloc {
+
+/** \brief  Query if hwloc is available. */
+bool available();
+
+/** \brief  Query number of available NUMA regions.
+ *          This will be less than the hardware capacity
+ *          if the MPI process is pinned to a NUMA region.
+ */
+unsigned get_available_numa_count();
+
+/** \brief  Query number of available cores per NUMA region.
+ *          This will be less than the hardware capacity
+ *          if the MPI process is pinned to a set of cores.
+ */
+unsigned get_available_cores_per_numa();
+
+/** \brief  Query number of available "hard" threads per core; i.e., hyperthreads. */
+unsigned get_available_threads_per_core();
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Internal functions for binding persistent spawned threads.
+
+namespace Kokkos {
+namespace hwloc {
+
+/** \brief  Recommend mapping of threads onto cores.
+ *
+ * The three count arguments are passed by reference and are both
+ * inputs and outputs:
+ *
+ * If thread_count == 0 then choose and set a value.
+ * If use_numa_count == 0 then choose and set a value.
+ * If use_cores_per_numa == 0 then choose and set a value.
+ *
+ * \param threads_coord  Array of core coordinates, one pair per thread.
+ *                       NOTE(review): the caller presumably has to
+ *                       provide room for thread_count entries -- confirm.
+ *
+ * Return 0 if asynchronous,
+ * Return 1 if synchronous and threads_coord[0] is process core
+ */
+unsigned thread_mapping( const char * const label ,
+                         const bool allow_async ,
+                         unsigned & thread_count ,
+                         unsigned & use_numa_count ,
+                         unsigned & use_cores_per_numa ,
+                         std::pair<unsigned,unsigned> threads_coord[] );
+
+/** \brief  Query core-coordinate of the current thread
+ *          with respect to the core_topology.
+ *
+ *  As long as the thread is running within the
+ *  process binding the following conditions hold.
+ *
+ *  core_coordinate.first  < core_topology.first
+ *  core_coordinate.second < core_topology.second
+ */
+std::pair<unsigned,unsigned> get_this_thread_coordinate();
+
+/** \brief  Bind the current thread to a core, given its coordinate.
+ *          Returns a bool; presumably true on success -- confirm.
+ */
+bool bind_this_thread( const std::pair<unsigned,unsigned> );
+
+/** \brief  Bind the current thread to one of the cores in the list.
+ *          Set that entry to (~0,~0) and return the index.
+ *          If binding fails return ~0.
+ */
+unsigned bind_this_thread( const unsigned               coordinate_count ,
+                           std::pair<unsigned,unsigned> coordinate[] );
+
+/** \brief  Unbind the current thread back to the original process binding.
+ *          Returns a bool; presumably true on success -- confirm.
+ */
+bool unbind_this_thread();
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_HWLOC_HPP */
+
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
@ -0,0 +1,373 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMP_PARALLEL_HPP
+#define KOKKOS_OPENMP_PARALLEL_HPP
+
+#include <omp.h>
+
+#include <Kokkos_Parallel.hpp>
+#include <OpenMP/Kokkos_OpenMPexec.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  OpenMP closure for parallel_for over a RangePolicy.
+ *
+ *  All work happens in the constructor: every thread of one OpenMP
+ *  parallel region builds a policy from the caller's policy plus its own
+ *  pool rank and the pool size, then applies the functor to each index
+ *  in [begin,end) of that policy.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Kokkos::OpenMP , void , IntType , P >
+                 , Kokkos::OpenMP
+                 >
+{
+public:
+  typedef Kokkos::RangePolicy< Kokkos::OpenMP , void , IntType , P > Policy ;
+
+  inline
+  ParallelFor( const FunctorType & functor
+             , const Policy      & policy )
+  {
+    // Must be called from the process thread with the pool already set up.
+    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
+    OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
+
+#pragma omp parallel
+    {
+      typedef typename Policy::member_type index_type ;
+
+      OpenMPexec * const self = OpenMPexec::get_thread_omp();
+
+      // Policy specialized with this thread's rank within the pool.
+      const Policy my_range( policy , self->pool_rank() , self->pool_size() );
+
+      const index_type i_end = my_range.end();
+      index_type       i     = my_range.begin();
+
+      while ( i < i_end ) {
+        functor( i );
+        ++i ;
+      }
+    }
+/* END #pragma omp parallel */
+  }
+};
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  OpenMP closure for parallel_reduce over a RangePolicy.
+ *
+ *  The constructor does the whole reduction:
+ *    1) each thread reduces its part of the range into its own reduce scratch,
+ *    2) the calling thread joins all per-thread values into the scratch
+ *       of pool_rev(0) and applies Reduce::final,
+ *    3) the result is copied to result_view when its pointer is non-null.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Kokkos::OpenMP , void , IntType , P >
+                    , Kokkos::OpenMP
+                    >
+{
+public:
+  typedef ReduceAdapter< FunctorType >   Reduce ;
+  typedef typename Reduce::pointer_type  pointer_type ;
+  typedef Kokkos::RangePolicy< Kokkos::OpenMP , void , IntType , P > Policy ;
+
+  template< class HostViewType >
+  inline
+  ParallelReduce( const FunctorType  & functor
+                , const Policy       & policy
+                , const HostViewType & result_view )
+  {
+    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+    OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+
+    // Make sure each thread has room for one reduction value.
+    OpenMPexec::resize_scratch( Reduce::value_size( functor ) , 0 );
+
+#pragma omp parallel
+    {
+      typedef typename Policy::member_type index_type ;
+
+      OpenMPexec * const self = OpenMPexec::get_thread_omp();
+
+      // Thread-private accumulator living in this thread's reduce scratch.
+      typename Reduce::reference_type update = Reduce::init( functor , self->scratch_reduce() );
+
+      const Policy my_range( policy , self->pool_rank() , self->pool_size() );
+
+      const index_type i_end = my_range.end();
+      index_type       i     = my_range.begin();
+
+      while ( i < i_end ) {
+        functor( i , update );
+        ++i ;
+      }
+    }
+/* END #pragma omp parallel */
+
+    // Serial combine on the calling thread.
+    const pointer_type total = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
+
+    for ( int rank_rev = 1 ; rank_rev < OpenMPexec::pool_size() ; ++rank_rev ) {
+      Reduce::join( functor , total , OpenMPexec::pool_rev( rank_rev )->scratch_reduce() );
+    }
+
+    Reduce::final( functor , total );
+
+    if ( result_view.ptr_on_device() ) {
+      const int count = Reduce::value_count( functor );
+
+      for ( int k = 0 ; k < count ; ++k ) {
+        result_view.ptr_on_device()[k] = total[k] ;
+      }
+    }
+  }
+
+  void wait() {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Kokkos::OpenMP , void , IntType , P >
+                  , Kokkos::OpenMP
+                  >
+{
+public:
+  typedef ReduceAdapter< FunctorType >   Reduce ;
+  typedef typename Reduce::pointer_type  pointer_type ;
+  typedef Kokkos::RangePolicy< Kokkos::OpenMP , void , IntType , P > Policy ;
+  // The constructor runs the whole scan: parallel pass with final==false, serial pass over per-thread scratch, parallel pass with final==true.
+  inline
+  ParallelScan( const FunctorType & functor , const Policy & policy )
+  {
+    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
+    OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
+    // Twice the value size: the code below uses a first half at offset 0 and a second half at offset value_count.
+    OpenMPexec::resize_scratch( 2 * Reduce::value_size( functor ) , 0 );
+    // Pass 1: each thread accumulates over its own range into the SECOND half of its scratch.
+#pragma omp parallel
+    {
+      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+
+      const Policy range( policy , exec.pool_rank() , exec.pool_size() );
+
+      typename Reduce::reference_type update =
+        Reduce::init( functor ,
+                      pointer_type( exec.scratch_reduce() ) + Reduce::value_count( functor ) );
+
+      const typename Policy::member_type work_end = range.end();
+      for ( typename Policy::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
+        functor( iwork , update , false ); // third argument false on this first pass
+      }
+    }
+/* END #pragma omp parallel */
+    // Serial pass on the calling thread, visiting threads from the highest rank_rev down to 0.
+    {
+      const unsigned thread_count = OpenMPexec::pool_size();
+      const unsigned value_count  = Reduce::value_count( functor );
+
+      pointer_type ptr_prev = 0 ; // scratch of the thread visited in the previous iteration
+
+      for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
+
+        pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
+
+        if ( ptr_prev ) {
+          for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; } // first half := previous thread's second half
+          Reduce::join( functor , ptr + value_count , ptr ); // second half := own pass-1 value joined with the first half
+        }
+        else {
+          Reduce::init( functor , ptr ); // first thread visited: first half gets the functor's initial value
+        }
+
+        ptr_prev = ptr ;
+      }
+    }
+    // Pass 2: each thread re-runs its range, starting from the value left in the FIRST half of its scratch.
+#pragma omp parallel
+    {
+      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+
+      const Policy range( policy , exec.pool_rank() , exec.pool_size() );
+
+      typename Reduce::reference_type update =
+        Reduce::reference( pointer_type( exec.scratch_reduce() ) );
+
+      const typename Policy::member_type work_end = range.end();
+      for ( typename Policy::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
+        functor( iwork , update , true ); // third argument true on this second pass
+      }
+    }
+/* END #pragma omp parallel */
+
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  OpenMP closure for parallel_for over a TeamPolicy.
+ *
+ *  The constructor sizes the per-thread scratch for the team reduce
+ *  space plus the functor's team shared memory, then, inside one OpenMP
+ *  parallel region, each thread builds a team member handle and calls
+ *  the functor for as long as that handle reports valid().
+ */
+template< class FunctorType >
+class ParallelFor< FunctorType , Kokkos::TeamPolicy< Kokkos::OpenMP , void > , Kokkos::OpenMP >
+{
+public:
+  typedef Kokkos::TeamPolicy< Kokkos::OpenMP , void > Policy ;
+
+  inline
+  ParallelFor( const FunctorType & functor ,
+               const Policy      & policy )
+  {
+    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
+    OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
+
+    const size_t reduce_bytes = Policy::member_type::team_reduce_size();
+    const size_t shmem_bytes  = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
+
+    // No reduce scratch is requested; only thread scratch.
+    OpenMPexec::resize_scratch( 0 , reduce_bytes + shmem_bytes );
+
+#pragma omp parallel
+    {
+      typedef typename Policy::member_type member_type ;
+
+      member_type team_member( * OpenMPexec::get_thread_omp() , policy , shmem_bytes );
+
+      while ( team_member.valid() ) {
+        functor( team_member );
+        team_member.next();
+      }
+    }
+/* END #pragma omp parallel */
+  }
+
+  void wait() {}
+};
+
+/** \brief  OpenMP closure for parallel_reduce over a TeamPolicy.
+ *
+ *  Both constructors perform the complete reduction; the three-argument
+ *  form additionally copies the final value into 'result'.
+ */
+template< class FunctorType >
+class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Kokkos::OpenMP , void > , Kokkos::OpenMP >
+{
+public:
+  typedef Kokkos::TeamPolicy< Kokkos::OpenMP , void > Policy ;
+  typedef ReduceAdapter< FunctorType >   Reduce ;
+  typedef typename Reduce::pointer_type  pointer_type ;
+
+private:
+
+  /** \brief  Run the reduction and return the pointer to the final value.
+   *
+   *  1) verify the caller is the process thread and the pool is initialized,
+   *  2) size reduce scratch for one value and thread scratch for the
+   *     team reduce space plus the functor's team shared memory,
+   *  3) each thread accumulates over its team members into its own
+   *     reduce scratch,
+   *  4) the calling thread joins all per-thread values into the scratch
+   *     of pool_rev(0) and applies Reduce::final.
+   *
+   *  The returned pointer refers to pool scratch memory, so it is only
+   *  meaningful until the scratch is resized or cleared.
+   */
+  static
+  pointer_type execute( const FunctorType & functor , const Policy & policy )
+  {
+    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+    // Without this check an uninitialized pool would reach resize_scratch()
+    // and dereference null pool entries.
+    OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+
+    const size_t team_reduce_size = Policy::member_type::team_reduce_size();
+    const size_t team_shmem_size  = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
+
+    OpenMPexec::resize_scratch( Reduce::value_size( functor ) , team_reduce_size + team_shmem_size );
+
+#pragma omp parallel
+    {
+      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+
+      typename Reduce::reference_type update = Reduce::init( functor , exec.scratch_reduce() );
+
+      for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
+        functor( member , update );
+      }
+    }
+/* END #pragma omp parallel */
+
+    const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
+
+    for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
+      Reduce::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+    }
+
+    Reduce::final( functor , ptr );
+
+    return ptr ;
+  }
+
+public:
+
+  /** \brief  Reduce without copying the result anywhere. */
+  inline
+  ParallelReduce( const FunctorType  & functor ,
+                  const Policy       & policy )
+  {
+    (void) execute( functor , policy );
+  }
+
+  /** \brief  Reduce and copy the final value into 'result'.
+   *
+   *  The copy is skipped when result.ptr_on_device() is null, matching
+   *  the RangePolicy specialization of ParallelReduce.
+   */
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType  & functor ,
+                  const Policy       & policy ,
+                  const ViewType     & result )
+  {
+    const pointer_type ptr = execute( functor , policy );
+
+    if ( result.ptr_on_device() ) {
+      const int n = Reduce::value_count( functor );
+
+      for ( int j = 0 ; j < n ; ++j ) { result.ptr_on_device()[j] = ptr[j] ; }
+    }
+  }
+
+  void wait() {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_OPENMP_PARALLEL_HPP */
+
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
@ -0,0 +1,352 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+#include <limits>
+#include <iostream>
+#include <Kokkos_OpenMP.hpp>
+#include <Kokkos_hwloc.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <iostream>
+
+#ifdef KOKKOS_HAVE_OPENMP
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+// Forward declaration so the function can be named in the initializer of kokkos_omp_in_critical_region below.
+KOKKOS_INLINE_FUNCTION
+int kokkos_omp_in_parallel();
+// Starts at 0; the comma expression registers kokkos_omp_in_parallel with HostSpace as a side effect of static initialization.
+int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
+// Nonzero inside an OpenMP parallel region unless kokkos_omp_in_critical_region is set; always 0 when __CUDA_ARCH__ is defined.
+KOKKOS_INLINE_FUNCTION
+int kokkos_omp_in_parallel()
+{
+#ifndef __CUDA_ARCH__
+  return omp_in_parallel() && ! kokkos_omp_in_critical_region ;
+#else
+  return 0;
+#endif
+}
+// Set by OpenMP::initialize when hwloc binding is selected; read by OpenMP::finalize and OpenMP::print_configuration.
+bool s_using_hwloc = false;
+
+} // namespace
+} // namespace Impl
+} // namespace Kokkos
+
+
+namespace Kokkos {
+namespace Impl {
+
+int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 }; // indexed by omp_get_thread_num(); value is an index into m_pool
+// Entries [0],[1],[2] are assigned in OpenMP::initialize and cleared in OpenMP::finalize.
+int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
+// All null until resize_scratch() allocates; m_pool[0] == 0 is the "not initialized" test used throughout this file.
+OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
+
+/** \brief  Require that the caller is not inside an OpenMP parallel region.
+ *
+ *  When omp_in_parallel() is nonzero, the message
+ *  "<label> ERROR: in parallel" is passed to throw_runtime_exception.
+ */
+void OpenMPexec::verify_is_process( const char * const label )
+{
+  if ( ! omp_in_parallel() ) { return ; }
+
+  const std::string error_text = std::string( label ) + " ERROR: in parallel" ;
+
+  Kokkos::Impl::throw_runtime_exception( error_text );
+}
+
+/** \brief  Require that the thread pool has been allocated.
+ *
+ *  When m_pool[0] is null, the message
+ *  "<label> ERROR: not initialized" is passed to throw_runtime_exception.
+ */
+void OpenMPexec::verify_initialized( const char * const label )
+{
+  if ( m_pool[0] ) { return ; }
+
+  const std::string error_text = std::string( label ) + " ERROR: not initialized" ;
+
+  Kokkos::Impl::throw_runtime_exception( error_text );
+}
+
+/** \brief  Destroy and release every thread's OpenMPexec allocation.
+ *
+ *  Each OpenMP thread looks up its own pool slot through m_map_rank and,
+ *  one thread at a time (omp critical), runs the destructor, hands the
+ *  memory back through HostSpace::decrement and nulls the slot.
+ *
+ *  kokkos_omp_in_critical_region is raised around the work so that
+ *  kokkos_omp_in_parallel() reports 0 while the HostSpace call is made
+ *  from inside the parallel region.
+ *
+ *  Slots that are still null are skipped: resize_scratch() calls this
+ *  before the very first allocation, when no OpenMPexec object exists,
+ *  and invoking the destructor through a null pointer is undefined.
+ */
+void OpenMPexec::clear_scratch()
+{
+#pragma omp parallel
+  {
+    const int rank_rev = m_map_rank[ omp_get_thread_num() ];
+
+#pragma omp critical
+    {
+      kokkos_omp_in_critical_region = 1 ;
+
+      if ( m_pool[ rank_rev ] ) {
+        m_pool[ rank_rev ]->~OpenMPexec();
+        HostSpace::decrement( m_pool[ rank_rev ] );
+        m_pool[ rank_rev ] = 0 ;
+      }
+
+      kokkos_omp_in_critical_region = 0 ;
+    }
+/* END #pragma omp critical */
+  }
+/* END #pragma omp parallel */
+}
+
+/** \brief  Ensure each thread's allocation holds at least 'reduce_size'
+ *          of reduce scratch and 'thread_size' of thread scratch.
+ *
+ *  Both requests are rounded up to a multiple of MEMORY_ALIGNMENT.
+ *  Nothing happens when the current allocation is already large enough
+ *  in both dimensions.  Otherwise every thread's allocation is released
+ *  and re-created; a dimension never shrinks, so the new sizes are the
+ *  maximum of the old and the requested sizes.
+ *
+ *  Layout of one allocation (offsets from the OpenMPexec object):
+ *    [ 0 , ALLOC_EXEC )                      the OpenMPexec object itself
+ *    [ ALLOC_EXEC , +reduce_size )           reduce scratch
+ *    [ ALLOC_EXEC + reduce_size , +thread_size )  thread scratch
+ *
+ *  NOTE(review): the OpenMPexec constructor takes 'int' sizes, so the
+ *  size_t values are narrowed when passed on.
+ */
+void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
+{
+  enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
+  enum { ALLOC_EXEC = ( sizeof(OpenMPexec) + ALIGN_MASK ) & ~ALIGN_MASK };
+
+  // m_scratch_*_end are cumulative end offsets, so the capacity of each
+  // region is the difference between its end and the preceding end.
+  // Using m_scratch_reduce_end directly would overstate the reduce
+  // capacity by the size of the exec header and skip needed growth.
+  const size_t old_reduce_size = m_pool[0] ? m_pool[0]->m_scratch_reduce_end - m_pool[0]->m_scratch_exec_end : 0 ;
+  const size_t old_thread_size = m_pool[0] ? m_pool[0]->m_scratch_thread_end - m_pool[0]->m_scratch_reduce_end : 0 ;
+
+  reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+  thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+
+  // Requesting allocation and old allocation is too small:
+
+  const bool allocate = ( old_reduce_size < reduce_size ) ||
+                        ( old_thread_size < thread_size );
+
+  if ( allocate ) {
+    // Keep whichever dimension was already larger than requested.
+    if ( reduce_size < old_reduce_size ) { reduce_size = old_reduce_size ; }
+    if ( thread_size < old_thread_size ) { thread_size = old_thread_size ; }
+  }
+
+  const size_t alloc_size = allocate ? ALLOC_EXEC + reduce_size + thread_size : 0 ;
+  const int    pool_size  = m_pool_topo[0] ;
+
+  if ( allocate ) {
+
+    clear_scratch();
+
+#pragma omp parallel
+    {
+      const int rank_rev = m_map_rank[ omp_get_thread_num() ];
+      const int rank     = pool_size - ( rank_rev + 1 );
+
+      // One thread at a time; the flag makes kokkos_omp_in_parallel()
+      // report 0 while HostSpace is called from inside the region.
+#pragma omp critical
+      {
+        kokkos_omp_in_critical_region = 1 ;
+
+        m_pool[ rank_rev ] =
+          (OpenMPexec *) HostSpace::allocate( "openmp_scratch" , typeid(unsigned char) , 1 , alloc_size );
+        new( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
+
+        kokkos_omp_in_critical_region = 0 ;
+      }
+/* END #pragma omp critical */
+    }
+/* END #pragma omp parallel */
+  }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+/** \brief  Return 1 once the thread pool's first slot has been allocated, 0 otherwise. */
+int OpenMP::is_initialized()
+{
+  return Impl::OpenMPexec::m_pool[0] ? 1 : 0 ;
+}
+
+/** \brief  Create the OpenMP thread pool and its per-thread scratch.
+ *
+ *  \param thread_count        number of threads; 0 selects omp_get_max_threads()
+ *  \param use_numa_count      passed on to hwloc::thread_mapping when hwloc binding is used
+ *  \param use_cores_per_numa  passed on to hwloc::thread_mapping when hwloc binding is used
+ *
+ *  hwloc binding is used when hwloc is available and reports more than
+ *  one NUMA region or more than one thread per core.
+ *
+ *  Reports through throw_runtime_exception when
+ *    - the pool is already initialized,
+ *    - thread_count exceeds OpenMPexec::MAX_THREAD_COUNT (the size of
+ *      the coordinate array below and of the static rank map),
+ *    - OpenMP does not provide exactly thread_count threads,
+ *    - hwloc fails to bind one of the threads.
+ */
+void OpenMP::initialize( unsigned thread_count ,
+                         unsigned use_numa_count ,
+                         unsigned use_cores_per_numa )
+{
+  const unsigned max_thread_count = Impl::OpenMPexec::MAX_THREAD_COUNT ;
+  const char * const too_many_msg =
+    "Kokkos::OpenMP::initialize ERROR : thread count exceeds OpenMPexec::MAX_THREAD_COUNT" ;
+
+  if(thread_count==0) thread_count = omp_get_max_threads();
+  const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
+
+  bool thread_spawn_failed = false ;
+  bool thread_bind_failed  = false ;
+
+  if ( ! is_initialized ) {
+
+    // Reject counts that would overrun threads_coord[] and m_map_rank[].
+    if ( max_thread_count < thread_count ) {
+      Kokkos::Impl::throw_runtime_exception( std::string( too_many_msg ) );
+    }
+
+    // Use hwloc thread pinning if concerned with locality.
+    // If spreading threads across multiple NUMA regions.
+    // If hyperthreading is enabled.
+    Impl::s_using_hwloc = hwloc::available() && (
+                            ( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
+                            ( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );
+
+    std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ];
+
+    if(Impl::s_using_hwloc) {
+      hwloc::thread_mapping( "Kokkos::OpenMP::initialize" ,
+                           false /* do not allow asynchronous */ ,
+                           thread_count ,
+                           use_numa_count ,
+                           use_cores_per_numa ,
+                           threads_coord );
+
+      // thread_count is passed by reference; re-check in case it was changed.
+      if ( max_thread_count < thread_count ) {
+        Kokkos::Impl::throw_runtime_exception( std::string( too_many_msg ) );
+      }
+    }
+
+    // Spawn threads:
+
+    omp_set_num_threads( thread_count );
+
+    // Verify OMP interaction:
+    if ( int(thread_count) != omp_get_max_threads() ) {
+      thread_spawn_failed = true ;
+    }
+
+    // Verify spawning and bind threads:
+#pragma omp parallel
+    {
+#pragma omp critical
+      {
+        if ( int(thread_count) != omp_get_num_threads() ) {
+          thread_spawn_failed = true ;
+        }
+
+        // Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
+        // Call to 'new' may not be thread safe as well.
+
+        // Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
+
+        const unsigned omp_rank = omp_get_thread_num();
+        unsigned       thread_r = omp_rank ;
+
+        if ( Impl::s_using_hwloc ) {
+          thread_r = Kokkos::hwloc::bind_this_thread( thread_count , threads_coord );
+
+          // bind_this_thread returns ~0 on failure; a valid result is an
+          // index into the thread_count coordinates.
+          if ( thread_count <= thread_r ) { thread_bind_failed = true ; }
+        }
+
+        // Only possible to be out of range if more threads than requested were spawned.
+        if ( omp_rank < max_thread_count ) {
+          Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ;
+        }
+      }
+/* END #pragma omp critical */
+    }
+/* END #pragma omp parallel */
+
+    if ( ! thread_spawn_failed && ! thread_bind_failed ) {
+      Impl::OpenMPexec::m_pool_topo[0] = thread_count ;
+      Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
+      Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
+
+      // The first allocation also marks the pool as initialized (m_pool[0] != 0).
+      Impl::OpenMPexec::resize_scratch( 1024 , 1024 );
+    }
+  }
+
+  if ( is_initialized || thread_spawn_failed || thread_bind_failed ) {
+    std::string msg("Kokkos::OpenMP::initialize ERROR");
+
+    if ( is_initialized ) { msg.append(" : already initialized"); }
+    if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); }
+    if ( thread_bind_failed ) { msg.append(" : failed binding threads"); }
+
+    Kokkos::Impl::throw_runtime_exception(msg);
+  }
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Tear down the OpenMP thread pool.
+ *
+ *  Requires an initialized pool and a call from outside any parallel
+ *  region.  Releases every thread's scratch allocation, zeroes the pool
+ *  topology, and, when hwloc binding was used, unbinds the calling thread.
+ */
+void OpenMP::finalize()
+{
+  Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
+  Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
+
+  Impl::OpenMPexec::clear_scratch();
+
+  Impl::OpenMPexec::m_pool_topo[0] = 0 ;
+  Impl::OpenMPexec::m_pool_topo[1] = 0 ;
+  Impl::OpenMPexec::m_pool_topo[2] = 0 ;
+
+  // The OpenMP API requires a positive argument; passing 0 leaves the
+  // behavior implementation defined, so fall back to a single thread.
+  omp_set_num_threads(1);
+
+  if ( Impl::s_using_hwloc ) {
+    hwloc::unbind_this_thread();
+  }
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Write a one-line summary of the OpenMP back-end to 's'.
+ *
+ *  Prints the build flags, the hwloc topology when built with hwloc,
+ *  and either the thread pool topology or " not initialized".
+ *  With 'detail' set and the pool initialized, one additional line per
+ *  thread lists its OpenMP rank, Kokkos rank and hwloc coordinate.
+ */
+void OpenMP::print_configuration( std::ostream & s , const bool detail )
+{
+  Impl::OpenMPexec::verify_is_process( "OpenMP::print_configuration" );
+
+  s << "Kokkos::OpenMP" ;
+
+#if defined( KOKKOS_HAVE_OPENMP )
+  s << " KOKKOS_HAVE_OPENMP" ;
+#endif
+#if defined( KOKKOS_HAVE_HWLOC )
+  {
+    const unsigned avail_numa      = Kokkos::hwloc::get_available_numa_count();
+    const unsigned avail_cores     = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned avail_threads   = Kokkos::hwloc::get_available_threads_per_core();
+    const char * const bind_status = Impl::s_using_hwloc ? "enabled" : "disabled" ;
+
+    s << " hwloc[" << avail_numa << "x" << avail_cores << "x" << avail_threads << "]"
+      << " hwloc_binding_" << bind_status ;
+  }
+#endif
+
+  // Nothing more to report without a thread pool.
+  if ( 0 == Impl::OpenMPexec::m_pool[0] ) {
+    s << " not initialized" << std::endl ;
+    return ;
+  }
+
+  const int * const topo = Kokkos::Impl::OpenMPexec::m_pool_topo ;
+
+  const int pool_numa      = topo[0] / topo[1] ;
+  const int pool_cores     = topo[1] / topo[2] ;
+  const int pool_threads   = topo[2] ;
+
+  s << " thread_pool_topology[ " << pool_numa
+    << " x " << pool_cores
+    << " x " << pool_threads
+    << " ]"
+    << std::endl ;
+
+  if ( ! detail ) { return ; }
+
+  // One coordinate per pool thread, filled in by the thread itself.
+  std::vector< std::pair<unsigned,unsigned> > thread_coord( topo[0] );
+
+#pragma omp parallel
+  {
+#pragma omp critical
+    {
+      thread_coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
+    }
+/* END #pragma omp critical */
+  }
+/* END #pragma omp parallel */
+
+  for ( unsigned omp_rank = 0 ; omp_rank < thread_coord.size() ; ++omp_rank ) {
+    const std::pair<unsigned,unsigned> & c = thread_coord[ omp_rank ];
+
+    s << "  thread omp_rank[" << omp_rank << "]"
+      << " kokkos_rank[" << Impl::OpenMPexec::m_map_rank[ omp_rank ] << "]"
+      << " hwloc_coord[" << c.first << "." << c.second << "]"
+      << std::endl ;
+  }
+}
+
+} // namespace Kokkos
+
+#endif //KOKKOS_HAVE_OPENMP
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
@ -0,0 +1,426 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPEXEC_HPP
+#define KOKKOS_OPENMPEXEC_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_spinwait.hpp>
+
+#include <Kokkos_Atomic.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  Data for OpenMP thread execution */
+
+class OpenMPexec {
+public:
+  // Capacity of the static per-thread tables m_map_rank[] and m_pool[].
+  enum { MAX_THREAD_COUNT = 4096 };
+
+private:
+
+  static int          m_pool_topo[ 4 ];               // returned by pool_size(depth); see the topology note below
+  static int          m_map_rank[ MAX_THREAD_COUNT ]; // indexed by omp_get_thread_num(); value indexes m_pool
+  static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
+
+  friend class Kokkos::OpenMP ;
+  // The three m_scratch_*_end values are cumulative offsets from 'this'; they are declared in the order the constructor's initializers depend on.
+  int const  m_pool_rank ;
+  int const  m_pool_rank_rev ;      // pool_size() - ( m_pool_rank + 1 )
+  int const  m_scratch_exec_end ;   // start of reduce scratch
+  int const  m_scratch_reduce_end ; // start of thread scratch
+  int const  m_scratch_thread_end ; // end of thread scratch
+
+  int volatile  m_barrier_state ; // written by state_set(), polled through state_wait()
+  // Declared private with no definition visible here -- presumably to forbid default construction and copying.
+  OpenMPexec();
+  OpenMPexec( const OpenMPexec & );
+  OpenMPexec & operator = ( const OpenMPexec & );
+
+  static void clear_scratch();
+
+public:
+
+  // Topology of a cache coherent thread pool:
+  //   TOTAL = NUMA x GRAIN
+  //   pool_size( depth = 0 )
+  //   pool_size(0) = total number of threads
+  //   pool_size(1) = number of threads per NUMA
+  //   pool_size(2) = number of threads sharing finest grain memory hierarchy
+  // 'depth' is used as an index into m_pool_topo without a range check.
+  inline static
+  int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
+  // Look up a pool entry by reversed rank; no range check is performed.
+  inline static
+  OpenMPexec * pool_rev( int pool_rank_rev ) { return m_pool[ pool_rank_rev ]; }
+
+  inline int pool_rank() const { return m_pool_rank ; }
+  inline int pool_rank_rev() const { return m_pool_rank_rev ; }
+  // Scratch regions are addressed as byte offsets from this object's own address.
+  inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
+  inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
+  // Forwards to Impl::spinwait on m_barrier_state -- presumably spins while the state equals 'state'; confirm in Kokkos_spinwait.hpp.
+  inline
+  void state_wait( int state )
+    { Impl::spinwait( m_barrier_state , state ); }
+
+  inline
+  void state_set( int state ) { m_barrier_state = state ; }
+
+  ~OpenMPexec() {}
+  // Offsets accumulate: exec_end = exec size, reduce_end = exec_end + reduce size, thread_end = reduce_end + thread size.
+  OpenMPexec( const int poolRank 
+            , const int scratch_exec_size
+            , const int scratch_reduce_size
+            , const int scratch_thread_size )
+    : m_pool_rank( poolRank )
+    , m_pool_rank_rev( pool_size() - ( poolRank + 1 ) )
+    , m_scratch_exec_end( scratch_exec_size )
+    , m_scratch_reduce_end( m_scratch_exec_end   + scratch_reduce_size )
+    , m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size )
+    , m_barrier_state(0)
+    {}
+  // NOTE(review): no definition of finalize() or of this four-argument initialize() is visible in this part of the patch -- confirm they are still used.
+  static void finalize();
+
+  static void initialize( const unsigned  team_count ,
+                          const unsigned threads_per_team ,
+                          const unsigned numa_count ,
+                          const unsigned cores_per_numa );
+
+  static void verify_is_process( const char * const );
+  static void verify_initialized( const char * const );
+
+  static void resize_scratch( size_t reduce_size , size_t thread_size );
+  // NOTE(review): uses omp_get_thread_num() but <omp.h> is not among this header's visible includes -- confirm it is always included first.
+  inline static
+  OpenMPexec * get_thread_omp() { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+class OpenMPexecTeamMember {
+private:
+
+  enum { TEAM_REDUCE_SIZE = 16 };
+
+  /** \brief  Thread states for team synchronization */
+  enum { Active = 0 , Rendezvous = 1 };
+
+  typedef Kokkos::OpenMP                         execution_space ;
+  typedef execution_space::scratch_memory_space  scratch_memory_space ;
+
+  Impl::OpenMPexec    & m_exec ;
+  scratch_memory_space  m_team_shared ;
+  int                   m_team_shmem ;
+  int                   m_team_base_rev ;
+  int                   m_team_rank_rev ;
+  int                   m_team_rank ;
+  int                   m_team_size ;
+  int                   m_league_rank ;
+  int                   m_league_end ;
+  int                   m_league_size ;
+
+  // Fan-in team threads, root of the fan-in which does not block returns true
+  inline
+  bool team_fan_in() const
+    {
+      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
+        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
+      }
+
+      if ( m_team_rank_rev ) {
+        m_exec.state_set( Rendezvous );
+        m_exec.state_wait( Rendezvous );
+      }
+
+      return 0 == m_team_rank_rev ;
+    }
+
+  inline
+  void team_fan_out() const
+    {
+      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
+        m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
+      }
+    }
+
+public:
+
+  inline
+  const execution_space::scratch_memory_space & team_shmem() const
+    { return m_team_shared ; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
+
+  inline void team_barrier() const
+    {
+      if ( 1 < m_team_size ) {
+        team_fan_in();
+        team_fan_out();
+      }
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   *
+   *  \param value         This thread's contribution to the scan.
+   *  \param global_accum  When non-null, the team's total is atomically added
+   *                       to it and the value it held beforehand becomes the
+   *                       base of this team's scan; when null the base is zero.
+   *  \return The exclusive prefix for the calling thread, read back from
+   *          this thread's scratch after the fan-out.
+   */
+  template< typename ArgType >
+  inline ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
+    {
+      // Make sure there is enough scratch space:
+      // 'type' is selected through if_c on the size test; presumably it becomes
+      // 'void' when ArgType does not fit, making the code below fail to compile.
+      typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
+
+      // Each thread publishes its value at the start of its own scratch.
+      volatile type * const work_value  = ((type*) m_exec.scratch_thread());
+
+      *work_value = value ;
+
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // Only the fan-in root ( reverse rank zero ) gets 'true' and enters here;
+        // the other threads stay in team_fan_in() until they are released.
+        // pool_rev( m_team_base_rev + 0 )               == highest ranking team member
+        // pool_rev( m_team_base_rev + m_team_size - 1 ) == lowest ranking team member
+        //
+        // 1) optionally total the team's values and fetch-add that total to
+        //    *global_accum, using the previous global value as the scan base
+        // 2) walk from lowest to highest rank, replacing each value with the
+        //    running sum of the values that precede it
+
+        type accum = 0 ;
+
+        if ( global_accum ) {
+          for ( int i = m_team_size ; i-- ; ) {
+            type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
+            accum += val ;
+          }
+          // atomic_fetch_add returns the value held before the addition.
+          accum = atomic_fetch_add( global_accum , accum );
+        }
+
+        for ( int i = m_team_size ; i-- ; ) {
+          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
+          const type offset = accum ;  
+          accum += val ;
+          val = offset ;
+        }
+
+        memory_fence();
+      }
+
+      // Every thread, root included, releases its own fan-in children.
+      team_fan_out();
+
+      return *work_value ;
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  Forwards to the two-argument team_scan with no global accumulator.
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename Type >
+  inline
+  Type team_scan( const Type & value ) const
+    {
+      return this-> template team_scan<Type>( value , 0 );
+    }
+
+  //----------------------------------------
+  // Private for the driver
+
+private:
+
+  typedef execution_space::scratch_memory_space space ;
+
+public:
+
+  /** \brief  Bind the calling pool thread to a team and to the first
+   *          league rank that team processes.
+   *
+   *  Threads whose slot within the team allocation lies beyond the team size,
+   *  or whose team has no league ranks left, keep the zero defaults so that
+   *  valid() is false for them.
+   *
+   *  \param exec        Executor of the calling pool thread.
+   *  \param team        Policy supplying team_size, league_size, team_alloc, team_iter.
+   *  \param shmem_size  Size given to the team scratch memory space.
+   */
+  template< class WorkArgTag >
+  inline
+  OpenMPexecTeamMember( Impl::OpenMPexec & exec
+                      , const TeamPolicy< execution_space , WorkArgTag > & team
+                      , const int shmem_size
+                      )
+    : m_exec( exec )
+    , m_team_shared(0,0)
+    , m_team_shmem( shmem_size )
+    , m_team_base_rev(0)
+    , m_team_rank_rev(0)
+    , m_team_rank(0)
+    , m_team_size( team.team_size() )
+    , m_league_rank(0)
+    , m_league_end(0)
+    , m_league_size( team.league_size() )
+    {
+      const int rank_rev   = m_exec.pool_rank_rev();
+      const int alloc      = team.team_alloc();
+      const int iter       = team.team_iter();
+
+      const int slot_rev   = rank_rev % alloc ;  // position inside the team allocation
+      const int league_rev = rank_rev / alloc ;  // which team allocation, counted in reverse
+      const int iter_end   = team.league_size() - league_rev * iter ;
+
+      const bool participates = ( slot_rev < m_team_size ) && ( 0 < iter_end );
+
+      if ( ! participates ) { return ; }
+
+      m_team_base_rev = alloc * league_rev ;
+      m_team_rank_rev = slot_rev ;
+      m_team_rank     = m_team_size - ( m_team_rank_rev + 1 );
+      m_league_end    = iter_end ;
+
+      if ( iter_end > iter ) {
+        m_league_rank = iter_end - iter ;
+      }
+      else {
+        m_league_rank = 0 ;
+      }
+
+      // Team scratch lives past the reduction area of the team's base thread.
+      char * const team_scratch =
+        ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE ;
+
+      new( (void*) &m_team_shared ) space( team_scratch , m_team_shmem );
+    }
+
+  /** \brief  True while this member still has a league rank to process. */
+  bool valid() const
+    {
+      return m_league_rank < m_league_end ;
+    }
+
+  /** \brief  Advance to the next league rank.
+   *
+   *  When another league rank remains the team synchronizes and the
+   *  team scratch memory space is constructed anew.
+   */
+  void next()
+    {
+      ++m_league_rank ;
+
+      if ( ! ( m_league_rank < m_league_end ) ) { return ; }
+
+      team_barrier();
+
+      char * const team_scratch =
+        ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE ;
+
+      new( (void*) &m_team_shared ) space( team_scratch , m_team_shmem );
+    }
+
+  /** \brief  Bytes of per-thread scratch that precede the team scratch ( TEAM_REDUCE_SIZE ). */
+  static inline
+  int team_reduce_size()
+    { return TEAM_REDUCE_SIZE ; }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+namespace Kokkos {
+
+/** \brief  Team execution policy for the OpenMP execution space.
+ *
+ *  Records the league size as requested and derives, from the thread pool
+ *  sizes reported by the execution space, the team size, the number of
+ *  pool threads allocated per team, and the maximum number of league
+ *  iterations each team performs.
+ */
+template < class WorkArgTag >
+class TeamPolicy< Kokkos::OpenMP , WorkArgTag > {
+public:
+
+  typedef Impl::ExecutionPolicyTag   kokkos_tag ;      ///< Concept tag
+  typedef Kokkos::OpenMP             execution_space ; ///< Execution space
+
+private:
+
+  int m_league_size ;  ///< League size as requested
+  int m_team_size ;    ///< Team size after clamping to [ 1 , team_max ]
+  int m_team_alloc ;   ///< Pool threads allocated per team
+  int m_team_iter ;    ///< Maximum league iterations per team
+
+  /** \brief  Derive the policy's sizes from the request and the thread pool.
+   *
+   *  A team size request above the pool's maximum is reduced to that
+   *  maximum; a request below one is raised to one.  Without the lower
+   *  clamp a request of zero rounds to a zero-sized grain and the team
+   *  count computation divides by zero.
+   */
+  inline void init( const int league_size_request
+                  , const int team_size_request )
+    {
+      const int pool_size        = execution_space::thread_pool_size(0);
+      const int team_max         = execution_space::thread_pool_size(1);
+      const int team_grain_query = execution_space::thread_pool_size(2);
+
+      // A non-positive grain would divide by zero below.
+      const int team_grain = 0 < team_grain_query ? team_grain_query : 1 ;
+
+      m_league_size = league_size_request ;
+
+      const int team_size_capped = team_size_request < team_max ?
+                                   team_size_request : team_max ;
+
+      m_team_size = 0 < team_size_capped ? team_size_capped : 1 ;
+
+      // Round team size up to a multiple of 'team_grain'
+      const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
+      const int team_count_fit  = pool_size / team_size_grain ;
+
+      // At least one team, even when a full grain does not fit in the pool.
+      const int team_count      = 0 < team_count_fit ? team_count_fit : 1 ;
+
+      // Constraint : pool_size = m_team_alloc * team_count
+      m_team_alloc = pool_size / team_count ;
+
+      // Maximum number of iterations each team will take:
+      m_team_iter  = ( m_league_size + team_count - 1 ) / team_count ;
+    }
+
+public:
+
+  inline int team_size()   const { return m_team_size ; }
+  inline int league_size() const { return m_league_size ; }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicy( execution_space & , int league_size_request , int team_size_request )
+    { init( league_size_request , team_size_request ); }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicy( int league_size_request , int team_size_request )
+    { init( league_size_request , team_size_request ); }
+
+  /** \brief  Largest team size the execution space reports; the functor is not consulted. */
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+    { return execution_space::thread_pool_size(1); }
+
+  inline int team_alloc() const { return m_team_alloc ; }
+  inline int team_iter()  const { return m_team_iter ; }
+
+  typedef Impl::OpenMPexecTeamMember member_type ;
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Thread pool size at the given depth, as reported by Impl::OpenMPexec::pool_size. */
+inline
+int OpenMP::thread_pool_size( int depth )
+{
+  const int size_at_depth = Impl::OpenMPexec::pool_size(depth);
+
+  return size_at_depth ;
+}
+
+/** \brief  Pool rank of the calling OpenMP thread.
+ *
+ *  Returns -1 when not compiled for the host execution/memory space.
+ */
+KOKKOS_INLINE_FUNCTION
+int OpenMP::thread_pool_rank()
+{
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  return -1 ;
+#else
+  // Translate the OpenMP thread number through the executor's rank map.
+  return Impl::OpenMPexec::m_map_rank[ omp_get_thread_num() ];
+#endif
+}
+
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
+
--- a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
@ -0,0 +1,397 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+
+#if defined( KOKKOS_HAVE_QTHREAD )
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <utility>
+#include <Kokkos_Qthread.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+#define QTHREAD_LOCAL_PRIORITY
+
+#include <qthread/qthread.h>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+// Capacity of the s_exec table below.
+enum { MAXIMUM_QTHREAD_WORKERS = 1024 };
+
+/** s_exec is indexed by the reverse rank of the workers
+ *  for faster fan-in / fan-out lookups
+ *  [ n - 1 , n - 2 , ... , 0 ]
+ */
+QthreadExec * s_exec[ MAXIMUM_QTHREAD_WORKERS ];
+
+// Worker topology; assigned in Qthread::initialize and zeroed in Qthread::finalize.
+// s_number_workers = s_number_shepherds * s_number_workers_per_shepherd
+int  s_number_shepherds            = 0 ;
+int  s_number_workers_per_shepherd = 0 ;
+int  s_number_workers              = 0 ;
+
+/** Slot in s_exec belonging to the calling worker.
+ *  The table is in reverse rank order, so rank 'r' maps to index 'n - ( r + 1 )'.
+ */
+inline
+QthreadExec ** worker_exec()
+{
+  const int worker_rank = qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL);
+
+  return s_exec + s_number_workers - ( worker_rank + 1 );
+}
+
+// Aligned size of a QthreadExec object; a worker's scratch memory starts
+// this many bytes past the start of its QthreadExec allocation.
+const int s_base_size = QthreadExec::align_alloc( sizeof(QthreadExec) );
+
+// Offsets into each worker's scratch memory; updated by resize_worker_scratch.
+int s_worker_reduce_end   = 0 ; /* End of worker reduction memory    */
+int s_worker_shared_end   = 0 ; /* Total of worker scratch memory    */
+int s_worker_shared_begin = 0 ; /* Beginning of worker shared memory */
+
+// Function and argument being dispatched by exec_all; both zero outside of a dispatch.
+QthreadExecFunctionPointer s_active_function = 0 ;
+const void               * s_active_function_arg = 0 ;
+
+} /* namespace */
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** Initialize the qthread runtime with 'thread_count' workers and allocate
+ *  the initial per-worker scratch memory.
+ *
+ *  Throws a runtime exception when the requested count exceeds the capacity
+ *  of the worker table, when the environment cannot be set, when the runtime
+ *  does not come up with exactly 'thread_count' workers, or when the
+ *  shepherds do not all have the same number of workers.
+ */
+void Qthread::initialize( int thread_count )
+{
+  // Environment variable: QTHREAD_NUM_SHEPHERDS
+  // Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
+  // Environment variable: QTHREAD_HWPAR
+
+  // The worker table has a fixed capacity; a larger count would index past its end.
+  if ( int(Impl::MAXIMUM_QTHREAD_WORKERS) < thread_count ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
+    msg << " : exceeds maximum number of workers = " << int(Impl::MAXIMUM_QTHREAD_WORKERS) ;
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  {
+    // setenv copies its arguments.  putenv would instead keep a pointer to
+    // this local buffer in the environment after the buffer has gone out of scope.
+    char value[64];
+    snprintf(value,sizeof(value),"%d",thread_count);
+
+    if ( 0 != setenv("QTHREAD_HWPAR",value,1) ) {
+      std::ostringstream msg ;
+      msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
+      msg << " : could not set environment variable QTHREAD_HWPAR" ;
+      Kokkos::Impl::throw_runtime_exception( msg.str() );
+    }
+  }
+
+  const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
+                       ( thread_count    == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) &&
+                       ( thread_count    == qthread_num_workers() );
+
+  bool ok_symmetry = true ;
+
+  if ( ok_init ) {
+    Impl::s_number_shepherds            = qthread_num_shepherds();
+    Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
+    Impl::s_number_workers              = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ;
+
+    // Every shepherd must have the same number of workers.
+    for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) {
+      ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) );
+    }
+  }
+
+  if ( ! ok_init || ! ok_symmetry ) {
+    std::ostringstream msg ;
+
+    msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
+    msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
+    msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD);
+    msg << " : qthread_num_workers = " << qthread_num_workers();
+
+    if ( ! ok_symmetry ) {
+      msg << " : qthread_num_workers_local = {" ;
+      for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) {
+        msg << " " << qthread_num_workers_local(i) ;
+      }
+      msg << " }" ;
+    }
+
+    // Leave the counters zeroed so that finalize() does not finalize twice.
+    Impl::s_number_workers   = 0 ;
+    Impl::s_number_shepherds = 0 ;
+    Impl::s_number_workers_per_shepherd = 0 ;
+
+    if ( ok_init ) { qthread_finalize(); }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  Impl::QthreadExec::resize_worker_scratch( 256 , 256 );
+}
+
+/** Release the per-worker memory, finalize the qthread runtime if it was
+ *  initialized, and return all file-scope bookkeeping to its initial state.
+ */
+void Qthread::finalize()
+{
+  Impl::QthreadExec::clear_workers();
+
+  if ( Impl::s_number_workers ) {
+    qthread_finalize();
+  }
+
+  Impl::s_number_workers    = 0 ;
+  Impl::s_number_shepherds  = 0 ;
+  Impl::s_number_workers_per_shepherd = 0 ;
+
+  // The worker memory is gone, so forget its recorded sizes as well.
+  // Otherwise resize_worker_scratch() in a later initialize() would find the
+  // old sizes sufficient and skip allocating the workers again.
+  Impl::s_worker_reduce_end   = 0 ;
+  Impl::s_worker_shared_end   = 0 ;
+  Impl::s_worker_shared_begin = 0 ;
+}
+
+/** Write the shepherd and worker-per-shepherd counts to 's'.
+ *  The 'detail' flag is accepted but not consulted.
+ */
+void Qthread::print_configuration( std::ostream & s , const bool detail )
+{
+  s << "Kokkos::Qthread {" ;
+  s << " num_shepherds(" << Impl::s_number_shepherds << ")" ;
+  s << " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")" ;
+  s << " }" << std::endl ;
+}
+
+/** Process-wide Qthread instance; the integer argument is ignored. */
+Qthread & Qthread::instance( int )
+{
+  // NOTE(review): the instance is constructed from a dereferenced null
+  // QthreadExec pointer -- confirm that nothing reaches the executor
+  // through this instance.
+  static Impl::QthreadExec * null_exec = 0 ;
+  static Qthread singleton( *null_exec );
+
+  return singleton ;
+}
+
+/** Fence for the Qthread execution space; performs no work. */
+void Qthread::fence()
+{
+  // Intentionally empty.
+}
+
+/** Recommended team size : the number of workers per shepherd. */
+int Qthread::team_recommended()
+{
+  return Impl::s_number_workers_per_shepherd ;
+}
+
+/** Maximum team size : the number of workers per shepherd. */
+int Qthread::team_max()
+{
+  return Impl::s_number_workers_per_shepherd ;
+}
+
+/** Claim 'size' bytes, rounded up to a multiple of sizeof(int), from the
+ *  scratch memory of the first worker in this executor's shepherd table.
+ *
+ *  Advances the executor's shared-memory iterator and throws a runtime
+ *  exception when the advanced iterator passes the end of worker scratch memory.
+ */
+void * Qthread::get_shmem( const int size ) const
+{
+  enum { WORD_MASK = sizeof(int) - 1 }; // Round to word size
+
+  const int padded = ( size + WORD_MASK ) & ~WORD_MASK ;
+  const int begin  = m_exec.m_shared_iter ;
+  const int end    = begin + padded ;
+
+  m_exec.m_shared_iter = end ;
+
+  if ( Impl::s_worker_shared_end < end ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("QthreadExec::get_shmem FAILED : exceeded shared memory size" ) );
+  }
+
+  // Root worker for the shepherd
+  unsigned char * const base = (unsigned char *) (**m_exec.m_shepherd_base).m_scratch_alloc ;
+
+  return base + begin ;
+}
+
+/** Number of shepherds recorded at initialization. */
+int Qthread::shepherd_size() const
+{
+  return Impl::s_number_shepherds ;
+}
+
+/** Number of workers per shepherd recorded at initialization. */
+int Qthread::shepherd_worker_size() const
+{
+  return Impl::s_number_workers_per_shepherd ;
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+/** Task body forked by exec_all : run the active function on the calling
+ *  worker's QthreadExec.  The 'arg' parameter is not used.
+ */
+aligned_t driver_exec_all( void * arg )
+{
+  QthreadExec & self = ** worker_exec();
+
+  (*s_active_function)( self , s_active_function_arg );
+
+  return 0 ;
+}
+
+/** Task body forked by resize_worker_scratch : allocate and construct the
+ *  calling worker's QthreadExec (with its trailing scratch memory) when its
+ *  slot is empty, then wait until this function has been entered
+ *  's_number_workers' times.  The 'arg' parameter is not used.
+ *
+ *  When the allocation fails the slot is left null so that the
+ *  verification in resize_worker_scratch reports the failure.
+ */
+aligned_t driver_resize_worker_scratch( void * arg )
+{
+  static volatile int lock_begin = 0 ;
+  static volatile int lock_end   = 0 ;
+
+  QthreadExec ** const exec = worker_exec();
+
+  //----------------------------------------
+  // Serialize allocation for thread safety
+
+  while ( ! atomic_compare_exchange_strong( & lock_begin , 0 , 1 ) ); // Spin wait to claim lock
+
+  const bool ok = 0 == *exec ;
+
+  if ( ok ) { *exec = (QthreadExec *) malloc( s_base_size + s_worker_shared_end ); }
+
+  lock_begin = 0 ; // release lock
+
+  // Construct only into memory that was actually obtained;
+  // a failed malloc must not reach the placement new.
+  if ( ok && 0 != *exec ) { new( *exec ) QthreadExec(); }
+
+  //----------------------------------------
+  // Wait for all calls to complete to ensure that each worker has executed.
+  // The last caller to arrive resets the counter, which releases the others.
+
+  if ( s_number_workers == 1 + atomic_fetch_add( & lock_end , 1 ) ) { lock_end = 0 ; }
+
+  while ( lock_end );
+
+  //----------------------------------------
+
+  return 0 ;
+}
+
+/** Throw a runtime exception unless called from shepherd zero, local worker zero.
+ *  When 'not_active' is true, also throw if an exec_all dispatch is in progress.
+ *  'label' prefixes the exception message.
+ */
+void verify_is_process( const char * const label , bool not_active = false )
+{
+  const bool on_main_worker = ( 0 == qthread_shep() ) && ( 0 == qthread_worker_local(NULL) );
+  const bool dispatching    = ( 0 != s_active_function ) || ( 0 != s_active_function_arg );
+
+  const bool not_process = ! on_main_worker ;
+  const bool is_active   = not_active && dispatching ;
+
+  if ( ! not_process && ! is_active ) { return ; }
+
+  std::string msg( label );
+  msg.append( " : FAILED" );
+  if ( not_process ) { msg.append(" : not called by main process"); }
+  if ( is_active )   { msg.append(" : parallel execution in progress"); }
+
+  Kokkos::Impl::throw_runtime_exception( msg );
+}
+
+}
+
+/** Record the calling worker's position in the shepherd / worker topology
+ *  and the layout of the scratch memory that follows this object.
+ *  Must run on the worker that the object describes, since the ranks are
+ *  queried from the qthread runtime.
+ */
+QthreadExec::QthreadExec()
+{
+  const int shep_rank  = qthread_shep();
+  const int local_rank = qthread_worker_local(NULL);
+
+  // Reverse rank of this shepherd selects its group of slots in s_exec.
+  const int shep_rank_rev = s_number_shepherds - ( shep_rank + 1 );
+
+  m_shepherd_rank        = shep_rank ;
+  m_shepherd_size        = s_number_shepherds ;
+  m_shepherd_worker_rank = local_rank ;
+  m_shepherd_worker_size = s_number_workers_per_shepherd ;
+  m_worker_rank          = shep_rank * s_number_workers_per_shepherd + local_rank ;
+  m_worker_size          = s_number_workers ;
+  m_worker_state         = QthreadExec::Active ;
+
+  m_worker_base          = s_exec ;
+  m_shepherd_base        = s_exec + s_number_workers_per_shepherd * shep_rank_rev ;
+
+  // Scratch memory begins right after the aligned object.
+  m_scratch_alloc        = ( (unsigned char *) this ) + s_base_size ;
+  m_reduce_end           = s_worker_reduce_end ;
+  m_shared_iter          = s_worker_shared_begin ;
+}
+
+/** Free every worker's allocation and null its slot in s_exec. */
+void QthreadExec::clear_workers()
+{
+  QthreadExec ** const slot_end = s_exec + s_number_workers ;
+
+  for ( QthreadExec ** slot = s_exec ; slot < slot_end ; ++slot ) {
+    free( *slot );
+    *slot = 0 ;
+  }
+}
+
+/** Rewind the shared-memory iterator to the beginning of worker shared memory. */
+void QthreadExec::shared_reset()
+{
+  m_shared_iter = s_worker_shared_begin ;
+}
+
+/** Grow the per-worker scratch memory when the current sizes cannot hold
+ *  'reduce_size' bytes of reduction memory or 'shared_size' bytes of shared memory.
+ *
+ *  All workers are freed and then re-allocated by having each worker run
+ *  driver_resize_worker_scratch.  Throws a runtime exception naming the
+ *  workers whose slot is still empty afterwards.
+ */
+void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
+{
+  const int reduce_alloc = align_alloc( reduce_size );
+  const int scan_alloc   = align_alloc( 8 );
+  const int shared_begin = reduce_alloc + scan_alloc ;
+  const int shared_end   = shared_begin + align_alloc( shared_size );
+
+  const bool large_enough = ! ( s_worker_reduce_end < reduce_alloc ) &&
+                            ! ( s_worker_shared_end < shared_end );
+
+  if ( large_enough ) { return ; }
+
+  // Clear current worker memory before allocating new worker memory
+  clear_workers();
+
+  // Increase the buffers to an aligned allocation
+  s_worker_reduce_end   = reduce_alloc ;
+  s_worker_shared_begin = shared_begin ;
+  s_worker_shared_end   = shared_end ;
+
+  // Need to query which shepherd this main 'process' is running...
+
+  // Have each worker resize its memory for proper first-touch.
+  // One fewer task goes to shepherd zero because this caller runs the driver itself below.
+  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
+
+    const int fork_count = s_number_workers_per_shepherd - ( jshep ? 0 : 1 );
+
+    for ( int i = 0 ; i < fork_count ; ++i ) {
+
+      // Unit tests hang with this call:
+      //
+      // qthread_fork_to_local_priority( driver_resize_worker_scratch , NULL , NULL , jshep );
+      //
+
+      qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
+    }
+  }
+
+  driver_resize_worker_scratch( NULL );
+
+  // Verify all workers allocated
+
+  bool all_allocated = true ;
+
+  for ( int iwork = 0 ; all_allocated && iwork < s_number_workers ; ++iwork ) {
+    all_allocated = ( 0 != s_exec[iwork] );
+  }
+
+  if ( all_allocated ) { return ; }
+
+  std::ostringstream msg ;
+  msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
+  for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
+    // s_exec is in reverse rank order; report the forward rank.
+    if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
+  }
+  msg << " }" ;
+
+  Kokkos::Impl::throw_runtime_exception( msg.str() );
+}
+
+/** Run 'func( worker , arg )' once per worker.
+ *
+ *  Must be called from the main process while no other dispatch is active.
+ *  One task is forked per worker other than the caller, and the caller
+ *  then runs the function itself.
+ */
+void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
+{
+  verify_is_process("QthreadExec::exec_all(...)",true);
+
+  s_active_function     = func ;
+  s_active_function_arg = arg ;
+
+  // Need to query which shepherd this main 'process' is running...
+
+  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
+
+    // Shepherd zero gets one fewer task : the caller covers that worker.
+    const int fork_count = s_number_workers_per_shepherd - ( jshep ? 0 : 1 );
+
+    for ( int i = 0 ; i < fork_count ; ++i ) {
+
+      // Unit tests hang with this call:
+      //
+      // qthread_fork_to_local_priority( driver_exec_all , NULL , NULL , jshep );
+      //
+
+      qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
+    }
+  }
+
+  driver_exec_all( NULL );
+
+  s_active_function     = 0 ;
+  s_active_function_arg = 0 ;
+}
+
+/** Start of the scratch memory of the worker in slot zero of s_exec
+ *  ( the highest ranking worker, since the table is in reverse rank order ).
+ */
+void * QthreadExec::exec_all_reduce_result()
+{
+  QthreadExec * const first_slot = s_exec[0] ;
+
+  return first_slot->m_scratch_alloc ;
+}
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */
+
--- a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
@ -0,0 +1,432 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREADEXEC_HPP
+#define KOKKOS_QTHREADEXEC_HPP
+
+#include <impl/Kokkos_spinwait.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+class QthreadExec ;
+
+typedef void (*QthreadExecFunctionPointer)( QthreadExec & , const void * );
+
+/** Per-worker execution state for the Qthread back end.
+ *
+ *  One object exists per worker.  Objects are reached through tables of
+ *  pointers held in reverse rank order, so index zero of a table is the
+ *  highest ranking worker and the worker with reverse rank zero is the
+ *  root of every fan-in.
+ *
+ *  Synchronization is a fan-in / fan-out over m_worker_state :
+ *  a worker calls spinwait on each of its fan-in children
+ *  ( presumably spinning while the child's state still equals Active ),
+ *  then every non-root worker marks itself Inactive and waits on its own
+ *  state until its fan-in parent sets it back to Active.
+ */
+class QthreadExec {
+private:
+
+  enum { Inactive = 0 , Active = 1 };
+
+  const QthreadExec * const * m_worker_base ;    ///< Table of all workers, reverse rank order
+  const QthreadExec * const * m_shepherd_base ;  ///< Table of this shepherd's workers, reverse rank order
+
+  void  * m_scratch_alloc ;  ///< Scratch memory [ reduce , team , shared ]
+  int     m_reduce_end ;     ///< End of scratch reduction memory
+  int     m_shared_iter ;    ///< Iterator through scratch memory.
+
+  int     m_shepherd_rank ;
+  int     m_shepherd_size ;
+
+  int     m_shepherd_worker_rank ;
+  int     m_shepherd_worker_size ;
+
+  /*
+   *  m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
+   *  m_worker_size = m_shepherd_size * m_shepherd_worker_size
+   */
+  int     m_worker_rank ;
+  int     m_worker_size ;
+
+  // Written by other workers through pointers-to-const, hence mutable.
+  int mutable volatile m_worker_state ;
+
+
+  friend class Kokkos::Qthread ;
+
+  // Not destructible or copyable by users of the class.
+  ~QthreadExec();
+  QthreadExec( const QthreadExec & );
+  QthreadExec & operator = ( const QthreadExec & );
+
+public:
+
+  QthreadExec();
+
+  /** Execute the input function on all available Qthread workers */
+  static void exec_all( Qthread & , QthreadExecFunctionPointer , const void * );
+
+  //----------------------------------------
+  /** Barrier across all workers participating in the 'exec_all' */
+  void exec_all_barrier() const
+    {
+      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+      int n , j ;
+
+      // Fan-in : wait on children at power-of-two strides above the reverse rank.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
+      }
+
+      // Non-root workers park until their fan-in parent reactivates them.
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+
+      // Fan-out : reactivate the same children.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+    }
+
+  /** Barrier across workers within the shepherd with rank < team_size.
+   *  Workers of the shepherd outside the team return immediately.
+   */
+  void shepherd_barrier( const int team_size ) const
+    {
+      if ( m_shepherd_worker_rank < team_size ) {
+
+        const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+        int n , j ;
+
+        for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+          Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
+        }
+
+        if ( rev_rank ) {
+          m_worker_state = QthreadExec::Inactive ;
+          Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+        }
+
+        for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+          m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
+        }
+      }
+    }
+
+  //----------------------------------------
+  /** Reduce across all workers participating in the 'exec_all'.
+   *  Each worker joins the scratch value of every fan-in child into its own
+   *  as soon as that child has arrived.
+   */
+  template< class FunctorType >
+  inline
+  void exec_all_reduce( const FunctorType & func ) const
+    {
+      typedef ReduceAdapter< FunctorType >  Reduce ;
+
+      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+      int n , j ;
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        const QthreadExec & fan = *m_worker_base[j];
+
+        Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
+
+        Reduce::join( func , m_scratch_alloc , fan.m_scratch_alloc );
+      }
+
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+    }
+
+  //----------------------------------------
+  /** Scan across all workers participating in the 'exec_all'.
+   *  On return each worker's scratch value holds the join of the values of
+   *  all lower ranking workers; the lowest ranking worker holds the initial value.
+   */
+  template< class FunctorType >
+  inline
+  void exec_all_scan( const FunctorType & func ) const
+    {
+      typedef ReduceAdapter< FunctorType >  Reduce ;
+
+      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+      int n , j ;
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+      else {
+        // Root thread scans across values before releasing threads
+        // Worker data is in reverse order, so m_worker_base[0] is the
+        // highest ranking thread.
+        //
+        // The bounds below are m_worker_size, not the fan-in stride 'n' :
+        // after the fan-in loop 'n' is the smallest power of two that is
+        // >= m_worker_size, which overshoots the table whenever the worker
+        // count is not a power of two.
+
+        // Copy from lower ranking to higher ranking worker.
+        for ( int i = 1 ; i < m_worker_size ; ++i ) {
+          Reduce::copy( func , m_worker_base[i-1]->m_scratch_alloc
+                             , m_worker_base[i]->m_scratch_alloc );
+        }
+
+        Reduce::init( func , m_worker_base[m_worker_size-1]->m_scratch_alloc );
+
+        // Join from lower ranking to higher ranking worker.
+        // Value at m_worker_base[m_worker_size-1] is the initial value, so skip
+        // adding it to m_worker_base[m_worker_size-2].
+        // The '0 <' test also stops the loop when there is only one worker.
+        for ( int i = m_worker_size - 1 ; 0 < --i ; ) {
+          Reduce::join( func , m_worker_base[i-1]->m_scratch_alloc
+                             , m_worker_base[i]->m_scratch_alloc );
+        }
+      }
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+    }
+
+  //----------------------------------------
+
+  /** Location of this worker's scan value : the scratch memory just past the reduction memory. */
+  template< class Type>
+  inline
+  volatile Type * shepherd_scan_value() const
+    { return (volatile Type*)(((unsigned char *) m_scratch_alloc) + m_reduce_end); }
+
+  /** Exclusive prefix sum of 'value' across the first 'team_size' workers of the shepherd.
+   *
+   *  When 'global_value' is non-null the team's total is atomically added to
+   *  it and the value it held beforehand is the base of the team's scan;
+   *  otherwise the base is zero.
+   *
+   *  \return The exclusive prefix for the calling worker.
+   */
+  template< class Type >
+  inline
+  Type shepherd_scan( const int team_size
+                    , const Type & value
+                    ,       Type * const global_value = 0 ) const
+    {
+      typedef volatile Type * pointer ;
+
+      *shepherd_scan_value<Type>() = value ;
+
+      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+      int n , j ;
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+      else {
+        // Root thread scans across values before releasing threads
+        // Worker data is in reverse order, so m_shepherd_base[0] is the
+        // highest ranking thread.
+        //
+        // The bounds below are team_size, not the fan-in stride 'n' :
+        // after the fan-in loop 'n' is the smallest power of two that is
+        // >= team_size, which would pull in workers outside the team
+        // whenever the team size is not a power of two.
+
+        // Copy from lower ranking to higher ranking worker.
+
+        Type accum = * m_shepherd_base[0]->shepherd_scan_value<Type>();
+        for ( int i = 1 ; i < team_size ; ++i ) {
+          const Type tmp = * m_shepherd_base[i]->shepherd_scan_value<Type>();
+          accum += tmp ;
+          * m_shepherd_base[i-1]->shepherd_scan_value<Type>() = tmp ;
+        }
+
+        // Lowest ranking worker receives the base of the scan.
+        * m_shepherd_base[team_size-1]->shepherd_scan_value<Type>() =
+          global_value ? atomic_fetch_add( global_value , accum ) : 0 ;
+
+        // Join from lower ranking to higher ranking worker.
+        for ( int i = team_size ; 0 < --i ; ) {
+          * m_shepherd_base[i-1]->shepherd_scan_value<Type>() += * m_shepherd_base[i]->shepherd_scan_value<Type>();
+        }
+      }
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+
+      return *shepherd_scan_value<Type>();
+    }
+
+  //----------------------------------------
+
+  /** Round 'size' up to a multiple of the 64 byte allocation grain. */
+  static inline
+  int align_alloc( int size )
+    {
+      enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64bytes */};
+      enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
+      return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK ;
+    }
+
+  void shared_reset();
+
+  /** Start of this worker's scratch memory, where its reduction value lives. */
+  void * exec_all_reduce_value() const { return m_scratch_alloc ; }
+
+  static void * exec_all_reduce_result();
+
+  static void resize_worker_scratch( const int reduce_size , const int shared_size );
+  static void clear_workers();
+
+  //----------------------------------------
+
+  inline int worker_rank() const { return m_worker_rank ; }
+  inline int worker_size() const { return m_worker_size ; }
+  inline int shepherd_worker_rank() const { return m_shepherd_worker_rank ; }
+  inline int shepherd_worker_size() const { return m_shepherd_worker_size ; }
+  inline int shepherd_rank() const { return m_shepherd_rank ; }
+  inline int shepherd_size() const { return m_shepherd_size ; }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Team execution policy for the Qthread execution space.
+ *
+ *  The requested team size is capped at the number of workers per shepherd,
+ *  and the league is divided among shepherds in chunks of
+ *  ceil( league_size / shepherd_size ) teams.
+ */
+template< class WorkArgTag >
+class TeamPolicy< Kokkos::Qthread , WorkArgTag > {
+private:
+  const int m_league_size ;   ///< Number of teams requested
+  const int m_team_size ;     ///< Requested team size capped at shepherd_worker_size()
+  const int m_shepherd_iter ; ///< ceil( league_size / shepherd_size )
+
+public:
+
+  typedef Impl::ExecutionPolicyTag  kokkos_tag ;
+  typedef Qthread                   execution_space ;
+
+
+  // One active team per shepherd
+  TeamPolicy( Kokkos::Qthread & q
+            , const int league_size
+            , const int team_size
+            )
+    : m_league_size( league_size )
+    , m_team_size( team_size < q.shepherd_worker_size()
+                 ? team_size : q.shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
+    {
+    }
+
+  // One active team per shepherd, using the Qthread singleton instance
+  TeamPolicy( const int league_size
+            , const int team_size
+            )
+    : m_league_size( league_size )
+    , m_team_size( team_size < Qthread::instance().shepherd_worker_size()
+                 ? team_size : Qthread::instance().shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + Qthread::instance().shepherd_size() - 1 ) / Qthread::instance().shepherd_size() )
+    {
+    }
+
+  /** \brief  Team size in effect after capping to the workers per shepherd. */
+  inline int team_size() const { return m_team_size ; }
+
+  /** \brief  Number of teams in the league. */
+  inline int league_size() const { return m_league_size ; }
+
+  /** \brief  Largest usable team size: the number of workers per shepherd.
+   *
+   *  This is a static member, so it queries the Qthread singleton
+   *  (there is no instance in scope to ask).
+   */
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+    { return Qthread::instance().shepherd_worker_size(); }
+
+  /** \brief  Handle passed to the functor; also iterates over the
+   *          league ranks assigned to the calling worker's shepherd.
+   */
+  class member_type {
+  private:
+          Impl::QthreadExec   & m_exec ;
+    const int                   m_team_size ;
+    const int                   m_team_rank ;
+    const int                   m_league_size ;
+    const int                   m_league_end ;  ///< One past the last league rank of this shepherd
+          int                   m_league_rank ; ///< League rank currently being executed
+
+  public:
+
+    KOKKOS_INLINE_FUNCTION
+    Kokkos::Qthread::scratch_memory_space team_shmem() const
+      { return Kokkos::Qthread::scratch_memory_space( m_exec ); }
+
+    KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+    KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+    KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
+    KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
+
+    KOKKOS_INLINE_FUNCTION void team_barrier() const
+      { m_exec.shepherd_barrier( m_team_size ); }
+
+    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+     *
+     *  The highest rank thread can compute the reduction total as
+     *    reduction_total = dev.team_scan( value ) + value ;
+     */
+    template< typename Type >
+    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
+      { return m_exec.template shepherd_scan<Type>( m_team_size , value ); }
+
+    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+     *          with intra-team non-deterministic ordering accumulation.
+     *
+     *  The global inter-team accumulation value will, at the end of the
+     *  league's parallel execution, be the scan's total.
+     *  Parallel execution ordering of the league's teams is non-deterministic.
+     *  As such the base value for each team's scan operation is similarly
+     *  non-deterministic.
+     */
+    template< typename Type >
+    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
+      { return m_exec.template shepherd_scan<Type>( m_team_size , value , global_accum ); }
+
+    //----------------------------------------
+    // Private for the driver ( for ( member_type i(exec,team); i ; i.next_team() ) { ... }
+
+    // Initialize.
+    // m_league_end counts back from league_size in chunks of m_shepherd_iter,
+    // one chunk per shepherd ranked above this one; m_league_rank starts one
+    // chunk below that end, clamped at zero.
+    member_type( Impl::QthreadExec & exec , const TeamPolicy & team )
+      : m_exec( exec )
+      , m_team_size(   team.m_team_size )
+      , m_team_rank(   exec.shepherd_worker_rank() )
+      , m_league_size( team.m_league_size )
+      , m_league_end(  team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
+      , m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
+    {}
+
+    // Continue
+    operator bool () const { return m_league_rank < m_league_end ; }
+
+    // iterate
+    void next_team() { ++m_league_rank ; }
+  };
+};
+
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_QTHREADEXEC_HPP */
+
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
@ -0,0 +1,274 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREAD_PARALLEL_HPP
+#define KOKKOS_QTHREAD_PARALLEL_HPP
+
+#include <vector>
+
+#include <Kokkos_Parallel.hpp>
+
+#include <impl/Kokkos_StaticAssert.hpp>
+
+#include <Qthread/Kokkos_QthreadExec.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+/** \brief  parallel_for driver for a RangePolicy on the Qthread space.
+ *
+ *  The constructor copies the functor and policy and then runs 'execute'
+ *  through QthreadExec::exec_all.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Kokkos::Qthread , void , IntType , P >
+                 , Kokkos::Qthread
+                 >
+{
+public:
+  typedef Kokkos::RangePolicy< Kokkos::Qthread , void , IntType , P >  Policy ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+
+  // Entry point handed to exec_all; 'arg' is the ParallelFor object itself.
+  static void execute( QthreadExec & exec , const void * arg )
+  {
+    const ParallelFor * const driver = static_cast< const ParallelFor * >( arg );
+
+    // Sub-range of the policy owned by this worker.
+    const Policy my_range( driver->m_policy , exec.worker_rank() , exec.worker_size() );
+
+    const typename Policy::member_type i_end = my_range.end();
+    typename Policy::member_type i = my_range.begin();
+    while ( i < i_end ) {
+      driver->m_func( i );
+      ++i ;
+    }
+
+    // Every worker meets here before returning.
+    exec.exec_all_barrier();
+  }
+
+  ParallelFor( const FunctorType & functor
+             , const Policy      & policy
+             )
+    : m_func( functor )
+    , m_policy( policy )
+    {
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
+    }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  parallel_reduce driver for a RangePolicy on the Qthread space.
+ *
+ *  Each worker accumulates into its own scratch value, the workers are
+ *  combined by exec_all_reduce, and the constructor copies the final
+ *  result into 'result_view' when that view has a non-null pointer.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Kokkos::Qthread , void , IntType , P >
+                    , Kokkos::Qthread
+                    >
+{
+public:
+
+  typedef ReduceAdapter< FunctorType >   Reduce ;
+  typedef typename Reduce::pointer_type  pointer_type ;
+  typedef Kokkos::RangePolicy< Kokkos::Qthread , void , IntType , P >  Policy ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+
+  // Entry point handed to exec_all; 'arg' is the ParallelReduce object itself.
+  static void execute( QthreadExec & exec , const void * arg )
+  {
+    const ParallelReduce * const driver = static_cast< const ParallelReduce * >( arg );
+
+    // Sub-range of the policy owned by this worker.
+    const Policy my_range( driver->m_policy , exec.worker_rank() , exec.worker_size() );
+
+    // Worker-local accumulator living in this worker's scratch space.
+    typename Reduce::reference_type accum = Reduce::init( driver->m_func , exec.exec_all_reduce_value() );
+
+    const typename Policy::member_type i_end = my_range.end();
+    typename Policy::member_type i = my_range.begin();
+    while ( i < i_end ) {
+      driver->m_func( i , accum );
+      ++i ;
+    }
+
+    exec.exec_all_reduce( driver->m_func );
+  }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & functor
+                , const Policy       & policy
+                , const HostViewType & result_view )
+    : m_func( functor )
+    , m_policy( policy )
+    {
+      // Reduction scratch only; no team shared memory is requested.
+      QthreadExec::resize_worker_scratch( Reduce::value_size( m_func ) , 0 );
+
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
+
+      const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
+
+      Reduce::final( m_func , data );
+
+      // Nothing to copy out when the caller supplied no destination.
+      if ( ! result_view.ptr_on_device() ) { return ; }
+
+      const unsigned count = Reduce::value_count( m_func );
+      unsigned k = 0 ;
+      while ( k < count ) {
+        result_view.ptr_on_device()[k] = data[k];
+        ++k ;
+      }
+    }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  parallel_reduce driver for a TeamPolicy on the Qthread space.
+ *
+ *  Each worker accumulates into its own scratch value while iterating over
+ *  the league ranks given to it by policy_team::member_type, the workers are
+ *  combined by exec_all_reduce, and the constructor copies the final result
+ *  into 'result' when that view has a non-null pointer.
+ */
+template< class FunctorType >
+class ParallelReduce< FunctorType , TeamPolicy< Kokkos::Qthread > , Kokkos::Qthread >
+{
+public:
+
+  typedef ReduceAdapter< FunctorType >   Reduce ;
+  typedef typename Reduce::pointer_type  pointer_type ;
+  typedef TeamPolicy< Kokkos::Qthread >  policy_team ;
+
+  const FunctorType  m_func ;
+  const policy_team  m_team ;
+
+  // Entry point handed to exec_all; 'arg' is the ParallelReduce object itself.
+  static void execute( QthreadExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+
+    // Initialize thread-local value
+    typename Reduce::reference_type update = Reduce::init( self.m_func , exec.exec_all_reduce_value() );
+
+    typename policy_team::member_type team_index( exec , self.m_team );
+
+    while ( team_index ) {
+      // Reset shared memory offset to beginning of reduction range.
+      exec.shared_reset();
+      self.m_func( team_index , update );
+      // The whole team finishes this league rank before any member moves on.
+      team_index.team_barrier();
+      team_index.next_team();
+    }
+
+    exec.exec_all_reduce( self.m_func );
+  }
+
+  template< class ViewType >
+  ParallelReduce( const FunctorType & functor ,
+                  const policy_team & policy ,
+                  const ViewType    & result )
+    : m_func( functor )
+    , m_team( policy )
+    {
+      // resize_worker_scratch takes exactly two sizes; the team size is an
+      // argument of the shared-memory query, not of the resize itself.
+      QthreadExec::resize_worker_scratch
+        ( /* reduction   memory */ Reduce::value_size( functor )
+        , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
+
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
+
+      const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
+
+      Reduce::final( m_func , data );
+
+      // Same guard as the RangePolicy reduce: skip the copy when the caller
+      // supplied no destination.
+      if ( result.ptr_on_device() ) {
+        const unsigned n = Reduce::value_count( m_func );
+        for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
+      }
+    }
+
+  // Execution has already completed inside the constructor.
+  inline void wait() {}
+
+  inline ~ParallelReduce() { wait(); }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** \brief  parallel_scan driver for a RangePolicy on the Qthread space.
+ *
+ *  Two sweeps over each worker's sub-range: the first with the functor's
+ *  'final' flag false, then exec_all_scan, then a second sweep with the
+ *  flag true.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Kokkos::Qthread , void , IntType , P >
+                  , Kokkos::Qthread
+                  >
+{
+public:
+
+  typedef ReduceAdapter< FunctorType >   Reduce ;
+  typedef typename Reduce::pointer_type  pointer_type ;
+  typedef Kokkos::RangePolicy< Kokkos::Qthread , void , IntType , P > Policy ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+
+  // Entry point handed to exec_all; 'arg' is the ParallelScan object itself.
+  static void execute( QthreadExec & exec , const void * arg )
+  {
+    const ParallelScan * const driver = static_cast< const ParallelScan * >( arg );
+
+    // Sub-range of the policy owned by this worker.
+    const Policy my_range( driver->m_policy , exec.worker_rank() , exec.worker_size() );
+
+    // Worker-local accumulator living in this worker's scratch space.
+    typename Reduce::reference_type accum = Reduce::init( driver->m_func , exec.exec_all_reduce_value() );
+
+    const typename Policy::member_type i_end = my_range.end();
+
+    // First sweep: final == false.
+    {
+      typename Policy::member_type i = my_range.begin();
+      while ( i < i_end ) { driver->m_func( i , accum , false ); ++i ; }
+    }
+
+    exec.exec_all_scan( driver->m_func );
+
+    // Second sweep: final == true.
+    {
+      typename Policy::member_type i = my_range.begin();
+      while ( i < i_end ) { driver->m_func( i , accum , true ); ++i ; }
+    }
+
+    exec.exec_all_barrier();
+  }
+
+  ParallelScan( const FunctorType & functor
+              , const Policy      & policy
+              )
+    : m_func( functor )
+    , m_policy( policy )
+    {
+      // Reduction scratch only; no team shared memory is requested.
+      QthreadExec::resize_worker_scratch( Reduce::value_size( m_func ) , 0 );
+
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::execute , this );
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_QTHREAD_PARALLEL_HPP */
+
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
@ -0,0 +1,234 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#include <Kokkos_Macros.hpp>
+
+#if defined( KOKKOS_HAVE_QTHREAD )
+
+#include <stdio.h>
+
+#include <stdlib.h>
+#include <stdexcept>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include <Kokkos_Atomic.hpp>
+#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+TaskManager< Kokkos::Qthread > s_task_manager ; // file-local instance; TaskPolicy< Kokkos::Qthread >() binds its manager reference to it
+
+}
+
+typedef TaskMember<  Kokkos::Qthread > Task ; // shorthand for the member definitions in this file
+typedef TaskManager< Kokkos::Qthread > Mgr ; // shorthand for the member definitions in this file
+
+// Construct a task in STATE_CONSTRUCTING with a zero reference count,
+// an emptied FEB word and no dependences.
+Task::TaskMember( const function_type    arg_destroy
+                , const function_type    arg_apply
+                , const std::type_info & arg_type
+                )
+  : m_typeid(  arg_type )
+  , m_destroy( arg_destroy )
+  , m_apply(   arg_apply )
+  , m_state( STATE_CONSTRUCTING )
+  , m_ref_count(0)
+  , m_qfeb(0)
+{
+  // The FEB word starts empty; it is filled when the task completes.
+  qthread_empty( & m_qfeb );
+
+  // Clear every dependence slot.
+  TaskMember ** slot = m_dep ;
+  TaskMember ** const slot_end = m_dep + MAX_DEPENDENCE ;
+  while ( slot != slot_end ) {
+    *slot = 0 ;
+    ++slot ;
+  }
+}
+
+Mgr::TaskManager()
+{} // empty body: the class declaration shows no non-static data members to initialize
+
+// Allocate 'nbytes' of raw memory for a task or a precondition array.
+//
+// Throws std::runtime_error when a non-empty request cannot be satisfied,
+// because callers construct objects in, or write through, the returned
+// pointer without checking it.
+void * Mgr::memory_allocate( size_t nbytes )
+{
+  // Counting on 'malloc' thread safety so lock/unlock not required.
+  // However, isolate calls here to mitigate future need to introduce lock/unlock.
+
+  // lock
+
+  void * const ptr = malloc( nbytes );
+
+  // unlock
+
+  // malloc(0) may legitimately return null, so only a non-empty request
+  // that yields null is treated as a failure.
+  if ( 0 == ptr && 0 != nbytes ) {
+    throw std::runtime_error(
+      std::string("Kokkos::Impl::TaskManager<Kokkos::Qthread>::memory_allocate ERROR: allocation failed") );
+  }
+
+  return ptr ;
+}
+
+// Release memory previously obtained from memory_allocate.
+void Mgr::memory_deallocate( void * ptr )
+{
+  // 'free' is relied upon to be thread safe, so no lock is taken here.
+  // The call is kept in this one place so a lock/unlock pair can be
+  // introduced later without touching callers.
+
+  // lock
+
+  ::free( ptr );
+
+  // unlock
+}
+
+// Reference-counted pointer assignment: '*lhs = rhs'.
+//
+// Drops one reference from the task currently held in '*lhs' (destroying and
+// deallocating it when that was the last reference) and adds one reference
+// to 'rhs'. Either pointer may be null.
+//
+// Throws std::runtime_error if the last reference to a task that is not
+// STATE_COMPLETE is dropped, or if the reference count was already
+// non-positive.
+void Mgr::assign( Task ** const lhs , Task * const rhs )
+{
+  // Self-assignment leaves the count unchanged. Handling it here avoids
+  // destroying the task on the decrement and then incrementing the count
+  // of the freed object when '*lhs' holds the only reference.
+  if ( *lhs == rhs ) { return ; }
+
+  if ( *lhs ) {
+
+    // Must de-assign
+
+    const int count = Kokkos::atomic_fetch_add( & (**lhs).m_ref_count , -1 );
+
+    if ( 1 == count ) {
+
+      // Should only be deallocating a completed task
+      // TODO: Support deletion of canceled tasks.
+
+      if ( (**lhs).m_state != Task::STATE_COMPLETE ) {
+        throw std::runtime_error(
+          std::string("Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR: not STATE_COMPLETE") );
+      }
+
+      // Get destructor function and apply it
+      (**lhs).m_destroy( *lhs );
+
+      memory_deallocate( *lhs );
+    }
+    else if ( count <= 0 ) {
+      throw std::runtime_error(std::string("Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR: reference counting") );
+    }
+  }
+
+  if ( rhs ) {
+    Kokkos::atomic_fetch_add( & (*rhs).m_ref_count , 1 );
+  }
+
+  *lhs = rhs ;
+}
+
+// Check that task 't' may have its dependences set and that the dependence
+// count 'n' fits the fixed-size dependence array.
+//
+// Throws std::runtime_error when the task is neither being constructed nor
+// executing, or when 'n' is outside [0, MAX_DEPENDENCE].
+void Mgr::verify_set_dependence( Task * t , int n )
+{
+  // Must be either constructing for original spawn or executing for a respawn.
+
+  if ( Task::STATE_CONSTRUCTING != t->m_state &&
+       Task::STATE_EXECUTING    != t->m_state ) {
+    throw std::runtime_error(std::string("Kokkos::Impl::Task spawn or respawn state error"));
+  }
+
+  // 'n' is a count, so exactly MAX_DEPENDENCE dependences still fit the
+  // array; only a negative or larger count is an error.
+  if ( n < 0 || MAX_DEPENDENCE < n ) {
+    throw std::runtime_error(std::string("Kokkos::Impl::Task spawn or respawn dependence count error"));
+  }
+}
+
+// Mark task 't' as waiting and spawn it as a qthread whose preconditions
+// are the FEB words of its non-null leading dependences.
+void Mgr::schedule( Task * t )
+{
+  // Precondition array handed to qthread_spawn: slot 0 carries the count,
+  // slots 1..count carry the FEB addresses.
+  // Per the original author's note, qthread frees this allocation itself
+  // (not verifiable from this file), and malloc thread safety is assumed.
+  enum { PRECON_SLOTS = MAX_DEPENDENCE + 1 };
+
+  aligned_t ** const precon =
+    static_cast< aligned_t ** >( memory_allocate( PRECON_SLOTS * sizeof(aligned_t *) ) );
+
+  // Collect dependences up to the first empty slot.
+  uintptr_t count = 0 ;
+  while ( count < MAX_DEPENDENCE && t->m_dep[count] ) {
+    precon[ count + 1 ] = & t->m_dep[count]->m_qfeb ;
+    ++count ;
+  }
+  precon[0] = reinterpret_cast<aligned_t *>( count );
+
+  // Is waiting for execution
+  t->m_state = Task::STATE_WAITING ;
+
+  qthread_spawn( & Mgr::qthread_func , t , 0 , NULL
+               , count , precon
+               , NO_SHEPHERD , QTHREAD_SPAWN_SIMPLE );
+}
+
+// Body run by the spawned qthread: executes the task and, unless the task
+// changed its own state while running (a respawn), completes it.
+// Always returns 0.
+aligned_t Mgr::qthread_func( void * arg )
+{
+  Task * const self = static_cast< Task * >( arg );
+
+  self->m_state = Task::STATE_EXECUTING ;
+
+  self->m_apply( self );
+
+  // A state other than EXECUTING means the task respawned itself.
+  if ( self->m_state != Task::STATE_EXECUTING ) { return 0 ; }
+
+  // Task did not respawn, is complete
+  self->m_state = Task::STATE_COMPLETE ;
+
+  // Release dependences before allowing dependent tasks to run.
+  // Otherwise there is a thread race condition for removing dependences.
+  int slot = 0 ;
+  while ( slot < MAX_DEPENDENCE ) {
+    assign( & self->m_dep[slot] , 0 );
+    ++slot ;
+  }
+
+  // Set qthread FEB to full so that dependent tasks are allowed to execute
+  qthread_fill( & self->m_qfeb );
+
+  return 0 ;
+}
+
+
+// Wait on task 't' by reading its FEB word with qthread_readFF;
+// the value read is discarded.
+void Mgr::wait( Task * t )
+{
+  aligned_t ignored ;
+  aligned_t * const feb = & t->m_qfeb ;
+  qthread_readFF( & ignored , feb );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+namespace Kokkos {
+
+TaskPolicy< Kokkos::Qthread >::TaskPolicy()
+  : m_task_manager( Impl::s_task_manager ) // bind to the single manager instance defined in this file
+{}
+
+} // namespace Kokkos
+
+#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */
+
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
@ -0,0 +1,643 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_QTHREAD_TASKPOLICY_HPP
+#define KOKKOS_QTHREAD_TASKPOLICY_HPP
+
+#include <string>
+#include <typeinfo>
+#include <stdexcept>
+
+#include <qthread.h>
+
+#include <Kokkos_Qthread.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_View.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Type-erased root of every Qthread task.
+ *
+ *  Holds the state, reference count, FEB word and the fixed-size dependence
+ *  array; TaskManager< Kokkos::Qthread > is a friend and manipulates them.
+ */
+template<>
+class TaskMember< Kokkos::Qthread , void , void >
+{
+public:
+
+  friend class TaskManager< Kokkos::Qthread > ;
+
+  /**\brief  Capacity of the per-task dependence array */
+  enum { MAX_DEPENDENCE = 13 };
+
+  /**\brief  States of a task, stored in m_state */
+  enum { STATE_CONSTRUCTING = 0 , STATE_WAITING = 1 , STATE_EXECUTING = 2 , STATE_COMPLETE = 4 };
+
+  /**\brief  Signature of the type-erased destroy and apply callbacks */
+  typedef void (* function_type)( TaskMember * );
+
+  const std::type_info & m_typeid ;  ///< Result type recorded at construction
+  const function_type    m_destroy ; ///< Runs the derived destructor
+  const function_type    m_apply ;   ///< Runs the task's functor
+
+private:
+
+  int            m_state ;     ///< One of the STATE_* values
+  int            m_ref_count ; ///< Reference count
+  aligned_t      m_qfeb ;      ///< FEB word: emptied at construction, filled on completion
+  TaskMember   * m_dep[ MAX_DEPENDENCE ]; ///< Dependences of this task
+
+  // Not copyable: declared, never defined.
+  TaskMember( const TaskMember & );
+  TaskMember & operator = ( const TaskMember & );
+
+  static aligned_t qthread_func( void * );
+
+protected :
+
+  TaskMember( const function_type    arg_destroy
+            , const function_type    arg_apply
+            , const std::type_info & arg_type
+            );
+
+public:
+
+  /**\brief  The root type accepts any task; returns 't' unchanged */
+  inline static
+  TaskMember * verify_type( TaskMember * t ) { return t ; }
+
+  typedef FutureValueTypeIsVoidError get_result_type ;
+
+  get_result_type get() const { return get_result_type() ; }
+
+  /**\brief  The i-th dependence, or null unless the task is executing
+   *         and 'i' is within [0, MAX_DEPENDENCE).
+   */
+  inline
+  TaskMember * get_dependence( int i ) const
+    {
+      if ( STATE_EXECUTING != m_state )          return (TaskMember*) 0 ;
+      if ( i < 0 || MAX_DEPENDENCE <= i )        return (TaskMember*) 0 ;
+      return m_dep[i] ;
+    }
+
+  /**\brief  Number of leading non-null dependences while executing;
+   *         zero in any other state.
+   */
+  inline
+  int get_dependence() const
+    {
+      if ( STATE_EXECUTING != m_state ) return 0 ;
+      int count = 0 ;
+      while ( count < MAX_DEPENDENCE && m_dep[count] != 0 ) { ++count ; }
+      return count ;
+    }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Static operations on Qthread tasks: reference-counted
+ *          assignment, allocation, scheduling and waiting.
+ */
+template<>
+class TaskManager< Kokkos::Qthread >
+{
+public:
+
+  typedef TaskMember< Kokkos::Qthread > task_root_type ;
+
+  enum { MAX_DEPENDENCE = task_root_type::MAX_DEPENDENCE };
+
+  static void verify_set_dependence( task_root_type * , int );
+
+  static void assign( task_root_type ** const , task_root_type * const );
+
+  static void wait( task_root_type * );
+
+  static void * memory_allocate( size_t );
+  static void   memory_deallocate( void * );
+
+  static void schedule( task_root_type * );
+
+  /**\brief  Type-erased destructor call for a task of the derived type */
+  template < class DerivedTaskMember >
+  static
+  void destroy( task_root_type * t )
+    {
+      DerivedTaskMember * const derived = static_cast< DerivedTaskMember * >( t );
+      derived->~DerivedTaskMember();
+    }
+
+  /**\brief  Install the first 'n' futures of 'dep' as the task's
+   *         dependences, clear the remaining slots, then schedule the task.
+   *         Only enabled for futures of the Qthread execution space.
+   */
+  template< class A1 , class A2 >
+  static
+  void schedule( task_root_type * t
+               , const Future<A1,A2> * const dep
+               , typename Impl::enable_if
+                  < Impl::is_same< typename Future<A1,A2>::execution_space , Kokkos::Qthread >::value
+                  , const int >::type n
+                )
+    {
+      verify_set_dependence( t , n );
+
+      int slot = 0 ;
+      while ( slot < n ) {
+        assign( & t->m_dep[slot] , dep[slot].m_task );
+        ++slot ;
+      }
+      while ( slot < MAX_DEPENDENCE ) {
+        assign( & t->m_dep[slot] , 0 );
+        ++slot ;
+      }
+
+      schedule( t );
+    }
+
+  /**\brief  Wait on the task referenced by a future */
+  template< class A1 , class A2 >
+  void wait( const Future<A1,A2> & f ) { wait( f.m_task ); }
+
+  TaskManager();
+  TaskManager( const TaskManager & );
+  TaskManager & operator = ( const TaskManager & );
+
+private:
+
+  static aligned_t qthread_func( void * arg );
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Task layer that stores a result of type ResultType on top of
+ *          the type-erased root.
+ */
+template < class ResultType >
+class TaskMember< Kokkos::Qthread , ResultType , void > : public TaskMember< Kokkos::Qthread >
+{
+private:
+
+protected:
+
+  typedef TaskMember< Kokkos::Qthread >::function_type  function_type ;
+
+  // Records typeid(ResultType) in the root so verify_type can check casts.
+  inline
+  TaskMember( const function_type    arg_destroy
+            , const function_type    arg_apply
+            )
+    : TaskMember< Kokkos::Qthread >( arg_destroy , arg_apply , typeid(ResultType) )
+    , m_result()
+    {}
+
+public:
+
+  ResultType  m_result ; ///< Value-initialized at construction
+
+  /**\brief  Downcast a root task to this result type.
+   *         A null pointer passes through; a non-null task whose recorded
+   *         type differs from ResultType throws std::runtime_error.
+   */
+  inline static
+  TaskMember *
+  verify_type( TaskMember< Kokkos::Qthread > * t )
+    {
+      const bool wrong_type = ( 0 != t ) && ( t->m_typeid != typeid(ResultType) );
+      if ( wrong_type ) {
+        throw std::runtime_error( std::string("Kokkos::Future bad cast for result type"));
+      }
+      return static_cast< TaskMember *>( t );
+    }
+
+  typedef const ResultType & get_result_type ;
+
+  /**\brief  Read-only access to the stored result */
+  inline
+  get_result_type get() const { return m_result ; }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Concrete task: a result slot plus a copy of the user's functor.
+ *          The functor is invoked as FunctorType::apply( result ).
+ */
+template< class ResultType , class FunctorType >
+class TaskMember< Kokkos::Qthread , ResultType , FunctorType >
+  : public TaskMember< Kokkos::Qthread , ResultType >
+  , public FunctorType
+{
+private:
+
+  typedef TaskManager< Kokkos::Qthread >             task_manager ;
+  typedef TaskMember< Kokkos::Qthread >              member_root_type ;
+  typedef TaskMember< Kokkos::Qthread , ResultType > member_base_type ;
+
+  // Type-erased entry point stored in the root's m_apply:
+  // walk root -> result layer -> this type, then run the functor on the
+  // task's own result slot.
+  static
+  void apply( member_root_type * t )
+    {
+      member_base_type * const base = static_cast< member_base_type * >(t);
+      TaskMember       * const self = static_cast< TaskMember * >(base);
+      self->FunctorType::apply( base->m_result );
+    }
+
+protected:
+
+  // For further-derived tasks that supply their own callbacks.
+  inline 
+  TaskMember( const typename member_root_type::function_type  arg_destroy
+            , const typename member_root_type::function_type  arg_apply
+            , const FunctorType &  arg_functor
+            )
+    : member_base_type( arg_destroy , arg_apply )
+    , FunctorType( arg_functor )
+    {}
+
+public:
+
+  // Uses this type's own destroy and apply callbacks.
+  inline 
+  TaskMember( const FunctorType &  arg_functor )
+    : member_base_type( & task_manager::template destroy< TaskMember >
+                      , & TaskMember::apply )
+    , FunctorType( arg_functor )
+    {}
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Concrete task without a result: the root plus a copy of the
+ *          user's functor, invoked as FunctorType::apply().
+ *
+ *  The root constructor requires the result's type_info; a void task
+ *  records typeid(void).
+ */
+template< class FunctorType >
+class TaskMember< Kokkos::Qthread , void , FunctorType >
+  : public TaskMember< Kokkos::Qthread >
+  , public FunctorType
+{
+private:
+
+  typedef TaskManager< Kokkos::Qthread >  task_manager ;
+  typedef TaskMember< Kokkos::Qthread >   member_root_type ;
+
+  // Type-erased entry point stored in the root's m_apply.
+  static
+  void apply( member_root_type * t )
+    { static_cast< TaskMember * >(t)->FunctorType::apply(); }
+
+protected:
+
+  // For further-derived tasks that supply their own callbacks.
+  inline 
+  TaskMember( const typename member_root_type::function_type  arg_destroy
+            , const typename member_root_type::function_type  arg_apply
+            , const FunctorType &  arg_functor
+            )
+    : member_root_type( arg_destroy , arg_apply , typeid(void) )
+    , FunctorType( arg_functor )
+    {}
+
+public:
+
+  // Uses this type's own destroy and apply callbacks.
+  inline 
+  TaskMember( const FunctorType &  arg_functor )
+    : member_root_type( & task_manager::template destroy< TaskMember >
+                      , & TaskMember::apply
+                      , typeid(void) )
+    , FunctorType( arg_functor )
+    {}
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+/** \brief  Task policy carrying a fixed-capacity list of dependences
+ *          that are attached to every task it spawns.
+ */
+template<>
+class TaskPolicy< Impl::TaskDepends< Kokkos::Qthread > >
+{
+public:
+
+  typedef Kokkos::Qthread execution_space ;
+
+private:
+
+  enum { MAX_DEPENDENCE = Impl::TaskMember< execution_space >::MAX_DEPENDENCE };
+
+  Kokkos::Impl::TaskManager< execution_space >  & m_task_manager ;
+  Kokkos::Future< execution_space >               m_depends[ MAX_DEPENDENCE ];
+
+  // Not default-constructible or assignable: declared, never defined.
+  TaskPolicy();
+  TaskPolicy & operator = ( const TaskPolicy & );
+
+public:
+
+  /** \brief  Copy the first 'n' futures of 'dep' as this policy's
+   *          dependences; remaining slots hold default futures.
+   *
+   *  Throws std::runtime_error when 'n' exceeds MAX_DEPENDENCE, since the
+   *  dependence array cannot hold more.
+   */
+  template< typename A1 , typename A2 >
+  TaskPolicy( Kokkos::Impl::TaskManager< execution_space > & manager
+            , const size_t n
+            , const Future< A1 , A2 > * const dep )
+    : m_task_manager( manager )
+    {
+      const size_t capacity = MAX_DEPENDENCE ;
+
+      if ( capacity < n ) {
+        throw std::runtime_error(
+          std::string("Kokkos::TaskPolicy<Kokkos::Impl::TaskDepends<Kokkos::Qthread>> ERROR: dependence count exceeds MAX_DEPENDENCE") );
+      }
+
+      // Unsigned index to match 'n' and avoid a signed/unsigned comparison.
+      size_t i = 0 ;
+      for ( ; i < n ; ++i ) m_depends[i] = dep[i] ;
+      for ( ; i < capacity ; ++i ) m_depends[i] = Future< execution_space >();
+    }
+
+  // Spawn a serial task:
+  template< class FunctorType , class ValueType >
+  Future< ValueType , execution_space >
+  spawn( const FunctorType & functor ) const
+    {
+      // Allocate a copy functor and insert into queue
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > member_type ;
+      member_type * m = new( m_task_manager.memory_allocate( sizeof(member_type) ) ) member_type( functor );
+      m_task_manager.schedule( m , m_depends , MAX_DEPENDENCE );
+      return Future< ValueType , execution_space >( m );
+    }
+
+  // Construct a task policy for foreach-range tasks:
+  // spawn( task_policy.depends(N,d).foreach(RangePolicy) , functor );
+  // spawn( task_policy.foreach(RangePolicy) , functor );
+  template< class ExecPolicy >
+  TaskPolicy< Impl::TaskForEach< ExecPolicy > >
+  foreach( const ExecPolicy & arg_policy )
+    { return TaskPolicy< Impl::TaskForEach< ExecPolicy > >( m_task_manager , arg_policy , m_depends ); }
+
+  // Construct a task policy for reduce-range tasks.
+  // The declared return type matches the TaskReduce policy that is built.
+  template< class ExecPolicy >
+  TaskPolicy< Impl::TaskReduce< ExecPolicy > >
+  reduce( const ExecPolicy & arg_policy )
+    { return TaskPolicy< Impl::TaskReduce< ExecPolicy > >( m_task_manager , arg_policy , m_depends ); }
+};
+
+//----------------------------------------------------------------------------
+
+// Basic task policy for the Qthread execution space.
+// Holds a reference to a task manager and forwards allocation,
+// scheduling and waiting to it.
+template<>
+class TaskPolicy< Kokkos::Qthread >
+{
+public:
+
+  typedef Kokkos::Qthread execution_space ;
+
+private:
+
+  typedef Impl::TaskMember< execution_space , void , void > task_base_type ;
+
+  Kokkos::Impl::TaskManager< execution_space > & m_task_manager ;
+
+  // Cast the task base pointer to the member type holding 'FunctorType'
+  // and invoke that functor's apply().
+  template< class FunctorType >
+  static
+  void apply( task_base_type * t )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType >  member_type ;
+      static_cast< member_type * >(t)->FunctorType::apply();
+    }
+
+  // Private: assignment is not available to users.
+  TaskPolicy & operator = ( const TaskPolicy & );
+
+public:
+
+  // Only declared here; the definition is not part of this class body.
+  TaskPolicy();
+
+  // Copies share the same task manager reference.
+  TaskPolicy( const TaskPolicy & rhs )
+    : m_task_manager( rhs.m_task_manager ) {}
+
+  // Return the i-th dependence of the task that owns 'task_functor'.
+  // Requires:
+  // class DerivedMemberType : public TaskMember< execution_space , typename FunctorType::value_type , FunctorType > ...
+  template< class FunctorType >
+  Future< void , execution_space >
+  get_dependence( const FunctorType * task_functor , int i ) const
+    {
+      typedef const Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType >  member_type ;
+      return Future<void,execution_space>( static_cast< member_type * >(task_functor)->task_base_type::get_dependence(i) );
+    }
+
+  // Return the result of the task's no-argument get_dependence()
+  // (presumably the dependence count -- confirm against TaskMember).
+  template< class FunctorType >
+  int get_dependence( const FunctorType * task_functor ) const
+    {
+      typedef const Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType >  member_type ;
+      return static_cast< member_type * >(task_functor)->task_base_type::get_dependence();
+    }
+
+  // Forward the wait on future 'f' to the task manager.
+  template< class A1 , class A2 >
+  void wait( const Future<A1,A2> & f ) const { m_task_manager.wait( f ); }
+
+  // Schedule the task owning 'task_functor' again with 'n' dependences 'dep'.
+  // The enable_if parameter removes this overload unless the futures
+  // belong to this policy's execution space.
+  template< class FunctorType , class A1 , class A2 >
+  void respawn( FunctorType * task_functor
+              , const Future<A1,A2> * const dep
+              , typename Impl::enable_if
+                  < Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
+                  , const int
+                  >::type n
+              ) const
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType >  member_type ;
+      m_task_manager.schedule( static_cast< member_type * >( task_functor ) , dep , n );
+    }
+
+  // Allocate a copy functor and insert into queue
+  // The task is placement-constructed in memory from the task manager,
+  // scheduled without dependences, and returned wrapped in a Future.
+  template< class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  spawn( const FunctorType & functor ) const
+    {
+      typedef typename FunctorType::value_type value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  member_type ;
+      member_type * m = new( m_task_manager.memory_allocate( sizeof(member_type) ) ) member_type( functor );
+      m_task_manager.schedule( m );
+      return Future< value_type , execution_space >( m );
+    }
+
+  // Construct a task policy with dependences:
+  // spawn( task_policy.depends(N,d) , functor );
+  // Only participates in overload resolution for futures of this
+  // policy's execution space (enable_if on the type of 'n').
+  template< class A1 , class A2 >
+  TaskPolicy< Impl::TaskDepends< execution_space > >
+  depends( const Future< A1 , A2 > * const d
+         , typename Impl::enable_if<
+             ( Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
+             ), const int >::type n 
+         )
+    { return TaskPolicy< Impl::TaskDepends< execution_space > >( m_task_manager , n , d ); }
+
+  // Construct a task policy for foreach-range tasks:
+  // spawn( task_policy.depends(N,d).foreach(RangePolicy) , functor );
+  // spawn( task_policy.foreach(RangePolicy) , functor );
+  template< class ExecPolicy >
+  TaskPolicy< Impl::TaskForEach< ExecPolicy > >
+  foreach( const ExecPolicy & arg_policy )
+    { return TaskPolicy< Impl::TaskForEach< ExecPolicy > >( m_task_manager , arg_policy ); }
+
+  // Construct a task policy for reduce-range tasks:
+  template< class ExecPolicy >
+  TaskPolicy< Impl::TaskReduce< ExecPolicy > >
+  reduce( const ExecPolicy & arg_policy )
+    { return TaskPolicy< Impl::TaskReduce< ExecPolicy > >( m_task_manager , arg_policy ); }
+};
+
+//----------------------------------------------------------------------------
+
+// Task policy that spawns a task which applies a functor to every index
+// of a range policy and then calls the functor's apply().
+template< typename IntType , unsigned P >
+class TaskPolicy< Impl::TaskForEach< Kokkos::RangePolicy< Kokkos::Qthread , void , IntType , P >  >  >
+{
+public:
+
+  typedef Kokkos::Qthread execution_space ;
+
+private:
+
+  typedef RangePolicy< execution_space , void , IntType , P > range_policy ;
+  typedef Impl::TaskManager< execution_space >  task_manager ;
+  typedef Impl::TaskMember<  execution_space >  task_root_type ;
+
+  task_manager & m_task_manager ;
+  range_policy   m_range_policy ;
+
+  // ForEach task: a task member that stores the functor (via its base)
+  // together with the range to iterate over.
+  template< class FunctorType >
+  class member_type : public Impl::TaskMember< Kokkos::Qthread , void , FunctorType >
+  {
+  private:
+
+    typedef Impl::TaskMember< Kokkos::Qthread , void , FunctorType >    task_base_type ;
+
+    range_policy  m_policy ;
+
+    // Task body registered with the base class:
+    // call the functor through a const reference for each index in
+    // [ begin , end ), then call the non-const apply().
+    static
+    void apply( task_root_type * t )
+      {
+        // Downcast in two steps (root -> functor-typed base -> this class).
+        // The result is a pointer, so the member must be reached with '->'.
+        task_base_type     & b  = * static_cast< task_base_type * >( t );
+        range_policy const & r  = static_cast< member_type * >( & b )->m_policy ;
+        FunctorType        & f  = static_cast< FunctorType & >( b );
+        FunctorType  const & cf = f ;
+
+        const IntType e = r.end();
+        for ( IntType i = r.begin() ; i < e ; ++i ) { cf(i); }
+        f.apply();
+      }
+
+  public:
+
+    // Register this class's destroy and apply functions with the base
+    // and keep copies of the functor and the range policy.
+    member_type( const FunctorType  & arg_func
+               , const range_policy & arg_policy
+               )
+      : task_base_type( & task_manager::template destroy< member_type >
+                      , & member_type::apply
+                      , arg_func
+                      )
+      , m_policy( arg_policy )
+      {}
+  };
+
+
+  // Private: default construction and assignment are not available to users.
+  TaskPolicy();
+  TaskPolicy & operator = ( const TaskPolicy & );
+
+public:
+
+  TaskPolicy( task_manager & manager , const range_policy & policy )
+    : m_task_manager( manager )
+    , m_range_policy( policy )
+    {}
+
+  // Create and schedule a foreach task for 'functor' over the stored range.
+  // 'ValueType' cannot be deduced and must be given explicitly by the caller.
+  template< class FunctorType , class ValueType >
+  Future< ValueType , execution_space >
+  spawn( const FunctorType & functor ) const
+    {
+      typedef Future< ValueType , execution_space > future_type ;
+
+      // Allocate a copy functor and insert into queue
+
+      task_root_type * const t = new( m_task_manager.memory_allocate( sizeof(member_type<FunctorType>) ) ) member_type<FunctorType>( functor , m_range_policy );
+
+      m_task_manager.schedule( t );
+
+      return future_type( t );
+    }
+};
+
+//----------------------------------------------------------------------------
+
+// Task policy that spawns a task which reduces over every index of a
+// range policy into the task's result value.
+template< typename IntType , unsigned P >
+class TaskPolicy< Impl::TaskReduce< Kokkos::RangePolicy< Kokkos::Qthread , void , IntType , P >  >  >
+{
+public:
+
+  typedef Kokkos::Qthread execution_space ;
+
+private:
+
+  typedef RangePolicy< execution_space , void , IntType , P >  range_policy ;
+  typedef Impl::TaskManager< execution_space >  task_manager ;
+  typedef Impl::TaskMember<  execution_space >  task_root_type ;
+
+  task_manager & m_task_manager ;
+  range_policy   m_range_policy ;
+
+  // Reduce task: a task member that stores the functor (via its base)
+  // together with the range to reduce over.
+  template< class FunctorType >
+  class member_type : public Impl::TaskMember< Kokkos::Qthread , typename FunctorType::value_type , FunctorType >
+  {
+  private:
+    typedef typename FunctorType::value_type value_type ;
+
+    typedef Impl::TaskMember< Kokkos::Qthread , value_type , FunctorType >    task_base_type ;
+    typedef Impl::TaskMember< Kokkos::Qthread , value_type >    task_value_type ;
+
+    range_policy  m_policy ;
+
+    // Task body registered with the base class:
+    // init the result, accumulate each index in [ begin , end ) through a
+    // const reference to the functor, then call the non-const apply()
+    // with the result.
+    static
+    void apply( task_root_type * t )
+      {
+        task_base_type     & b  = * static_cast< task_base_type * >( t );
+        range_policy const & r  = static_cast< member_type & >( b ).m_policy ;
+        FunctorType        & f  = static_cast< FunctorType & >( b );
+        FunctorType  const & cf = f ;
+
+        cf.init( b.m_result );
+        const IntType e = r.end();
+        for ( IntType i = r.begin() ; i < e ; ++i ) { cf(i,b.m_result); }
+        f.apply( b.m_result );
+      }
+
+  public:
+
+    // Register this class's destroy and apply functions with the base
+    // and keep copies of the functor and the range policy.
+    member_type( const FunctorType  & arg_func 
+               , const range_policy & arg_policy
+               )
+      : task_base_type( & task_manager::template destroy< member_type >
+                      , & member_type::apply
+                      , arg_func
+                      )
+      , m_policy( arg_policy )
+      {}
+  };
+
+  // Private: default construction and assignment are not available to users.
+  TaskPolicy();
+  TaskPolicy & operator = ( const TaskPolicy & );
+
+public:
+
+  TaskPolicy( task_manager & manager , const range_policy & policy )
+    : m_task_manager( manager )
+    , m_range_policy( policy )
+    {}
+
+  // Create and schedule a reduce task for 'functor' over the stored range
+  // and return a future typed with the functor's value_type.
+  template< class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  spawn( const FunctorType & functor ) const
+    {
+      typedef Future< typename FunctorType::value_type , execution_space > future_type ;
+
+      // Allocate a copy functor and insert into queue
+
+      task_root_type * const t = new( m_task_manager.memory_allocate( sizeof(member_type<FunctorType>) ) ) member_type<FunctorType>( functor , m_range_policy );
+
+      m_task_manager.schedule( t );
+
+      return future_type( t );
+    }
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_QTHREAD_TASK_HPP */
+
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@ -0,0 +1,744 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD )
+
+#include <stdint.h>
+#include <limits>
+#include <utility>
+#include <iostream>
+#include <sstream>
+#include <Kokkos_Threads.hpp>
+#include <Kokkos_hwloc.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+// File-local state shared by the master process and all pool threads.
+
+// ThreadsExec object representing the master process itself.
+ThreadsExec                  s_threads_process ;
+// Pool table; a thread of rank r occupies entry pool_size - ( r + 1 ).
+ThreadsExec                * s_threads_exec[  ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
+// pthread id recorded by each thread, indexed by its pool rank.
+pthread_t                    s_threads_pid[   ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
+// hwloc coordinates used for binding and for reporting the configuration.
+std::pair<unsigned,unsigned> s_threads_coord[ ThreadsExec::MAX_THREAD_COUNT ];
+
+// [0] = total threads, [1] = threads per numa, [2] = threads per core.
+int s_thread_pool_size[3] = { 0 , 0 , 0 };
+
+// Reported as "ReduceScratch" / "SharedScratch" by print_configuration.
+unsigned s_current_reduce_size = 0 ;
+unsigned s_current_shared_size = 0 ;
+
+// Function (and its argument) that activated pool threads execute.
+// Both are volatile because they are written by the master process and
+// read by the spawned threads.
+void (* volatile s_current_function)( ThreadsExec & , const void * );
+const void * volatile s_current_function_arg = 0 ;
+
+// Registers the in_parallel query with HostSpace on construction and
+// warns on destruction when any thread-pool state is still set.
+struct Sentinel {
+  Sentinel()
+  {
+    HostSpace::register_in_parallel( ThreadsExec::in_parallel );
+  }
+
+  ~Sentinel()
+  {
+    // Evaluated left to right with short-circuiting, so the volatile
+    // function pointer and argument are only read if everything before
+    // them is already clear.
+    const bool state_left_behind =
+      s_thread_pool_size[0] || s_thread_pool_size[1] || s_thread_pool_size[2] ||
+      s_current_reduce_size || s_current_shared_size ||
+      s_current_function    || s_current_function_arg ||
+      s_threads_exec[0] ;
+
+    if ( ! state_left_behind ) { return ; }
+
+    std::cerr << "ERROR : Process exiting without calling Kokkos::Threads::terminate()" << std::endl ;
+  }
+};
+
+// Count the fan-in partners of thread 'rank' in a pool of 'size' threads.
+// Working on the reversed rank, one partner is counted for each successive
+// power of two 'bit' while reversed + bit is still inside the pool and the
+// reversed rank does not have that bit set.
+inline
+unsigned fan_size( const unsigned rank , const unsigned size )
+{
+  const unsigned reversed = size - ( rank + 1 );
+
+  unsigned partners = 0 ;
+  unsigned bit      = 1 ;
+
+  while ( ( reversed + bit < size ) && 0 == ( reversed & bit ) ) {
+    ++partners ;
+    bit <<= 1 ;
+  }
+
+  return partners ;
+}
+
+} // namespace
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Work function that does nothing; installed as the current function
+// while threads are being spawned in ThreadsExec::initialize().
+void execute_function_noop( ThreadsExec & , const void * ) {}
+
+// Main loop of a spawned thread.
+// Constructing 'this_thread' tries to claim an entry in the pool table;
+// the state is Active only if that succeeded, otherwise the loop is
+// skipped and the function returns immediately.
+void ThreadsExec::driver(void)
+{
+  ThreadsExec this_thread ;
+
+  while ( ThreadsExec::Active == this_thread.m_pool_state ) {
+
+    // Run whatever work function the master process has installed.
+    (*s_current_function)( this_thread , s_current_function_arg );
+
+    // Deactivate thread and wait for reactivation
+    this_thread.m_pool_state = ThreadsExec::Inactive ;
+
+    // NOTE(review): presumably returns once the state is no longer
+    // Inactive (set to Active or Terminating by the master) -- confirm
+    // against wait_yield.
+    wait_yield( this_thread.m_pool_state , ThreadsExec::Inactive );
+  }
+}
+
+// Construct either the master-process object (s_threads_process) or the
+// object owned by a spawned thread.
+//
+// A spawned thread picks its entry in 's_threads_exec' -- either the index
+// passed through 's_current_function_arg' or the one returned by hwloc
+// binding -- claims it atomically, and reports success or failure to the
+// spawning process through s_threads_process.m_pool_state.
+ThreadsExec::ThreadsExec()
+  : m_pool_base(0)
+  , m_scratch(0)
+  , m_scratch_reduce_end(0)
+  , m_scratch_thread_end(0)
+  , m_pool_rank(0)
+  , m_pool_size(0)
+  , m_pool_fan_size(0)
+  , m_pool_state( ThreadsExec::Terminating )
+{
+  if ( & s_threads_process != this ) {
+
+    // A spawned thread
+
+    ThreadsExec * const nil = 0 ;
+
+    // Which entry in 's_threads_exec', possibly determined from hwloc binding
+    const int entry = ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0])
+                    ? ((size_t)s_current_function_arg)
+                    : size_t(Kokkos::hwloc::bind_this_thread( s_thread_pool_size[0] , s_threads_coord ));
+
+    // Given a good entry set this thread in the 's_threads_exec' array.
+    // The value is narrowed from size_t to int above, so a large value
+    // (NOTE(review): e.g. an all-ones failure result from hwloc binding --
+    // confirm) can become negative; reject it explicitly instead of
+    // indexing before the start of the array.
+    if ( 0 <= entry &&
+         entry < s_thread_pool_size[0] &&
+         nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) {
+
+      m_pool_base     = s_threads_exec ;
+      m_pool_rank     = s_thread_pool_size[0] - ( entry + 1 );
+      m_pool_size     = s_thread_pool_size[0] ;
+      m_pool_fan_size = fan_size( m_pool_rank , m_pool_size );
+      m_pool_state    = ThreadsExec::Active ;
+
+      s_threads_pid[ m_pool_rank ] = pthread_self();
+
+      // Inform spawning process that the threads_exec entry has been set.
+      s_threads_process.m_pool_state = ThreadsExec::Active ;
+    }
+    else {
+      // Inform spawning process that the threads_exec entry could not be set.
+      s_threads_process.m_pool_state = ThreadsExec::Terminating ;
+    }
+  }
+  else {
+    // Enables 'parallel_for' to execute on uninitialized Threads device
+    m_pool_rank  = 0 ;
+    m_pool_size  = 1 ;
+    m_pool_state = ThreadsExec::Inactive ;
+
+    s_threads_pid[ m_pool_rank ] = pthread_self();
+  }
+}
+
+// Reset this object's fields and, for a spawned thread, release its
+// entry in the pool table and notify the master process.
+ThreadsExec::~ThreadsExec()
+{
+  // Table entry of this thread; computed before the fields are cleared.
+  const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
+
+  m_pool_base   = 0 ;
+  m_scratch     = 0 ;
+  m_scratch_reduce_end = 0 ;
+  m_scratch_thread_end = 0 ;
+  m_pool_rank     = 0 ;
+  m_pool_size     = 0 ;
+  m_pool_fan_size = 0 ;
+
+  m_pool_state  = ThreadsExec::Terminating ;
+
+  // The range check guards against an 'entry' computed from fields that
+  // never described a valid pool position.
+  if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) {
+    ThreadsExec * const nil = 0 ;
+
+    // Clear the table entry only if it still refers to this object.
+    atomic_compare_exchange( s_threads_exec + entry , this , nil );
+
+    // Signal the master process; finalize() waits on this state.
+    s_threads_process.m_pool_state = ThreadsExec::Terminating ;
+  }
+}
+
+
+// Total number of threads in the pool (zero when no pool exists).
+int ThreadsExec::get_thread_count()
+{
+  const int total_threads = s_thread_pool_size[0] ;
+  return total_threads ;
+}
+
+// Return the pool thread whose rank is 'init_thread_rank'.
+//
+// Throws a runtime exception (via throw_runtime_exception) when the rank
+// is outside [ 0 , thread count ), when the table entry is empty, or when
+// the thread stored there reports a different rank.
+ThreadsExec * ThreadsExec::get_thread( const int init_thread_rank )
+{
+  // Ranks are stored in reverse order in the pool table.
+  // A negative rank would index past the live entries, so it is treated
+  // the same as a rank that is too large: no such thread.
+  ThreadsExec * const th =
+    ( 0 <= init_thread_rank && init_thread_rank < s_thread_pool_size[0] )
+    ? s_threads_exec[ s_thread_pool_size[0] - ( init_thread_rank + 1 ) ] : 0 ;
+
+  if ( 0 == th || th->m_pool_rank != init_thread_rank ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : "
+        << "thread " << init_thread_rank << " of " << s_thread_pool_size[0] ;
+    if ( 0 == th ) {
+      msg << " does not exist" ;
+    }
+    else {
+      msg << " has wrong thread_rank " << th->m_pool_rank ;
+    }
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  return th ;
+}
+
+//----------------------------------------------------------------------------
+
+// Work function: record the calling thread's hwloc coordinate in the
+// slot of 's_threads_coord' selected by the thread's pool rank.
+void ThreadsExec::execute_get_binding( ThreadsExec & exec , const void * )
+{
+  std::pair<unsigned,unsigned> & slot = s_threads_coord[ exec.m_pool_rank ];
+
+  slot = Kokkos::hwloc::get_this_thread_coordinate();
+}
+
+// Work function installed by sleep().
+// sleep() acquires the global lock before activating the threads and
+// wake() releases it, so each thread presumably blocks in global_lock()
+// until wake() is called -- NOTE(review): confirm global_lock semantics.
+void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
+{
+  ThreadsExec::global_lock();
+  ThreadsExec::global_unlock();
+
+  const int n = exec.m_pool_fan_size ;
+  const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
+
+  // Wait on each fan-in partner (table entries rank_rev + 2^i) with its
+  // state compared against Active.
+  for ( int i = 0 ; i < n ; ++i ) {
+    Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+  }
+
+  exec.m_pool_state = ThreadsExec::Inactive ;
+}
+
+}
+}
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Check that the caller is the master process and, when 'initialized' is
+// true, that a thread pool exists.  Each failed check is reported through
+// throw_runtime_exception with a message prefixed by 'name'.
+void ThreadsExec::verify_is_process( const std::string & name , const bool initialized )
+{
+  const bool called_by_worker = ! is_process();
+
+  if ( called_by_worker ) {
+    Kokkos::Impl::throw_runtime_exception(
+      name + " FAILED : Called by a worker thread, can only be called by the master process." );
+  }
+
+  const bool pool_missing = initialized && 0 == s_thread_pool_size[0] ;
+
+  if ( pool_missing ) {
+    Kokkos::Impl::throw_runtime_exception(
+      name + " FAILED : Threads not initialized." );
+  }
+}
+
+// Report (as 0 or 1) whether a thread function is currently executing
+// in parallel.
+int ThreadsExec::in_parallel()
+{
+  // No thread function is in execution.
+  if ( ! s_current_function ) { return 0 ; }
+
+  // The function argument is the special threads process argument.
+  if ( & s_threads_process == s_current_function_arg ) { return 0 ; }
+
+  // Parallel if the master process is a worker in the pool,
+  // or if the caller is not the master process.
+  return s_threads_process.m_pool_base || ! is_process();
+}
+
+// Wait for root thread to become inactive
+// and then clear the current work function and its argument.
+void ThreadsExec::fence()
+{
+  // Only wait when a pool exists; entry 0 of the table is the root thread.
+  if ( s_thread_pool_size[0] ) {
+    // Wait for the root thread to complete:
+    Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
+  }
+
+  // Zero values mark "nothing executing"; start() checks for this.
+  s_current_function     = 0 ;
+  s_current_function_arg = 0 ;
+}
+
+/** \brief  Begin execution of the asynchronous functor
+ *
+ *  Installs 'func' and 'arg' as the current work function, activates
+ *  every pool thread and, if the master process is part of the pool,
+ *  runs 'func' on the master as well.
+ *  Throws (via throw_runtime_exception) if a function is already installed
+ *  or if the checks in verify_is_process fail.
+ */
+void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const void * arg )
+{
+  verify_is_process("ThreadsExec::start" , true );
+
+  if ( s_current_function || s_current_function_arg ) {
+    Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) );
+  }
+
+  // Publish the work before any thread is activated.
+  s_current_function     = func ;
+  s_current_function_arg = arg ;
+
+  // Activate threads:
+  // walks the table from the last entry down to entry 0.
+  for ( int i = s_thread_pool_size[0] ; 0 < i-- ; ) {
+    s_threads_exec[i]->m_pool_state = ThreadsExec::Active ;
+  }
+
+  if ( s_threads_process.m_pool_size ) {
+    // Master process is the root thread, run it:
+    (*func)( s_threads_process , arg );
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+// Put the pool threads to sleep by making them run execute_sleep while
+// the global lock is held.
+// Returns false if the threads are already running execute_sleep,
+// true otherwise.
+bool ThreadsExec::sleep()
+{
+  verify_is_process("ThreadsExec::sleep", true );
+
+  if ( & execute_sleep == s_current_function ) return false ;
+
+  // Let any running work finish first.
+  fence();
+
+  // Held until wake() releases it.
+  ThreadsExec::global_lock();
+
+  s_current_function = & execute_sleep ;
+
+  // Activate threads:
+  for ( unsigned i = s_thread_pool_size[0] ; 0 < i ; ) {
+    s_threads_exec[--i]->m_pool_state = ThreadsExec::Active ;
+  }
+
+  return true ;
+}
+
+// Undo sleep(): release the global lock and wait for the threads to
+// finish execute_sleep.
+// Returns false if the threads are not currently running execute_sleep,
+// true otherwise.
+bool ThreadsExec::wake()
+{
+  verify_is_process("ThreadsExec::wake", true );
+
+  if ( & execute_sleep != s_current_function ) return false ;
+
+  ThreadsExec::global_unlock();
+
+  // If the master process is part of the pool it runs its own share of
+  // execute_sleep here.
+  if ( s_threads_process.m_pool_base ) {
+    execute_sleep( s_threads_process , 0 );
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+  }
+
+  fence();
+
+  return true ;
+}
+
+//----------------------------------------------------------------------------
+
+// Run 'func' on the pool threads one at a time: each thread is activated
+// and waited on before the next one is started.
+// The function argument is set to the address of s_threads_process,
+// which in_parallel() recognizes as "not a parallel execution".
+void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
+{
+  s_current_function = func ;
+  s_current_function_arg = & s_threads_process ;
+
+  // When the master process occupies entry 0 it is handled separately below.
+  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
+
+  for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
+    ThreadsExec & th = * s_threads_exec[ --i ];
+
+    th.m_pool_state = ThreadsExec::Active ;
+
+    // NOTE(review): presumably returns once the thread is no longer
+    // Active -- confirm against wait_yield.
+    wait_yield( th.m_pool_state , ThreadsExec::Active );
+  }
+
+  if ( s_threads_process.m_pool_base ) {
+    s_threads_process.m_pool_state = ThreadsExec::Active ;
+    (*func)( s_threads_process , 0 );
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+  }
+
+  s_current_function_arg = 0 ;
+  s_current_function = 0 ;
+}
+
+//----------------------------------------------------------------------------
+
+// Reduction scratch memory of the master process object.
+void * ThreadsExec::root_reduce_scratch()
+{
+  void * const root_scratch = s_threads_process.reduce_memory();
+  return root_scratch ;
+}
+
+// Work function: replace the scratch allocation of 'exec' with one of the
+// size currently recorded in s_threads_process.
+void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
+{
+  // Release any previous allocation.
+  if ( exec.m_scratch ) {
+    HostSpace::decrement( exec.m_scratch );
+    exec.m_scratch = 0 ;
+  }
+
+  // Copy the requested sizes from the master process object.
+  exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
+  exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
+
+  // A total size of zero means "deallocate only".
+  if ( s_threads_process.m_scratch_thread_end ) {
+
+    exec.m_scratch =
+      HostSpace::allocate( "thread_scratch" , typeid(unsigned char) , 1 , s_threads_process.m_scratch_thread_end );
+
+    unsigned * ptr = (unsigned *)( exec.m_scratch );
+    unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
+
+    // touch on this thread
+    // (zero-fills the allocation in units of 'unsigned')
+    while ( ptr < end ) *ptr++ = 0 ;
+  }
+}
+
+// Ensure every thread has at least 'reduce_size' bytes of reduction
+// scratch and 'thread_size' bytes of thread scratch, or release all
+// scratch when both are zero.
+// Returns the scratch pointer stored in the master process object.
+void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
+{
+  enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
+
+  fence();
+
+  const size_t old_reduce_size = s_threads_process.m_scratch_reduce_end ;
+  const size_t old_thread_size = s_threads_process.m_scratch_thread_end - s_threads_process.m_scratch_reduce_end ;
+
+  // Round both requests up to a multiple of MEMORY_ALIGNMENT
+  // (assumes MEMORY_ALIGNMENT is a power of two -- TODO confirm).
+  reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+  thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+
+  // Increase size or deallocate completely.
+  // Requests that fit in the existing allocation leave it untouched.
+
+  if ( ( old_reduce_size < reduce_size ) ||
+       ( old_thread_size < thread_size ) ||
+       ( ( reduce_size == 0 && thread_size == 0 ) &&
+         ( old_reduce_size != 0 || old_thread_size != 0 ) ) ) {
+
+    verify_is_process( "ThreadsExec::resize_scratch" , true );
+
+    // 'm_scratch_thread_end' is the combined size of both regions.
+    s_threads_process.m_scratch_reduce_end = reduce_size ;
+    s_threads_process.m_scratch_thread_end = reduce_size + thread_size ;
+
+    // Each thread reallocates its own scratch, one thread at a time.
+    execute_serial( & execute_resize_scratch );
+
+    s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
+  }
+
+  return s_threads_process.m_scratch ;
+}
+
+//----------------------------------------------------------------------------
+
+// Write a one-line summary of the Threads configuration to 's'.
+// With 'detail' set and a pool present, one additional line is written
+// per pool entry with its hwloc coordinate, rank and fan-in partners.
+void ThreadsExec::print_configuration( std::ostream & s , const bool detail )
+{
+  verify_is_process("ThreadsExec::print_configuration",false);
+
+  fence();
+
+  const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+  // Forestall compiler warnings for unused variables.
+  (void) numa_count;
+  (void) cores_per_numa;
+  (void) threads_per_core;
+
+  s << "Kokkos::Threads" ;
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+  s << " KOKKOS_HAVE_PTHREAD" ;
+#endif
+#if defined( KOKKOS_HAVE_HWLOC )
+  s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ;
+#endif
+
+  if ( s_thread_pool_size[0] ) {
+    s << " threads[" << s_thread_pool_size[0] << "]"
+      << " threads_per_numa[" << s_thread_pool_size[1] << "]"
+      << " threads_per_core[" << s_thread_pool_size[2] << "]"
+      ;
+    // The master process is not part of the pool.
+    if ( 0 == s_threads_process.m_pool_base ) { s << " Asynchronous" ; }
+    s << " ReduceScratch[" << s_current_reduce_size << "]"
+      << " SharedScratch[" << s_current_shared_size << "]" ;
+    s << std::endl ;
+
+    if ( detail ) {
+
+      // Have every thread store its coordinate in 's_threads_coord'.
+      execute_serial( & execute_get_binding );
+
+      for ( int i = 0 ; i < s_thread_pool_size[0] ; ++i ) {
+        ThreadsExec * const th = s_threads_exec[i] ;
+        s << "  Thread hwloc("
+          << s_threads_coord[i].first << "."
+          << s_threads_coord[i].second << ")" ;
+
+        // Reset the slot to all-ones after it has been printed.
+        s_threads_coord[i].first  = ~0u ;
+        s_threads_coord[i].second = ~0u ;
+
+        if ( th ) {
+          const int rank_rev = th->m_pool_size - ( th->m_pool_rank + 1 );
+
+          s << " rank(" << th->m_pool_rank << ")" ;
+
+          if ( th->m_pool_fan_size ) {
+            s << " Fan{" ;
+            // Fan-in partners sit at table entries rank_rev + 2^j.
+            for ( int j = 0 ; j < th->m_pool_fan_size ; ++j ) {
+              s << " " << th->m_pool_base[rank_rev+(1<<j)]->m_pool_rank ;
+            }
+            s << " }" ;
+          }
+
+          if ( th == & s_threads_process ) {
+            s << " is_process" ;
+          }
+        }
+        s << std::endl ;
+      }
+    }
+  }
+  else {
+    s << " not initialized" << std::endl ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+// Returns 1 when entry 0 of the pool table is set, otherwise 0.
+int ThreadsExec::is_initialized()
+{
+  ThreadsExec * const root = s_threads_exec[0] ;
+  return root ? 1 : 0 ;
+}
+
+// Create the thread pool.
+//
+//   thread_count , use_numa_count , use_cores_per_numa :
+//     requested sizes, passed to hwloc::thread_mapping before being used.
+//   allow_asynchronous_threadpool :
+//     forwarded to hwloc::thread_mapping.
+//
+// Throws (via throw_runtime_exception) if a pool already exists or if one
+// or more threads could not be spawned.  A call made while a pool already
+// exists leaves that pool untouched.
+void ThreadsExec::initialize( unsigned thread_count ,
+                              unsigned use_numa_count ,
+                              unsigned use_cores_per_numa ,
+                              bool allow_asynchronous_threadpool )
+{
+  static const Sentinel sentinel ;
+
+  const bool is_initialized = 0 != s_thread_pool_size[0] ;
+
+  unsigned thread_spawn_failed = 0 ;
+
+  if ( ! is_initialized ) {
+
+    // Start from an empty pool table.  This must only happen when no pool
+    // exists: clearing the table of a live pool would leave fence() and
+    // finalize() dereferencing null entries after the "already
+    // initialized" error below has been thrown.
+    for ( int i = 0 ; i < ThreadsExec::MAX_THREAD_COUNT ; ++i ) {
+      s_threads_exec[i] = 0 ;
+    }
+
+    // If thread_count, use_numa_count, or use_cores_per_numa are zero
+    // then they will be given default values based upon hwloc detection
+    // and allowed asynchronous execution.
+
+    const bool hwloc_avail = hwloc::available();
+
+    const unsigned thread_spawn_begin =
+      hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
+                             allow_asynchronous_threadpool ,
+                             thread_count ,
+                             use_numa_count ,
+                             use_cores_per_numa ,
+                             s_threads_coord );
+
+    const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;
+
+    if ( thread_spawn_begin ) {
+      // Synchronous with s_threads_coord[0] as the process core
+      // Claim entry #0 for binding the process core.
+      s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
+    }
+
+    // NOTE(review): the divisions assume thread_mapping has replaced zero
+    // values of use_numa_count / use_cores_per_numa -- confirm.
+    s_thread_pool_size[0] = thread_count ;
+    s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
+    s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;
+    s_current_function = & execute_function_noop ; // Initialization work function
+
+    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
+
+      s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+
+      // If hwloc available then spawned thread will
+      // choose its own entry in 's_threads_coord'
+      // otherwise specify the entry.
+      s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_avail ? ~0u : ith );
+
+      // Spawn thread executing the 'driver()' function.
+      // Wait until spawned thread has attempted to initialize.
+      // If spawning and initialization is successful then
+      // an entry in 's_threads_exec' will be assigned.
+      if ( ThreadsExec::spawn() ) {
+        wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
+      }
+      if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
+    }
+
+    // Wait for all spawned threads to deactivate before zeroing the function.
+
+    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
+      // Try to protect against cache coherency failure by casting to volatile.
+      ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
+      if ( th ) {
+        wait_yield( th->m_pool_state , ThreadsExec::Active );
+      }
+      else {
+        ++thread_spawn_failed ;
+      }
+    }
+
+    s_current_function     = 0 ;
+    s_current_function_arg = 0 ;
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+
+    if ( ! thread_spawn_failed ) {
+      // Bind process to the core on which it was located before spawning occurred
+      Kokkos::hwloc::bind_this_thread( proc_coord );
+
+      if ( thread_spawn_begin ) { // Include process in pool.
+        s_threads_exec[0]                 = & s_threads_process ;
+        s_threads_process.m_pool_base     = s_threads_exec ;
+        s_threads_process.m_pool_rank     = thread_count - 1 ; // Reversed for scan-compatible reductions
+        s_threads_process.m_pool_size     = thread_count ;
+        s_threads_process.m_pool_fan_size = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
+      }
+      else {
+        s_threads_process.m_pool_base = 0 ;
+        s_threads_process.m_pool_rank = 0 ;
+        s_threads_process.m_pool_size = 0 ;
+        s_threads_process.m_pool_fan_size = 0 ;
+      }
+
+      // Initial allocations:
+      ThreadsExec::resize_scratch( 1024 , 1024 );
+    }
+    else {
+      // Spawning failed: report "no pool" through the size table.
+      s_thread_pool_size[0] = 0 ;
+      s_thread_pool_size[1] = 0 ;
+      s_thread_pool_size[2] = 0 ;
+    }
+  }
+
+  if ( is_initialized || thread_spawn_failed ) {
+
+    std::ostringstream msg ;
+
+    msg << "Kokkos::Threads::initialize ERROR" ;
+
+    if ( is_initialized ) {
+      msg << " : already initialized" ;
+    }
+    if ( thread_spawn_failed ) {
+      msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
+    }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+// Tear down the thread pool: release scratch memory, terminate every
+// spawned thread, and reset the master process object so it can run solo.
+void ThreadsExec::finalize()
+{
+  verify_is_process("ThreadsExec::finalize",false);
+
+  fence();
+
+  // Release all scratch allocations.
+  resize_scratch(0,0);
+
+  // When the master process occupies entry 0 it is handled separately below.
+  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
+
+  for ( unsigned i = s_thread_pool_size[0] ; begin < i-- ; ) {
+
+    if ( s_threads_exec[i] ) {
+
+      s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating ;
+
+      // The terminating thread's destructor changes the master's state,
+      // which presumably ends this wait -- NOTE(review): confirm
+      // wait_yield semantics.
+      wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
+
+      // Re-arm the handshake for the next thread.
+      s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+    }
+
+    s_threads_pid[i] = 0 ;
+  }
+
+  if ( s_threads_process.m_pool_base ) {
+    // Explicit destructor call resets the master object's fields;
+    // the object itself stays in use and is re-populated below.
+    ( & s_threads_process )->~ThreadsExec();
+    s_threads_exec[0] = 0 ;
+  }
+
+  Kokkos::hwloc::unbind_this_thread();
+
+  s_thread_pool_size[0] = 0 ;
+  s_thread_pool_size[1] = 0 ;
+  s_thread_pool_size[2] = 0 ;
+
+  // Reset master thread to run solo.
+  s_threads_process.m_pool_base     = 0 ;
+  s_threads_process.m_pool_rank     = 0 ;
+  s_threads_process.m_pool_size     = 1 ;
+  s_threads_process.m_pool_fan_size = 0 ;
+  s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+// Return a reference to a Threads object; the argument is ignored.
+// NOTE(review): the reference is formed by dereferencing a null pointer,
+// which is undefined behavior. This presumably relies on Threads having no
+// per-instance state that callers touch -- confirm against the class
+// definition before using the returned reference.
+Threads & Threads::instance(int)
+{
+  static Threads * const t = 0 ;
+  return *t ;
+}
+
+// Pool size at the given depth:
+// 0 = total threads, 1 = threads per numa, 2 = threads per core.
+// 'depth' is used as an index without a range check.
+int Threads::thread_pool_size( int depth )
+{
+  const int * const sizes = Impl::s_thread_pool_size ;
+  return sizes[ depth ];
+}
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+// Find the calling thread in the table of recorded pthread ids.
+// Returns the index of the first matching slot, or the total thread
+// count when the calling thread is not found.
+int Threads::thread_pool_rank()
+{
+  const pthread_t self_id = pthread_self();
+
+  int slot = 0 ;
+  for ( ; slot < Impl::s_thread_pool_size[0] ; ++slot ) {
+    if ( self_id == Impl::s_threads_pid[ slot ] ) { break ; }
+  }
+  return slot ;
+}
+#endif
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD ) */
+
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@ -0,0 +1,709 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADSEXEC_HPP
+#define KOKKOS_THREADSEXEC_HPP
+
+#include <stdio.h>
+
+#include <utility>
+#include <impl/Kokkos_spinwait.hpp>
+
+#include <Kokkos_Atomic.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< class > struct ThreadsExecAdapter ;
+
+//----------------------------------------------------------------------------
+
+class ThreadsExecTeamMember ;
+
+/** \brief  Per-thread execution record for the Threads execution space.
+ *
+ *  Each record holds the thread's rank and size within the pool, a pointer
+ *  to the pool's array of records (m_pool_base), a scratch allocation and a
+ *  volatile state flag (m_pool_state).  The inline member functions below
+ *  synchronize threads by writing these state flags and spin-waiting on
+ *  the flags of other records.
+ *
+ *  NOTE(review): Impl::spinwait is declared in impl/Kokkos_spinwait.hpp,
+ *  which is not visible here.  The "Wait: A -> B" comments in this class
+ *  indicate that spinwait( flag , value ) blocks while 'flag' still holds
+ *  'value' (the same contract as wait_yield) -- confirm against that header.
+ */
+class ThreadsExec {
+public:
+
+  // Fan array has log_2(NT) reduction threads plus 2 scan threads
+  // Currently limited to 16k threads.
+  enum { MAX_FAN_COUNT    = 16 };
+  enum { MAX_THREAD_COUNT = 1 << ( MAX_FAN_COUNT - 2 ) };
+  enum { VECTOR_LENGTH    = 8 };
+
+  /** \brief States of a worker thread */
+  enum { Terminating ///<  Termination in progress
+       , Inactive    ///<  Exists, waiting for work
+       , Active      ///<  Exists, performing work
+       , Rendezvous  ///<  Exists, waiting in a barrier or reduce
+
+       // States used only by the scan_large protocol:
+       , ScanCompleted
+       , ScanAvailable
+       , ReductionAvailable
+       };
+
+private:
+
+  friend class ThreadsExecTeamMember ;
+  friend class Kokkos::Threads ;
+
+  // Fan-in operations' root is the highest ranking thread
+  // to place the 'scan' reduction intermediate values on
+  // the threads that need them.
+  // For a simple reduction the thread location is arbitrary.
+
+  /** \brief  Reduction memory reserved for team reductions */
+  enum { REDUCE_TEAM_BASE = 512 };
+
+  ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
+
+  void        * m_scratch ;             ///< Scratch allocation; reduce memory starts at its base
+  int           m_scratch_reduce_end ;  ///< Byte offset from m_scratch at which scratch_memory() begins
+  int           m_scratch_thread_end ;
+  int           m_pool_rank ;
+  int           m_pool_size ;
+  int           m_pool_fan_size ;       ///< Number of fan-in partners this thread waits on
+  int volatile  m_pool_state ;  ///< State for global synchronizations
+
+
+  static void global_lock();
+  static void global_unlock();
+  static bool spawn();
+
+  static void execute_resize_scratch( ThreadsExec & , const void * );
+  static void execute_sleep(          ThreadsExec & , const void * );
+  static void execute_get_binding(    ThreadsExec & , const void * );
+
+  // Copy construction and assignment are declared private.
+  ThreadsExec( const ThreadsExec & );
+  ThreadsExec & operator = ( const ThreadsExec & );
+
+  static void execute_serial( void (*)( ThreadsExec & , const void * ) );
+
+public:
+
+  KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size ; }
+  KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank ; }
+
+  static int get_thread_count();
+  static ThreadsExec * get_thread( const int init_thread_rank );
+
+  /** \brief  Base of this thread's scratch allocation, used for reductions. */
+  inline void * reduce_memory() const { return ((unsigned char *) m_scratch ); }
+  /** \brief  Scratch allocation offset by m_scratch_reduce_end bytes. */
+  inline void * scratch_memory() const { return ((unsigned char *) m_scratch ) + m_scratch_reduce_end ; }
+
+  static void driver(void);
+
+  ~ThreadsExec();
+  ThreadsExec();
+
+  static void * resize_scratch( size_t reduce_size , size_t thread_size );
+
+  static void * root_reduce_scratch();
+
+  static bool is_process();
+
+  static void verify_is_process( const std::string & , const bool initialized );
+
+  static int is_initialized();
+
+  static void initialize( unsigned thread_count ,
+                          unsigned use_numa_count ,
+                          unsigned use_cores_per_numa ,
+                          bool allow_asynchronous_threadpool );
+
+  static void finalize();
+
+  /* Given a requested team size, return valid team size */
+  static unsigned team_size_valid( unsigned );
+
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //------------------------------------
+
+  static void wait_yield( volatile int & , const int );
+
+  //------------------------------------
+  // All-thread functions:
+
+  /** \brief  Fan-in reduction across the pool.
+   *
+   *  For each of this thread's m_pool_fan_size partners, located at
+   *  m_pool_base[ rev_rank + 2^i ], wait on the partner's state flag
+   *  against 'Active' and then join the partner's reduce_memory() into
+   *  this thread's reduce_memory().  The thread whose reversed rank is
+   *  zero additionally applies Reduce::final to its own reduce memory.
+   *
+   *  This function does not modify the calling thread's own state flag.
+   */
+  template< class Functor >
+  inline
+  void fan_in_reduce( const Functor & f ) const
+    {
+      typedef ReduceAdapter< Functor > Reduce ;
+
+      // Reversed rank: zero for the highest ranking thread.
+      const int rev_rank  = m_pool_size - ( m_pool_rank + 1 );
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+
+        ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
+
+        Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
+
+        Reduce::join( f , reduce_memory() , fan.reduce_memory() );
+      }
+
+      if ( ! rev_rank ) {
+        Reduce::final( f , reduce_memory() );
+      }
+    }
+
+  /** \brief  Fan-in without a reduction: wait on the state flag of each of
+   *          this thread's m_pool_fan_size partners against 'Active'.
+   */
+  inline
+  void fan_in() const
+    {
+      const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        Impl::spinwait( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
+      }
+    }
+
+  /** \brief  Pool-wide scan in which each thread participates in the
+   *          inclusive scan and then copies its predecessor's value.
+   *
+   *  The reduce memory of each thread is used as two consecutive arrays of
+   *  Reduce::value_count( f ) scalars: [0,count) holds the reduction value
+   *  and, on exit, the exclusive scan value; [count,2*count) holds the
+   *  inclusive scan value.
+   *
+   *  The order of the state writes and spin-waits below is the protocol;
+   *  do not reorder them.
+   */
+  template< class FunctorType >
+  inline
+  void scan_large( const FunctorType & f )
+    {
+      // Sequence of states:
+      //  0) Active             : entry and exit state
+      //  1) ReductionAvailable : reduction value available
+      //  2) ScanAvailable      : inclusive scan value available
+      //  3) Rendezvous         : All threads inclusive scan value are available
+      //  4) ScanCompleted      : exclusive scan value copied
+
+      typedef ReduceAdapter< FunctorType > Reduce ;
+      typedef typename Reduce::scalar_type scalar_type ;
+
+      const int      rev_rank = m_pool_size - ( m_pool_rank + 1 );
+      const unsigned count    = Reduce::value_count( f );
+
+      scalar_type * const work_value = (scalar_type *) reduce_memory();
+
+      //--------------------------------
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
+
+        // Wait: Active -> ReductionAvailable (or ScanAvailable)
+        Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
+        Reduce::join( f , work_value , fan.reduce_memory() );
+      }
+
+      // Copy reduction value to scan value before releasing from this phase.
+      for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i] ; }
+
+      if ( rev_rank ) {
+
+        // Set: Active -> ReductionAvailable
+        m_pool_state = ThreadsExec::ReductionAvailable ;
+
+        // Wait for contributing threads' scan value to be available.
+        if ( ( 1 << m_pool_fan_size ) < ( m_pool_rank + 1 ) ) {
+          ThreadsExec & th = *m_pool_base[ rev_rank + ( 1 << m_pool_fan_size ) ] ;
+
+          // Wait: Active             -> ReductionAvailable
+          // Wait: ReductionAvailable -> ScanAvailable
+          Impl::spinwait( th.m_pool_state , ThreadsExec::Active );
+          Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable );
+
+          Reduce::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
+        }
+
+        // This thread has completed inclusive scan
+        // Set: ReductionAvailable -> ScanAvailable
+        m_pool_state = ThreadsExec::ScanAvailable ;
+
+        // Wait for all threads to complete inclusive scan
+        // Wait: ScanAvailable -> Rendezvous
+        Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable );
+      }
+
+      //--------------------------------
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
+        // Wait: ReductionAvailable -> ScanAvailable
+        Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable );
+        // Set: ScanAvailable -> Rendezvous
+        fan.m_pool_state = ThreadsExec::Rendezvous ;
+      }
+
+      // All threads have completed the inclusive scan.
+      // All non-root threads are in the Rendezvous state.
+      // Threads are free to overwrite their reduction value.
+      //--------------------------------
+
+      if ( ( rev_rank + 1 ) < m_pool_size ) {
+        // Exclusive scan: copy the previous thread's inclusive scan value
+
+        ThreadsExec & th = *m_pool_base[ rev_rank + 1 ] ; // Not the root thread
+
+        const scalar_type * const src_value = ((scalar_type *)th.reduce_memory()) + count ;
+
+        for ( unsigned j = 0 ; j < count ; ++j ) { work_value[j] = src_value[j]; }
+      }
+      else {
+        // Lowest ranking thread has no predecessor: its exclusive scan
+        // value is whatever Reduce::init produces.
+        (void) Reduce::init( f , work_value );
+      }
+
+      //--------------------------------
+      // Wait for all threads to copy previous thread's inclusive scan value
+      // Wait for all threads: Rendezvous -> ScanCompleted
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
+      }
+      if ( rev_rank ) {
+        // Set: ScanAvailable -> ScanCompleted
+        m_pool_state = ThreadsExec::ScanCompleted ;
+        // Wait: ScanCompleted -> Active
+        Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted );
+      }
+      // Set: ScanCompleted -> Active
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
+      }
+    }
+
+  /** \brief  Pool-wide scan in which the root thread performs the whole
+   *          thread-level scan serially.
+   *
+   *  Uses the same two-array layout of reduce memory as scan_large:
+   *  [0,count) and [count,2*count) scalars.  After the fan-in, the thread
+   *  with reversed rank zero walks ranks 0 .. m_pool_size-1 via
+   *  get_thread( rank ), writing each thread's exclusive value into
+   *  [0,count) and its inclusive value into [count,2*count), and then
+   *  releases its fan-in partners by setting their state to Active.
+   */
+  template< class FunctorType >
+  inline
+  void scan_small( const FunctorType & f )
+    {
+      typedef ReduceAdapter< FunctorType > Reduce ;
+      typedef typename Reduce::scalar_type scalar_type ;
+
+      const int      rev_rank = m_pool_size - ( m_pool_rank + 1 );
+      const unsigned count    = Reduce::value_count( f );
+
+      scalar_type * const work_value = (scalar_type *) reduce_memory();
+
+      //--------------------------------
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        // Wait: Active -> Rendezvous
+        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+      }
+
+      // Duplicate this thread's value into the second (inclusive) array.
+      for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
+
+      if ( rev_rank ) {
+        m_pool_state = ThreadsExec::Rendezvous ;
+        // Wait: Rendezvous -> Active
+        Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
+      }
+      else {
+        // Root thread does the thread-scan before releasing threads
+
+        scalar_type * ptr_prev = 0 ;
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          scalar_type * const ptr = (scalar_type *) get_thread( rank )->reduce_memory();
+          if ( rank ) {
+            // Exclusive value = previous thread's inclusive value;
+            // then fold it into this thread's inclusive value.
+            for ( unsigned i = 0 ; i < count ; ++i ) { ptr[i] = ptr_prev[ i + count ]; }
+            Reduce::join( f , ptr + count , ptr );
+          }
+          else {
+            (void) Reduce::init( f , ptr );
+          }
+          ptr_prev = ptr ;
+        }
+      }
+
+      // Release the fan-in partners: Rendezvous -> Active
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
+      }
+    }
+
+  //------------------------------------
+  /** \brief  Wait for previous asynchronous functor to
+   *          complete and release the Threads device.
+   *          Acquire the Threads device and start this functor.
+   */
+  static void start( void (*)( ThreadsExec & , const void * ) , const void * );
+
+/*
+  static unsigned team_max();
+  static unsigned team_recommended();
+  static unsigned hardware_thread_id();
+  static unsigned max_hardware_threads();
+*/
+
+  static int  in_parallel();
+  static void fence();
+  static bool sleep();
+  static bool wake();
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** \brief  Handle given to a functor for one member of a thread team in the
+ *          Threads execution space.
+ *
+ *  A team is a contiguous span of the pool's ThreadsExec records starting
+ *  at m_team_base; index 0 of that span is the member with reversed team
+ *  rank zero (the highest team_rank()).  Team synchronization reuses the
+ *  m_pool_state flags of the underlying ThreadsExec records.
+ *
+ *  NOTE(review): Impl::spinwait is not visible here; it is presumed to
+ *  block while the flag still holds the given value -- confirm against
+ *  impl/Kokkos_spinwait.hpp.
+ */
+class ThreadsExecTeamMember {
+private:
+
+  /** Bytes at the start of scratch_memory() kept for team_reduce / team_scan;
+   *  team shared memory begins after them (see set_team_shared). */
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  typedef Kokkos::Threads execution_space ;
+  typedef execution_space::scratch_memory_space space ;
+
+  Impl::ThreadsExec   & m_exec ;
+  space                 m_team_shared ;
+  ThreadsExec * const * m_team_base ; ///< Base for team fan-in
+  int                   m_team_shared_size ;
+  int                   m_team_size ;
+  int                   m_team_rank ;
+  int                   m_team_rank_rev ;
+  int                   m_league_size ;
+  int                   m_league_end ;   ///< One past the last league rank this team executes
+  int                   m_league_rank ;
+
+  // Re-construct m_team_shared in place so that it refers to the scratch
+  // memory of the team's first record, offset by TEAM_REDUCE_SIZE bytes,
+  // with extent m_team_shared_size.
+  inline
+  void set_team_shared()
+    { new( & m_team_shared ) space( ((char *) (*m_team_base)->scratch_memory()) + TEAM_REDUCE_SIZE , m_team_shared_size ); }
+  
+  // Fan-in and wait until the matching fan-out is called.
+  // The root thread which does not wait will return true.
+  // All other threads will return false during the fan-out.
+  KOKKOS_INLINE_FUNCTION bool team_fan_in() const
+    {
+      int n , j ;
+
+      // Wait for fan-in threads
+      // Partners are at reversed ranks m_team_rank_rev + n for n = 1,2,4,...
+      // while bit 'n' of m_team_rank_rev is clear and the partner is in the team.
+      for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_team_base[j]->m_pool_state , ThreadsExec::Active );
+      }
+
+      // If not root then wait for release
+      if ( m_team_rank_rev ) {
+        m_exec.m_pool_state = ThreadsExec::Rendezvous ;
+        Impl::spinwait( m_exec.m_pool_state , ThreadsExec::Rendezvous );
+      }
+
+      return ! m_team_rank_rev ;
+    }
+
+  // Release the same partners that team_fan_in waited on by setting
+  // their state flag back to Active.
+  KOKKOS_INLINE_FUNCTION void team_fan_out() const
+    {
+      int n , j ;
+      for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
+        m_team_base[j]->m_pool_state = ThreadsExec::Active ;
+      }
+    }
+
+public:
+
+  KOKKOS_INLINE_FUNCTION static int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space & team_shmem() const
+    { return m_team_shared ; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
+
+  /** \brief  Team barrier: a fan-in followed by the matching fan-out. */
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+    {
+      team_fan_in();
+      team_fan_out();
+    }
+
+  /** \brief  Sum 'value' over all members of the team with operator +=.
+   *
+   *  Each member stores its value at the start of its own scratch memory.
+   *  The member for which team_fan_in() returns true accumulates the values
+   *  of members 1 .. m_team_size-1 into the slot of m_team_base[0].
+   *  Every member returns the value read from that slot.
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
+    {
+      // Make sure there is enough scratch space:
+      // (the selected type is 'void' when Type is too large, which presumably
+      //  makes the statements below fail to compile -- if_c is not visible here)
+      typedef typename if_c< sizeof(Type) < ThreadsExec::REDUCE_TEAM_BASE , Type , void >::type type ;
+
+      *((volatile type*) m_exec.scratch_memory() ) = value ;
+
+      memory_fence();
+
+      type & accum = *((type *) m_team_base[0]->scratch_memory() );
+
+      if ( team_fan_in() ) {
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          accum += *((type *) m_team_base[i]->scratch_memory() );
+        }
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return accum ;
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   *
+   *  When 'global_accum' is null the team's scan starts from zero.
+   */
+  template< typename ArgType >
+  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
+    {
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(ArgType) < ThreadsExec::REDUCE_TEAM_BASE , ArgType , void >::type type ;
+
+      volatile type * const work_value  = ((type*) m_exec.scratch_memory());
+
+      *work_value = value ;
+
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        // m_team_base[0]                 == highest ranking team member
+        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
+        //
+        // 1) copy from lower to higher rank, initialize lowest rank to zero
+        // 2) prefix sum from lowest to highest rank, skipping lowest rank
+
+        type accum = 0 ;
+
+        if ( global_accum ) {
+          // Team total is added atomically to the global accumulator; the
+          // value fetched (the accumulator before this add) is the team's base.
+          for ( int i = m_team_size ; i-- ; ) {
+            type & val = *((type*) m_team_base[i]->scratch_memory());
+            accum += val ;
+          }
+          accum = atomic_fetch_add( global_accum , accum );
+        }
+
+        // Walk from the lowest ranking member to the highest, replacing each
+        // member's value with the running sum that precedes it.
+        for ( int i = m_team_size ; i-- ; ) {
+          type & val = *((type*) m_team_base[i]->scratch_memory());
+          const type offset = accum ;
+          accum += val ;
+          val = offset ;
+        }
+
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return *work_value ;
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename ArgType >
+  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value ) const
+    { return this-> template team_scan<ArgType>( value , 0 ); }
+
+  //----------------------------------------
+  // Private for the driver
+
+  /** \brief  Map the pool thread 'exec' onto a team and a range of league ranks.
+   *
+   *  All members are zero-initialized; if team.league_size() is zero, or this
+   *  thread's reversed rank within its team_alloc() span is not less than
+   *  team.team_size(), they stay zero and valid() is false.
+   */
+  template< class WorkArgTag >
+  ThreadsExecTeamMember( Impl::ThreadsExec & exec
+                       , const TeamPolicy< execution_space , WorkArgTag > & team 
+                       , const int shared_size )
+    : m_exec( exec )
+    , m_team_shared(0,0)
+    , m_team_base(0)
+    , m_team_shared_size( shared_size )
+    , m_team_size(0)
+    , m_team_rank(0)
+    , m_team_rank_rev(0)
+    , m_league_size(0)
+    , m_league_end(0)
+    , m_league_rank(0)
+    {
+      if ( team.league_size() ) {
+        // Execution is using device-team interface:
+
+        const int pool_rank_rev = exec.pool_size() - ( exec.pool_rank() + 1 );
+        const int team_rank_rev = pool_rank_rev % team.team_alloc();
+
+        // May be using fewer threads per team than a multiple of threads per core,
+        // some threads will idle.
+
+        if ( team_rank_rev < team.team_size() ) {
+          const size_t pool_league_size     = exec.pool_size() / team.team_alloc() ;
+          const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ;
+          const size_t pool_league_rank     = pool_league_size - ( pool_league_rank_rev + 1 );
+
+          m_team_base        = exec.m_pool_base + team.team_alloc() * pool_league_rank_rev ;
+          m_team_size        = team.team_size() ;
+          m_team_rank        = team.team_size() - ( team_rank_rev + 1 );
+          m_team_rank_rev    = team_rank_rev ;
+          m_league_size      = team.league_size();
+          // This team executes league ranks [ m_league_rank , m_league_end ).
+          m_league_rank      = ( team.league_size() *  pool_league_rank    ) / pool_league_size ;
+          m_league_end       = ( team.league_size() * (pool_league_rank+1) ) / pool_league_size ;
+
+          set_team_shared();
+        }
+      }
+    }
+
+  /** \brief  True while this member still has a league rank to execute. */
+  bool valid() const
+    { return m_league_rank < m_league_end ; }
+
+  /** \brief  Advance to the next league rank.  If one remains, perform a team
+   *          barrier and re-construct the team shared memory handle. */
+  void next()
+    {
+      if ( ++m_league_rank < m_league_end ) {
+        team_barrier();
+        set_team_shared();
+      }
+    }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Forwards to Impl::ThreadsExec::in_parallel() and returns its result. */
+inline int Threads::in_parallel()
+{ return Impl::ThreadsExec::in_parallel(); }
+
+/** \brief  Forwards to Impl::ThreadsExec::is_initialized() and returns its result. */
+inline int Threads::is_initialized()
+{ return Impl::ThreadsExec::is_initialized(); }
+
+/** \brief  Forwards all four arguments, unchanged and in order, to
+ *          Impl::ThreadsExec::initialize().
+ */
+inline void Threads::initialize(
+  unsigned threads_count ,
+  unsigned use_numa_count ,
+  unsigned use_cores_per_numa ,
+  bool allow_asynchronous_threadpool )
+{
+  Impl::ThreadsExec::initialize( threads_count , use_numa_count , use_cores_per_numa , allow_asynchronous_threadpool );
+}
+
+/** \brief  Forwards to Impl::ThreadsExec::finalize(). */
+inline void Threads::finalize()
+{
+  Impl::ThreadsExec::finalize();
+}
+
+/** \brief  Forwards the output stream and the 'detail' flag to
+ *          Impl::ThreadsExec::print_configuration().
+ */
+inline void Threads::print_configuration( std::ostream & s , const bool detail )
+{
+  Impl::ThreadsExec::print_configuration( s , detail );
+}
+
+/** \brief  Forwards to Impl::ThreadsExec::sleep() and returns its result. */
+inline bool Threads::sleep()
+{ return Impl::ThreadsExec::sleep() ; }
+
+/** \brief  Forwards to Impl::ThreadsExec::wake() and returns its result. */
+inline bool Threads::wake()
+{ return Impl::ThreadsExec::wake() ; }
+
+/** \brief  Forwards to Impl::ThreadsExec::fence(). */
+inline void Threads::fence()
+{ Impl::ThreadsExec::fence() ; }
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Team execution policy specialized for the Threads execution space.
+ *
+ *  Records the requested league size, a team size capped at
+ *  thread_pool_size(1), and a team allocation: the number of pool threads
+ *  set aside per team, which may exceed the team size.
+ */
+template < class WorkArgTag >
+class TeamPolicy< Kokkos::Threads , WorkArgTag > {
+private:
+
+  int m_league_size ;
+  int m_team_size ;
+  int m_team_alloc ;
+
+  /** \brief  Derive the three members from the requested sizes.
+   *
+   *  NOTE(review): the two divisions below have no guard.  A zero
+   *  thread_pool_size(2), or a team size that rounds to zero, divides
+   *  by zero.
+   */
+  inline
+  void init( const int league_size_request 
+           , const int team_size_request )
+   {
+      const int threads_in_pool = execution_space::thread_pool_size(0);
+      const int largest_team    = execution_space::thread_pool_size(1);
+      const int grain           = execution_space::thread_pool_size(2);
+
+      m_league_size = league_size_request ;
+
+      // Cap the request at the largest supported team.
+      if ( team_size_request < largest_team ) {
+        m_team_size = team_size_request ;
+      }
+      else {
+        m_team_size = largest_team ;
+      }
+
+      // Round the team size up to a whole number of grains
+      const int grains_per_team  = ( m_team_size + grain - 1 ) / grain ;
+      const int padded_team_size = grain * grains_per_team ;
+
+      // Number of padded teams that fit in the pool
+      const int teams_in_pool = threads_in_pool / padded_team_size ;
+
+      // Constraint : threads_in_pool = m_team_alloc * teams_in_pool
+      m_team_alloc = threads_in_pool / teams_in_pool ;
+   }
+
+
+public:
+
+  typedef Impl::ExecutionPolicyTag   kokkos_tag ;      ///< Concept tag
+  typedef Kokkos::Threads            execution_space ; ///< Execution space
+
+  /** \brief  Number of threads that participate in each team */
+  inline int team_size() const { return m_team_size ; }
+
+  /** \brief  Number of pool threads allocated to each team */
+  inline int team_alloc() const { return m_team_alloc ; }
+
+  /** \brief  Number of teams in the league, as requested */
+  inline int league_size() const { return m_league_size ; }
+
+  /** \brief  Specify league size, request team size.
+   *          The execution space argument is not used.
+   */
+  TeamPolicy( execution_space & , int league_size_request , int team_size_request )
+    : m_league_size(0)
+    , m_team_size(0)
+    , m_team_alloc(0)
+    {
+      init( league_size_request , team_size_request );
+    }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicy( int league_size_request , int team_size_request )
+    : m_league_size(0)
+    , m_team_size(0)
+    , m_team_alloc(0)
+    {
+      init( league_size_request , team_size_request );
+    }
+
+  /** \brief  Largest team size; the functor argument is not consulted. */
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+    {
+      return execution_space::thread_pool_size(1);
+    }
+
+  typedef Impl::ThreadsExecTeamMember member_type ;
+
+  friend class Impl::ThreadsExecTeamMember ;
+};
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_THREADSEXEC_HPP */
+
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
@ -0,0 +1,254 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+
+/* Standard 'C' Linux libraries */
+
+#include <pthread.h>
+#include <sched.h>
+#include <errno.h>
+
+/* Standard C++ libraries */
+
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <stdexcept>
+
+#include <Kokkos_Threads.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+pthread_mutex_t host_internal_pthread_mutex = PTHREAD_MUTEX_INITIALIZER ;
+
+// Thread entry point with the signature required by pthread_create.
+// Runs ThreadsExec::driver().  A worker cannot recover from an exception
+// without continual intra-thread health checks, which would cost runtime,
+// so any exception that escapes the driver is reported on std::cerr and
+// the whole process is aborted.
+
+void * internal_pthread_driver( void * )
+{
+  try {
+    ThreadsExec::driver();
+    return NULL ;
+  }
+  catch( const std::exception & err ) {
+    std::cerr << "Exception thrown from worker thread: " << err.what() << std::endl ;
+  }
+  catch( ... ) {
+    std::cerr << "Exception thrown from worker thread" << std::endl ;
+  }
+
+  // Reached only after one of the handlers above has reported a failure.
+  std::cerr.flush();
+  std::abort();
+
+  return NULL ; // not reached: std::abort() does not return
+}
+
+} // namespace
+
+//----------------------------------------------------------------------------
+// Spawn a thread
+
+/** \brief  Create one detached, system-scope worker thread that runs
+ *          internal_pthread_driver.
+ *
+ *  \return true only when the attribute object was initialized, both
+ *          attributes were applied, and pthread_create succeeded.
+ *
+ *  The attribute calls must all succeed, so they are chained with '&&'.
+ *  Chaining them with '||' short-circuits after a successful
+ *  pthread_attr_init, so the scope and detach state are never set; and
+ *  when the init fails, the later calls run on an uninitialized object.
+ */
+bool ThreadsExec::spawn()
+{
+  pthread_attr_t attr ;
+
+  // Nothing to destroy if the attribute object was never initialized.
+  if ( 0 != pthread_attr_init( & attr ) ) { return false ; }
+
+  bool result = false ;
+
+  if ( 0 == pthread_attr_setscope(       & attr, PTHREAD_SCOPE_SYSTEM ) &&
+       0 == pthread_attr_setdetachstate( & attr, PTHREAD_CREATE_DETACHED ) ) {
+
+    pthread_t pt ;
+
+    result = 0 == pthread_create( & pt, & attr, internal_pthread_driver, 0 );
+  }
+
+  pthread_attr_destroy( & attr );
+
+  return result ;
+}
+
+//----------------------------------------------------------------------------
+
+// Reports whether the calling thread is the thread that made the first
+// call to this function; that thread's identifier is captured once in a
+// function-local static.
+bool ThreadsExec::is_process()
+{
+  static const pthread_t first_caller = pthread_self();
+
+  const pthread_t caller = pthread_self();
+
+  return pthread_equal( first_caller , caller );
+}
+
+// Acquire the file-local mutex host_internal_pthread_mutex.
+// The return value of pthread_mutex_lock is not checked.
+void ThreadsExec::global_lock()
+{
+  pthread_mutex_lock( & host_internal_pthread_mutex );
+}
+
+// Release the file-local mutex host_internal_pthread_mutex.
+// The return value of pthread_mutex_unlock is not checked.
+void ThreadsExec::global_unlock()
+{
+  pthread_mutex_unlock( & host_internal_pthread_mutex );
+}
+
+//----------------------------------------------------------------------------
+
+// Block the caller for as long as 'flag' equals 'value', calling
+// sched_yield() between checks.  Returns once 'flag' is observed to hold
+// any other value.
+void ThreadsExec::wait_yield( volatile int & flag , const int value )
+{
+  while ( value == flag ) { sched_yield(); }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/* end #if defined( KOKKOS_HAVE_PTHREAD ) */
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_HAVE_WINTHREAD )
+
+/* Windows libraries */
+#include <windows.h>
+#include <process.h>
+
+/* Standard C++ libraries */
+
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <stdexcept>
+
+#include <Kokkos_Threads.hpp>
+
+//----------------------------------------------------------------------------
+// Driver for each created Windows thread
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+// Thread entry point passed to _beginthreadex.
+// Runs ThreadsExec::driver() and returns 0; 'arg' is not used.
+// NOTE(review): unlike the pthread driver, exceptions escaping
+// ThreadsExec::driver() are not caught here.
+unsigned WINAPI internal_winthread_driver( void * arg )
+{
+  ThreadsExec::driver();
+
+  return 0 ;
+}
+
+/** \brief  Lock built on a Windows CRITICAL_SECTION.
+ *
+ *  The constructor and destructor are private; the only instance is the
+ *  one returned by singleton().  The critical section is initialized on
+ *  construction and deleted on destruction.
+ */
+class ThreadLockWindows {
+private:
+  CRITICAL_SECTION  m_handle ;
+
+  ~ThreadLockWindows()
+  { DeleteCriticalSection( & m_handle ); }
+
+  // Defined inline.  A ';' between the declarator and the body would turn
+  // this into a bare declaration followed by a stray block, which does
+  // not compile.
+  ThreadLockWindows()
+  { InitializeCriticalSection( & m_handle ); }
+
+  // Not copyable: declared private and never defined.
+  ThreadLockWindows( const ThreadLockWindows & );
+  ThreadLockWindows & operator = ( const ThreadLockWindows & );
+
+public:
+
+  /** \brief  Access the single lock instance. */
+  static ThreadLockWindows & singleton();
+
+  /** \brief  Enter the critical section. */
+  void lock()
+  { EnterCriticalSection( & m_handle ); }
+
+  /** \brief  Leave the critical section. */
+  void unlock()
+  { LeaveCriticalSection( & m_handle ); }
+};
+
+// Returns a reference to a function-local static instance, constructed on
+// the first call.
+ThreadLockWindows & ThreadLockWindows::singleton()
+{ static ThreadLockWindows self ; return self ; }
+
+} // namespace <>
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Spawn this thread
+
+/** \brief  Create one worker thread that runs internal_winthread_driver.
+ *
+ *  \return true when the thread was created, false otherwise.
+ *
+ *  _beginthreadex returns 0 on failure, so success is a non-zero handle.
+ *  The result is an integer (uintptr_t), not a HANDLE, and must be cast
+ *  before use.  The handle is closed immediately because this code never
+ *  waits on the thread through it; closing it does not stop the thread
+ *  and avoids leaking one handle per spawned worker.
+ */
+bool ThreadsExec::spawn()
+{
+  unsigned Win32ThreadID = 0 ;
+
+  const uintptr_t handle =
+    _beginthreadex(0,0,internal_winthread_driver,0,0, & Win32ThreadID );
+
+  if ( 0 == handle ) { return false ; }
+
+  CloseHandle( reinterpret_cast< HANDLE >( handle ) );
+
+  return true ;
+}
+
+// Windows build: always reports true, whichever thread calls it.
+// NOTE(review): the pthread build compares thread identifiers instead;
+// confirm whether this stub is intentional.
+bool ThreadsExec::is_process() { return true ; }
+
+// Acquire the lock returned by ThreadLockWindows::singleton().
+void ThreadsExec::global_lock()
+{ ThreadLockWindows::singleton().lock(); }
+
+// Release the lock returned by ThreadLockWindows::singleton().
+void ThreadsExec::global_unlock()
+{ ThreadLockWindows::singleton().unlock(); }
+
+// Block the caller for as long as 'flag' equals 'value', calling Sleep(0)
+// between checks.  Returns once 'flag' is observed to hold any other value.
+//
+// The loop is the function body.  An empty '{}' placed after the
+// signature would complete the definition and leave the loop as a stray
+// block at namespace scope, which does not compile.
+void ThreadsExec::wait_yield( volatile int & flag , const int value )
+{
+  while ( value == flag ) { Sleep(0); }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* end #elif defined( KOKKOS_HAVE_WINTHREAD ) */
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+
+
--- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
@ -0,0 +1,327 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_PARALLEL_HPP
+#define KOKKOS_THREADS_PARALLEL_HPP
+
+#include <vector>
+
+#include <Kokkos_Parallel.hpp>
+
+#include <impl/Kokkos_StaticAssert.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** \brief  ParallelFor specialization: RangePolicy executed on Kokkos::Threads.
+ *
+ *  Constructing the object copies the functor and policy, starts 'execute'
+ *  through ThreadsExec::start and then calls ThreadsExec::fence.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Kokkos::Threads , void , IntType , P >
+                 , Kokkos::Threads
+                 >
+{
+public:
+
+  typedef Kokkos::RangePolicy< Kokkos::Threads , void , IntType , P > Policy ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+
+  // Callback given to ThreadsExec::start; 'arg' is the ParallelFor object.
+  static void execute( ThreadsExec & exec , const void * arg )
+  {
+    typedef typename Policy::member_type index_type ;
+
+    const ParallelFor * const closure = static_cast< const ParallelFor * >( arg );
+
+    // Policy built from the full policy plus this thread's pool rank and the
+    // pool size (presumably this thread's share of the range - confirm).
+    const Policy my_range( closure->m_policy , exec.pool_rank() , exec.pool_size() );
+
+    const index_type iend  = my_range.end();
+    index_type       iwork = my_range.begin();
+
+    while ( iwork < iend ) {
+      closure->m_func( iwork );
+      ++iwork ;
+    }
+
+    exec.fan_in();
+  }
+
+  ParallelFor( const FunctorType & functor
+             , const Policy      & policy )
+    : m_func( functor )
+    , m_policy( policy )
+    {
+      ThreadsExec::start( & ParallelFor::execute , this );
+      ThreadsExec::fence();
+    }
+};
+
+/** \brief  ParallelFor specialization: TeamPolicy executed on Kokkos::Threads.
+ *
+ *  Constructing the object records the functor, the policy and the functor's
+ *  team shared-memory size, resizes the ThreadsExec scratch space, starts
+ *  'execute' through ThreadsExec::start and then calls ThreadsExec::fence.
+ */
+template< class FunctorType >
+class ParallelFor< FunctorType , Kokkos::TeamPolicy< Kokkos::Threads , void > , Kokkos::Threads >
+{
+public:
+
+  typedef TeamPolicy< Kokkos::Threads , void >  Policy ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+  const int          m_shared ;
+
+  // Callback given to ThreadsExec::start; 'arg' is the ParallelFor object.
+  static void execute( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelFor * const closure = static_cast< const ParallelFor * >( arg );
+
+    // TODO: Add thread pool queries to ThreadsExec.
+    // TODO: Move all of the team state out of ThreadsExec and into the Policy.
+
+    typename Policy::member_type team_member( exec , closure->m_policy , closure->m_shared );
+
+    // Call the functor once per valid state of the member, advancing with next().
+    while ( team_member.valid() ) {
+      closure->m_func( team_member );
+      team_member.next();
+    }
+
+    exec.fan_in();
+  }
+
+  ParallelFor( const FunctorType & functor
+              , const Policy      & policy )
+    : m_func( functor )
+    , m_policy( policy )
+    , m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
+    {
+      // No reduction scratch; team scratch holds the team-reduce area plus
+      // the functor's requested shared memory.
+      const int team_scratch_size = Policy::member_type::team_reduce_size() + m_shared ;
+
+      ThreadsExec::resize_scratch( 0 , team_scratch_size );
+      ThreadsExec::start( & ParallelFor::execute , this );
+      ThreadsExec::fence();
+    }
+
+  // Nothing to wait for: the constructor has already called fence().
+  inline void wait() {}
+
+  inline ~ParallelFor() { wait(); }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** \brief  ParallelReduce specialization: RangePolicy executed on Kokkos::Threads.
+ *
+ *  The constructor resizes the reduction scratch space, starts 'execute',
+ *  calls ThreadsExec::fence, and then copies the values found at
+ *  ThreadsExec::root_reduce_scratch() into the result view when that view
+ *  has a non-null pointer.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Kokkos::Threads , void , IntType , P >
+                    , Kokkos::Threads
+                    >
+{
+public:
+
+  typedef ReduceAdapter< FunctorType >   Reduce ;
+  typedef typename Reduce::pointer_type  pointer_type ;
+  typedef Kokkos::RangePolicy< Kokkos::Threads , void , IntType , P > Policy ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+
+  // Callback given to ThreadsExec::start; 'arg' is the ParallelReduce object.
+  static void execute( ThreadsExec & exec , const void * arg )
+  {
+    typedef typename Policy::member_type index_type ;
+
+    const ParallelReduce * const closure = static_cast< const ParallelReduce * >( arg );
+
+    // Reduction value for this thread, initialized in its reduce memory.
+    typename Reduce::reference_type local_value =
+      Reduce::init( closure->m_func , exec.reduce_memory() );
+
+    // Policy built from the full policy plus this thread's pool rank and the
+    // pool size (presumably this thread's share of the range - confirm).
+    const Policy my_range( closure->m_policy , exec.pool_rank() , exec.pool_size() );
+
+    const index_type iend  = my_range.end();
+    index_type       iwork = my_range.begin();
+
+    while ( iwork < iend ) {
+      closure->m_func( iwork , local_value );
+      ++iwork ;
+    }
+
+    exec.fan_in_reduce( closure->m_func );
+  }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & functor ,
+                  const Policy       & policy ,
+                  const HostViewType & result_view )
+    : m_func( functor )
+    , m_policy( policy )
+    {
+      ThreadsExec::resize_scratch( Reduce::value_size( m_func ) , 0 );
+
+      ThreadsExec::start( & ParallelReduce::execute , this );
+
+      const pointer_type reduced = (pointer_type) ThreadsExec::root_reduce_scratch();
+
+      ThreadsExec::fence();
+
+      // Copy the result out only when the view has somewhere to put it.
+      if ( result_view.ptr_on_device() ) {
+        const unsigned count = Reduce::value_count( m_func );
+        unsigned k = 0 ;
+        while ( k < count ) {
+          result_view.ptr_on_device()[k] = reduced[k];
+          ++k ;
+        }
+      }
+    }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  ParallelReduce specialization: TeamPolicy executed on Kokkos::Threads.
+ *
+ *  Both constructors resize the ThreadsExec scratch space, start 'execute'
+ *  and call ThreadsExec::fence.  The constructor taking a result view also
+ *  copies the values found at ThreadsExec::root_reduce_scratch() into the
+ *  view when that view has a non-null pointer.
+ */
+template< class FunctorType >
+class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Kokkos::Threads , void > , Kokkos::Threads >
+{
+public:
+
+  typedef TeamPolicy< Kokkos::Threads , void >  Policy ;
+  typedef ReduceAdapter< FunctorType >          Reduce ;
+  typedef typename Reduce::pointer_type         pointer_type ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+  const int          m_shared ;
+
+  // Callback given to ThreadsExec::start; 'arg' is the ParallelReduce object.
+  static void execute( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+
+    // Initialize thread-local value
+    typename Reduce::reference_type update = Reduce::init( self.m_func , exec.reduce_memory() );
+
+    // Call the functor once per valid state of the member, advancing with next().
+    typename Policy::member_type member( exec , self.m_policy , self.m_shared );
+    for ( ; member.valid() ; member.next() ) {
+      self.m_func( member , update );
+    }
+
+    exec.fan_in_reduce( self.m_func );
+  }
+
+  // Run the reduction without copying the result anywhere.
+  ParallelReduce( const FunctorType & functor
+                , const Policy      & policy )
+    : m_func( functor )
+    , m_policy( policy )
+    , m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
+    {
+      ThreadsExec::resize_scratch( Reduce::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
+
+      ThreadsExec::start( & ParallelReduce::execute , this );
+
+      ThreadsExec::fence();
+    }
+
+  // Run the reduction and copy the reduced values into 'result'.
+  template< class ViewType >
+  ParallelReduce( const FunctorType & functor
+                , const Policy      & policy
+                , const ViewType    & result )
+    : m_func( functor )
+    , m_policy( policy )
+    , m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
+    {
+      ThreadsExec::resize_scratch( Reduce::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
+
+      ThreadsExec::start( & ParallelReduce::execute , this );
+
+      const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
+
+      ThreadsExec::fence();
+
+      // Guard against a view without storage, as the RangePolicy
+      // ParallelReduce does, instead of writing through a null pointer.
+      if ( result.ptr_on_device() ) {
+        const unsigned n = Reduce::value_count( m_func );
+        for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
+      }
+    }
+
+  // Nothing to wait for: the constructors have already called fence().
+  inline void wait() {}
+
+  inline ~ParallelReduce() { wait(); }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** \brief  ParallelScan specialization: RangePolicy executed on Kokkos::Threads.
+ *
+ *  Each thread calls the functor over its range twice: first with the final
+ *  flag 'false', then - after ThreadsExec::scan_small - with 'true'.
+ */
+template< class FunctorType , typename IntType , unsigned P >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Kokkos::Threads , void , IntType , P >
+                  , Kokkos::Threads
+                  >
+{
+public:
+
+  typedef ReduceAdapter< FunctorType > Reduce ;
+  typedef typename Reduce::pointer_type pointer_type ;
+  typedef Kokkos::RangePolicy< Kokkos::Threads , void , IntType , P > Policy ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+
+  // Callback given to ThreadsExec::start; 'arg' is the ParallelScan object.
+  static void execute( ThreadsExec & exec , const void * arg )
+  {
+    typedef typename Policy::member_type index_type ;
+
+    const ParallelScan * const closure = static_cast< const ParallelScan * >( arg );
+
+    // Policy built from the full policy plus this thread's pool rank and the
+    // pool size (presumably this thread's share of the range - confirm).
+    const Policy my_range( closure->m_policy , exec.pool_rank() , exec.pool_size() );
+
+    typename Reduce::reference_type local_value =
+      Reduce::init( closure->m_func , exec.reduce_memory() );
+
+    const index_type iend = my_range.end();
+
+    // First pass: final == false.
+    {
+      index_type iwork = my_range.begin();
+      while ( iwork < iend ) {
+        closure->m_func( iwork , local_value , false );
+        ++iwork ;
+      }
+    }
+
+    // Alternative (disabled):  exec.scan_large( closure->m_func );
+    exec.scan_small( closure->m_func );
+
+    // Second pass: final == true.
+    {
+      index_type iwork = my_range.begin();
+      while ( iwork < iend ) {
+        closure->m_func( iwork , local_value , true );
+        ++iwork ;
+      }
+    }
+
+    exec.fan_in();
+  }
+
+  ParallelScan( const FunctorType & functor , const Policy & policy )
+    : m_func( functor )
+    , m_policy( policy )
+    {
+      // Reduction scratch is sized for two values per thread.
+      const unsigned scan_scratch_size = 2 * Reduce::value_size( m_func );
+
+      ThreadsExec::resize_scratch( scan_scratch_size , 0 );
+      ThreadsExec::start( & ParallelScan::execute , this );
+      ThreadsExec::fence();
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */
+
--- a/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp
@ -0,0 +1,258 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ANALYZESHAPE_HPP
+#define KOKKOS_ANALYZESHAPE_HPP
+
+#include <impl/Kokkos_Shape.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+/** \brief  Analyze the array shape defined by a Kokkos::View data type.
+ *
+ *  It is presumed that the data type can be mapped down to a multidimensional
+ *  array of an intrinsic scalar numerical type (double, float, int, ... ).
+ *  The 'value_type' of an array may be an embedded aggregate type such
+ *  as a fixed length array 'Array<T,N>'.  In this case the 'array_type'
+ *  represents the underlying array of intrinsic scalar type.
+ *
+ *  The embedded aggregate type must have an AnalyzeShape specialization
+ *  to map it down to a shape and intrinsic scalar numerical type.
+ */
+
+template< class T >
+struct AnalyzeShape : public Shape< sizeof(T) , 0 >
+{
+  // Base case: no array dimensions, so the shape is Shape< sizeof(T) , 0 >
+  // and every typedef below is 'T' itself (const-qualified where named so).
+  typedef Shape< sizeof(T), 0 >  shape ;
+
+  typedef void specialize ;
+
+  // Array type family.
+  typedef       T  array_type ;
+  typedef const T  const_array_type ;
+  typedef       T  non_const_array_type ;
+
+  // Value type family.
+  typedef       T  value_type ;
+  typedef const T  const_value_type ;
+  typedef       T  non_const_value_type ;
+
+  // Full data type family.
+  typedef       T  type ;
+  typedef const T  const_type ;
+  typedef       T  non_const_type ;
+};
+
+template<>
+struct AnalyzeShape<void> : public Shape< 0 , 0 >
+{
+  // 'void' has no size, so this specialization uses Shape< 0 , 0 >
+  // rather than the primary template's sizeof(T).
+  typedef Shape< 0 , 0 >  shape ;
+
+  typedef void specialize ;
+
+  // Array type family.
+  typedef       void  array_type ;
+  typedef const void  const_array_type ;
+  typedef       void  non_const_array_type ;
+
+  // Value type family.
+  typedef       void  value_type ;
+  typedef const void  const_value_type ;
+  typedef       void  non_const_value_type ;
+
+  // Full data type family.
+  typedef       void  type ;
+  typedef const void  const_type ;
+  typedef       void  non_const_type ;
+};
+
+template< class T >
+struct AnalyzeShape< const T > : public AnalyzeShape<T>::shape
+{
+private:
+  // Analysis of the type with the top-level 'const' removed.
+  typedef AnalyzeShape<T> unqualified ;
+public:
+
+  typedef typename unqualified::shape shape ;
+
+  typedef typename unqualified::specialize specialize ;
+
+  // The plain and 'const_' names both map to the const variants.
+  typedef typename unqualified::const_array_type  array_type ;
+  typedef typename unqualified::const_array_type  const_array_type ;
+
+  typedef typename unqualified::const_value_type  value_type ;
+  typedef typename unqualified::const_value_type  const_value_type ;
+
+  typedef typename unqualified::const_type        type ;
+  typedef typename unqualified::const_type        const_type ;
+
+  // The 'non_const_' names pass through unchanged.
+  typedef typename unqualified::non_const_array_type  non_const_array_type ;
+  typedef typename unqualified::non_const_value_type  non_const_value_type ;
+  typedef typename unqualified::non_const_type        non_const_type ;
+};
+
+template< class T >
+struct AnalyzeShape< T * >
+  : public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type
+{
+private:
+  // Analysis of the pointed-to type.
+  typedef AnalyzeShape<T> pointee ;
+public:
+
+  // The pointee's shape with a 0 inserted through ShapeInsert.
+  typedef typename ShapeInsert< typename pointee::shape , 0 >::type shape ;
+
+  typedef typename pointee::specialize specialize ;
+
+  // Value types are forwarded as they are.
+  typedef typename pointee::value_type             value_type ;
+  typedef typename pointee::const_value_type       const_value_type ;
+  typedef typename pointee::non_const_value_type   non_const_value_type ;
+
+  // Array types gain one level of pointer.
+  typedef typename pointee::array_type           * array_type ;
+  typedef typename pointee::const_array_type     * const_array_type ;
+  typedef typename pointee::non_const_array_type * non_const_array_type ;
+
+  // Full data types gain one level of pointer.
+  typedef typename pointee::type           * type ;
+  typedef typename pointee::const_type     * const_type ;
+  typedef typename pointee::non_const_type * non_const_type ;
+};
+
+template< class T >
+struct AnalyzeShape< T[] >
+  : public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type
+{
+private:
+  // Analysis of the array's element type.
+  typedef AnalyzeShape<T> element ;
+public:
+
+  // The element's shape with a 0 inserted through ShapeInsert.
+  typedef typename ShapeInsert< typename element::shape , 0 >::type shape ;
+
+  typedef typename element::specialize specialize ;
+
+  // Value types are forwarded as they are.
+  typedef typename element::value_type            value_type ;
+  typedef typename element::const_value_type      const_value_type ;
+  typedef typename element::non_const_value_type  non_const_value_type ;
+
+  // Array types gain one unbounded extent.
+  typedef typename element::array_type            array_type [] ;
+  typedef typename element::const_array_type      const_array_type [] ;
+  typedef typename element::non_const_array_type  non_const_array_type [] ;
+
+  // Full data types gain one unbounded extent.
+  typedef typename element::type            type [] ;
+  typedef typename element::const_type      const_type [] ;
+  typedef typename element::non_const_type  non_const_type [] ;
+};
+
+template< class T >
+struct AnalyzeShape< const T[] >
+  : public ShapeInsert< typename AnalyzeShape< const T >::shape , 0 >::type
+{
+private:
+  // Analysis of the const-qualified element type.
+  typedef AnalyzeShape< const T > element ;
+public:
+
+  // The element's shape with a 0 inserted through ShapeInsert.
+  typedef typename ShapeInsert< typename element::shape , 0 >::type shape ;
+
+  typedef typename element::specialize specialize ;
+
+  // Value types are forwarded as they are.
+  typedef typename element::value_type            value_type ;
+  typedef typename element::const_value_type      const_value_type ;
+  typedef typename element::non_const_value_type  non_const_value_type ;
+
+  // Array types gain one unbounded extent.
+  typedef typename element::array_type            array_type [] ;
+  typedef typename element::const_array_type      const_array_type [] ;
+  typedef typename element::non_const_array_type  non_const_array_type [] ;
+
+  // Full data types gain one unbounded extent.
+  typedef typename element::type            type [] ;
+  typedef typename element::const_type      const_type [] ;
+  typedef typename element::non_const_type  non_const_type [] ;
+};
+
+template< class T , unsigned N >
+struct AnalyzeShape< T[N] >
+  : public ShapeInsert< typename AnalyzeShape<T>::shape , N >::type
+{
+private:
+  // Analysis of the array's element type.
+  typedef AnalyzeShape<T> element ;
+public:
+
+  // The element's shape with the extent N inserted through ShapeInsert.
+  typedef typename ShapeInsert< typename element::shape , N >::type shape ;
+
+  typedef typename element::specialize specialize ;
+
+  // Value types are forwarded as they are.
+  typedef typename element::value_type            value_type ;
+  typedef typename element::const_value_type      const_value_type ;
+  typedef typename element::non_const_value_type  non_const_value_type ;
+
+  // Array types gain one extent of length N.
+  typedef typename element::array_type            array_type [N] ;
+  typedef typename element::const_array_type      const_array_type [N] ;
+  typedef typename element::non_const_array_type  non_const_array_type [N] ;
+
+  // Full data types gain one extent of length N.
+  typedef typename element::type            type [N] ;
+  typedef typename element::const_type      const_type [N] ;
+  typedef typename element::non_const_type  non_const_type [N] ;
+};
+
+template< class T , unsigned N >
+struct AnalyzeShape< const T[N] >
+  : public ShapeInsert< typename AnalyzeShape< const T >::shape , N >::type
+{
+private:
+  // Analysis of the const-qualified element type.
+  typedef AnalyzeShape< const T > element ;
+public:
+
+  // The element's shape with the extent N inserted through ShapeInsert.
+  typedef typename ShapeInsert< typename element::shape , N >::type shape ;
+
+  typedef typename element::specialize specialize ;
+
+  // Value types are forwarded as they are.
+  typedef typename element::value_type            value_type ;
+  typedef typename element::const_value_type      const_value_type ;
+  typedef typename element::non_const_value_type  non_const_value_type ;
+
+  // Array types gain one extent of length N.
+  typedef typename element::array_type            array_type [N] ;
+  typedef typename element::const_array_type      const_array_type [N] ;
+  typedef typename element::non_const_array_type  non_const_array_type [N] ;
+
+  // Full data types gain one extent of length N.
+  typedef typename element::type            type [N] ;
+  typedef typename element::const_type      const_type [N] ;
+  typedef typename element::non_const_type  non_const_type [N] ;
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_ANALYZESHAPE_HPP */
+
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly_X86.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly_X86.hpp
@ -0,0 +1,140 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_ASSEMBLY_X86_HPP )
+#define KOKKOS_ATOMIC_ASSEMBLY_X86_HPP
+namespace Kokkos {
+
+#ifndef __CUDA_ARCH__
+// Atomically add one to the char at 'a' with a lock-prefixed 'incb'.
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<char>(volatile char* a) {
+  __asm__ __volatile__(
+    "lock incb %0"
+    : "+m" (a[0])  /* memory operand is both read and written */
+    : /* no input-only operands */
+    : "memory"
+  );
+}
+
+// Atomically add one to the short at 'a' with a lock-prefixed 'incw'.
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<short>(volatile short* a) {
+  __asm__ __volatile__(
+    "lock incw %0"
+    : "+m" (a[0])  /* memory operand is both read and written */
+    : /* no input-only operands */
+    : "memory"
+  );
+}
+
+// Atomically add one to the int at 'a' with a lock-prefixed 'incl'.
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<int>(volatile int* a) {
+  __asm__ __volatile__(
+    "lock incl %0"
+    : "+m" (a[0])  /* memory operand is both read and written */
+    : /* no input-only operands */
+    : "memory"
+  );
+}
+
+// Atomically add one to the long long at 'a' with a lock-prefixed 'incq'.
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<long long int>(volatile long long int* a) {
+  __asm__ __volatile__(
+    "lock incq %0"
+    : "+m" (a[0])  /* memory operand is both read and written */
+    : /* no input-only operands */
+    : "memory"
+  );
+}
+
+// Atomically subtract one from the char at 'a' with a lock-prefixed 'decb'.
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<char>(volatile char* a) {
+  __asm__ __volatile__(
+    "lock decb %0"
+    : "+m" (a[0])  /* memory operand is both read and written */
+    : /* no input-only operands */
+    : "memory"
+  );
+}
+
+// Atomically subtract one from the short at 'a' with a lock-prefixed 'decw'.
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<short>(volatile short* a) {
+  __asm__ __volatile__(
+    "lock decw %0"
+    : "+m" (a[0])  /* memory operand is both read and written */
+    : /* no input-only operands */
+    : "memory"
+  );
+}
+
+// Atomically subtract one from the int at 'a' with a lock-prefixed 'decl'.
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<int>(volatile int* a) {
+  __asm__ __volatile__(
+    "lock decl %0"
+    : "+m" (a[0])  /* memory operand is both read and written */
+    : /* no input-only operands */
+    : "memory"
+  );
+}
+
+// Atomically subtract one from the long long at 'a' with a lock-prefixed 'decq'.
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<long long int>(volatile long long int* a) {
+  __asm__ __volatile__(
+    "lock decq %0"
+    : "+m" (a[0])  /* memory operand is both read and written */
+    : /* no input-only operands */
+    : "memory"
+  );
+}
+#endif
+}
+
+#endif
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@ -0,0 +1,173 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
+#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+// Cuda native CAS supports int, unsigned int, and unsigned long long int (non-standard type).
+// Must cast-away 'volatile' for the CAS call.
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Device CAS on 'int': forwards to atomicCAS and returns its result.
+__inline__ __device__
+int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
+{
+  // 'volatile' is cast away for the atomicCAS call.
+  int * const address = (int*) dest ;
+  return atomicCAS( address , compare , val );
+}
+
+// Device CAS on 'unsigned int': forwards to atomicCAS and returns its result.
+__inline__ __device__
+unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val)
+{
+  // 'volatile' is cast away for the atomicCAS call.
+  unsigned int * const address = (unsigned int*) dest ;
+  return atomicCAS( address , compare , val );
+}
+
+// Device CAS on 'unsigned long long int': forwards to atomicCAS and
+// returns its result.
+__inline__ __device__
+unsigned long long int atomic_compare_exchange( volatile unsigned long long int * const dest ,
+                                                const unsigned long long int compare ,
+                                                const unsigned long long int val )
+{
+  // 'volatile' is cast away for the atomicCAS call.
+  unsigned long long int * const address = (unsigned long long int*) dest ;
+  return atomicCAS( address , compare , val );
+}
+
+// Device CAS for any type with sizeof(T) == sizeof(int): the operands are
+// reinterpreted as 'int' for atomicCAS and the result is reinterpreted as T.
+template < typename T >
+__inline__ __device__
+T atomic_compare_exchange( volatile T * const dest , const T & compare ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  int * const address      = (int*) dest ;
+  const int   compare_bits = *((int*)&compare) ;
+  const int   val_bits     = *((int*)&val) ;
+
+  const int old_bits = atomicCAS( address , compare_bits , val_bits );
+
+  return *((T*)&old_bits);
+}
+
+// Device CAS for any type whose size differs from 'int' and equals
+// 'unsigned long long int': the operands are reinterpreted as that integer
+// type for atomicCAS and the result is reinterpreted as T.
+template < typename T >
+__inline__ __device__
+T atomic_compare_exchange( volatile T * const dest , const T & compare ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
+{
+  typedef unsigned long long int word_type ;
+
+  word_type * const address      = (word_type*) dest ;
+  const word_type   compare_bits = *((word_type*)&compare) ;
+  const word_type   val_bits     = *((word_type*)&val) ;
+
+  const word_type old_bits = atomicCAS( address , compare_bits , val_bits );
+
+  return *((T*)&old_bits);
+}
+
+//----------------------------------------------------------------------------
+// GCC native CAS supports int, long, unsigned int, unsigned long.
+// Intel native CAS supports int and long with the same interface as GCC.
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+// CAS on 'int' through __sync_val_compare_and_swap; returns the builtin's
+// result (the value found in *dest).
+KOKKOS_INLINE_FUNCTION
+int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
+{
+  const int previous = __sync_val_compare_and_swap( dest , compare , val );
+  return previous ;
+}
+
+// CAS on 'long' through __sync_val_compare_and_swap; returns the builtin's
+// result (the value found in *dest).
+KOKKOS_INLINE_FUNCTION
+long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
+{
+  const long previous = __sync_val_compare_and_swap( dest , compare , val );
+  return previous ;
+}
+
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+// GCC supports unsigned
+
+// CAS on 'unsigned int' through __sync_val_compare_and_swap; returns the
+// builtin's result (the value found in *dest).
+KOKKOS_INLINE_FUNCTION
+unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val )
+{
+  const unsigned int previous = __sync_val_compare_and_swap( dest , compare , val );
+  return previous ;
+}
+
+// CAS on 'unsigned long' through __sync_val_compare_and_swap; returns the
+// builtin's result (the value found in *dest).
+KOKKOS_INLINE_FUNCTION
+unsigned long atomic_compare_exchange( volatile unsigned long * const dest ,
+                                       const unsigned long compare ,
+                                       const unsigned long val )
+{
+  const unsigned long previous = __sync_val_compare_and_swap( dest , compare , val );
+  return previous ;
+}
+
+#endif
+
+// CAS for any type with sizeof(T) == sizeof(int): the operands are
+// reinterpreted as 'int' for the builtin and the result is read back as T
+// through a union.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest, const T & compare,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  union { int i ; T t ; } previous ;
+
+  int * const address      = (int*) dest ;
+  const int   compare_bits = *((int*)&compare) ;
+  const int   val_bits     = *((int*)&val) ;
+
+  previous.i = __sync_val_compare_and_swap( address , compare_bits , val_bits );
+
+  return previous.t ;
+}
+
+// CAS for any type whose size differs from 'int' and equals 'long': the
+// operands are reinterpreted as 'long' for the builtin and the result is
+// read back as T through a union.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest, const T & compare,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(long) , const T & >::type val )
+{
+  union { long i ; T t ; } previous ;
+
+  long * const address      = (long*) dest ;
+  const long   compare_bits = *((long*)&compare) ;
+  const long   val_bits     = *((long*)&val) ;
+
+  previous.i = __sync_val_compare_and_swap( address , compare_bits , val_bits );
+
+  return previous.t ;
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+// CAS implemented with an unnamed OpenMP critical section: reads *dest,
+// stores 'val' when the value read equals 'compare', and returns the
+// value that was read.
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest, const T compare, const T val )
+{
+  T observed;
+#pragma omp critical
+  {
+    observed = dest[0];
+    if ( observed == compare ) {
+      dest[0] = val;
+    }
+  }
+  return observed;
+}
+
+#endif
+
+
+// Boolean form of the CAS: true when the value returned by
+// atomic_compare_exchange equals 'compare'.
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
+{
+  const bool swapped = ( compare == atomic_compare_exchange(dest, compare, val) );
+  return swapped;
+}
+
+//----------------------------------------------------------------------------
+
+} // namespace Kokkos
+
+#endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@ -0,0 +1,210 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
+#define KOKKOS_ATOMIC_EXCHANGE_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Native CUDA exchange overloads: each forwards to atomicExch and returns
+// its result.  The C-style cast removes the volatile qualifier from 'dest'
+// before the pointer is handed to the intrinsic.
+__inline__ __device__
+int atomic_exchange( volatile int * const dest , const int val )
+{
+  int * const target = (int*) dest ;
+  return atomicExch( target , val );
+}
+
+__inline__ __device__
+unsigned int atomic_exchange( volatile unsigned int * const dest , const unsigned int val )
+{
+  unsigned int * const target = (unsigned int*) dest ;
+  return atomicExch( target , val );
+}
+
+__inline__ __device__
+unsigned long long int atomic_exchange( volatile unsigned long long int * const dest , const unsigned long long int val )
+{
+  unsigned long long * const target = (unsigned long long*) dest ;
+  return atomicExch( target , val );
+}
+
+/** \brief  Atomic exchange for any type whose size equals sizeof(int).
+ *
+ *  The bits of 'val' travel through atomicExch as an int; the int that
+ *  comes back is reinterpreted as a T for the return value.
+ */
+template< typename T >
+__inline__ __device__
+T atomic_exchange(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  int * const target   = (int*) dest ;
+  const int   new_bits = *((int*)&val) ;
+  int         old_bits = atomicExch( target , new_bits );
+  return *((T*)&old_bits);
+}
+
+// Atomic exchange for any type whose size equals
+// sizeof(unsigned long long int) but not sizeof(int).  Works like the
+// int-sized overload with unsigned long long int as the carrier type.
+template< typename T >
+__inline__ __device__
+T atomic_exchange(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
+{
+  typedef unsigned long long int word_type ;
+
+  word_type * const target   = (word_type*) dest ;
+  const word_type   new_bits = *((word_type*)&val) ;
+  word_type         old_bits = atomicExch( target , new_bits );
+  return *((T*)&old_bits);
+}
+
+/** \brief  Atomic assignment for any type whose size equals sizeof(int).
+ *
+ *  Performs the same atomicExch as atomic_exchange but discards the
+ *  value that was previously stored.
+ */
+template< typename T >
+__inline__ __device__
+void atomic_assign(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  int * const target   = (int*) dest ;
+  const int   new_bits = *((int*)&val) ;
+  (void) atomicExch( target , new_bits );
+}
+
+// Atomic assignment for any type whose size equals
+// sizeof(unsigned long long int) but not sizeof(int).  The previous value
+// returned by atomicExch is discarded.
+template< typename T >
+__inline__ __device__
+void atomic_assign(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
+{
+  typedef unsigned long long int word_type ;
+
+  word_type * const target   = (word_type*) dest ;
+  const word_type   new_bits = *((word_type*)&val) ;
+  (void) atomicExch( target , new_bits );
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+// Atomic exchange for int-sized or long-sized T, built from a
+// compare-and-swap retry loop.  Returns the value that the successful swap
+// replaced.
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_exchange( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
+                                  , const T & >::type val )
+{
+  // Carrier integer type: int when the sizes match, long otherwise.
+  typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type word_type ;
+
+  // Copy the bits of 'val' once so every retry swaps in the same value.
+  const word_type desired = *((word_type*)&val);
+
+  union { T val_T ; word_type val_word ; } observed ;
+
+  observed.val_T = *dest ;
+
+  for (;;) {
+    const word_type expected = observed.val_word ;
+    observed.val_word = __sync_val_compare_and_swap( (volatile word_type *) dest , expected , desired );
+    // Done once the swap saw exactly the value we expected.
+    if ( expected == observed.val_word ) break ;
+  }
+
+  return observed.val_T ;
+}
+
+// Atomic assignment for int-sized or long-sized T.  Runs the same
+// compare-and-swap retry loop as atomic_exchange but returns nothing.
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void atomic_assign( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
+                                  , const T & >::type val )
+{
+  // Carrier integer type: int when the sizes match, long otherwise.
+  typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type word_type ;
+
+  // Copy the bits of 'val' once so every retry swaps in the same value.
+  const word_type desired = *((word_type*)&val);
+
+  union { T val_T ; word_type val_word ; } observed ;
+
+  observed.val_T = *dest ;
+
+  for (;;) {
+    const word_type expected = observed.val_word ;
+    observed.val_word = __sync_val_compare_and_swap( (volatile word_type *) dest , expected , desired );
+    if ( expected == observed.val_word ) break ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+// OpenMP fallback exchange: the read of the old value and the write of the
+// new one happen together inside an 'omp critical' section (an
+// 'omp atomic capture' variant is not used here).
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_exchange( volatile T * const dest , const T val )
+{
+  T previous;
+  #pragma omp critical
+  {
+    previous = *dest;
+    *dest = val;
+  }
+  return previous;
+}
+
+// OpenMP fallback assignment: the store to '*dest' is performed inside an
+// 'omp critical' section (an 'omp atomic' variant is not used here).
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+void atomic_assign( volatile T * const dest , const T val )
+{
+  #pragma omp critical
+  {
+    *dest = val;
+  }
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+} // namespace Kokkos
+
+#endif
+
+//----------------------------------------------------------------------------
+
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
@ -0,0 +1,214 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
+#define KOKKOS_ATOMIC_FETCH_ADD_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Support for int, unsigned int, unsigned long long int, and float
+
+// Native CUDA fetch-and-add overloads: each forwards to atomicAdd and
+// returns its result.  The cast removes the volatile qualifier from 'dest'.
+__inline__ __device__
+int atomic_fetch_add( volatile int * const dest , const int val )
+{
+  int * const target = (int*) dest ;
+  return atomicAdd( target , val );
+}
+
+__inline__ __device__
+unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
+{
+  unsigned int * const target = (unsigned int*) dest ;
+  return atomicAdd( target , val );
+}
+
+__inline__ __device__
+unsigned long long int atomic_fetch_add( volatile unsigned long long int * const dest ,
+                                         const unsigned long long int val )
+{
+  unsigned long long int * const target = (unsigned long long int*) dest ;
+  return atomicAdd( target , val );
+}
+
+__inline__ __device__
+float atomic_fetch_add( volatile float * const dest , const float val )
+{
+  float * const target = (float*) dest ;
+  return atomicAdd( target , val );
+}
+
+// Atomic fetch-and-add for any type T whose size equals sizeof(int).
+// Implemented as a compare-and-swap retry loop: the sum is computed in T and
+// its bit pattern is swapped in through the int-sized atomicCAS.
+// Returns the value held by '*dest' immediately before the successful add.
+template < typename T >
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
+    // Retry until the CAS observed exactly the value the sum was based on.
+    // The condition must name 'assume'; an undeclared 'assumed' was used before.
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+// Atomic fetch-and-add for any type T whose size equals
+// sizeof(unsigned long long int) but not sizeof(int).  Uses a
+// compare-and-swap retry loop over the 64-bit atomicCAS and returns the value
+// that the successful swap replaced.
+template < typename T >
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  typedef unsigned long long int word_type ;
+
+  union { word_type i ; T t ; } observed , expected , desired ;
+
+  observed.t = *dest ;
+
+  for (;;) {
+    expected.i = observed.i ;
+    desired.t  = expected.t + val ;
+    observed.i = atomicCAS( (word_type*)dest , expected.i , desired.i );
+    if ( expected.i == observed.i ) break ;
+  }
+
+  return observed.t ;
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+// Native fetch-and-add overloads built on __sync_fetch_and_add.  Each
+// returns the value the builtin reports for '*dest' prior to the addition.
+KOKKOS_INLINE_FUNCTION
+int atomic_fetch_add( volatile int * const dest , const int val )
+{
+  const int previous = __sync_fetch_and_add( dest , val );
+  return previous ;
+}
+
+KOKKOS_INLINE_FUNCTION
+long int atomic_fetch_add( volatile long int * const dest , const long int val )
+{
+  const long int previous = __sync_fetch_and_add( dest , val );
+  return previous ;
+}
+
+// The unsigned overloads are compiled only for the GCC backend.
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+KOKKOS_INLINE_FUNCTION
+unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
+{
+  const unsigned int previous = __sync_fetch_and_add( dest , val );
+  return previous ;
+}
+
+KOKKOS_INLINE_FUNCTION
+unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val )
+{
+  const unsigned long int previous = __sync_fetch_and_add( dest , val );
+  return previous ;
+}
+
+#endif
+
+// Fetch-and-add for any type T whose size equals sizeof(int), implemented
+// as a compare-and-swap retry loop over the int-sized builtin.  Returns the
+// value that the successful swap replaced.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int i ; T t ; } expected , observed , desired ;
+
+  observed.t = *dest ;
+
+  for (;;) {
+    expected.i = observed.i ;
+    desired.t  = expected.t + val ;
+    observed.i = __sync_val_compare_and_swap( (int*) dest , expected.i , desired.i );
+    if ( expected.i == observed.i ) break ;
+  }
+
+  return observed.t ;
+}
+
+// Fetch-and-add for any type T whose size equals sizeof(long) but not
+// sizeof(int), implemented as a compare-and-swap retry loop over the
+// long-sized builtin.  Returns the value that the successful swap replaced.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(long) , const T >::type val )
+{
+  union { long i ; T t ; } expected , observed , desired ;
+
+  observed.t = *dest ;
+
+  for (;;) {
+    expected.i = observed.i ;
+    desired.t  = expected.t + val ;
+    observed.i = __sync_val_compare_and_swap( (long*) dest , expected.i , desired.i );
+    if ( expected.i == observed.i ) break ;
+  }
+
+  return observed.t ;
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+// OpenMP 3.1 fallback fetch-and-add: an 'omp atomic capture' block reads the
+// old value of '*dest' and adds 'val' to it as one atomic step.
+// Returns the value captured before the addition.
+// Marked KOKKOS_INLINE_FUNCTION like the other OpenMP fallbacks so that the
+// annotated wrappers (atomic_add, atomic_increment, ...) call a function
+// carrying the same qualifiers.
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add( volatile T * const dest , const T val )
+{
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] += val;
+  }
+  return retval;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+// Atomic add that does not report the previous value: forwards to
+// atomic_fetch_add and drops its result.
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_add(volatile T * const dest, const T src)
+{
+  (void) atomic_fetch_add( dest , src );
+}
+
+// Atomic increment: adds one to '*a' and discards the fetched value.
+// The constant is converted to T explicitly.  A bare 'int' literal would
+// conflict with template deduction in backends whose atomic_fetch_add takes
+// ( volatile T * , const T ) and deduces T from both arguments, making the
+// call fail to compile whenever T is not int.
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment(volatile T* a) {
+  Kokkos::atomic_fetch_add(a,T(1));
+}
+
+// Atomic decrement: adds minus one to '*a' and discards the fetched value.
+// The constant is converted to T explicitly so the call also resolves when
+// atomic_fetch_add deduces T from both of its arguments.  For unsigned T the
+// conversion yields the all-ones value, the same result the implicit
+// conversion of -1 produced before.
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement(volatile T* a) {
+  Kokkos::atomic_fetch_add(a,T(-1));
+}
+
+}
+
+#include<impl/Kokkos_Atomic_Assembly_X86.hpp>
+#endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
@ -0,0 +1,125 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
+#define KOKKOS_ATOMIC_FETCH_AND_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Support for int, unsigned int, and (when __CUDA_ARCH__ >= 350) unsigned long long int
+
+// Native CUDA fetch-and-AND overloads: each forwards to atomicAnd and
+// returns its result.  The cast removes the volatile qualifier from 'dest'.
+__inline__ __device__
+int atomic_fetch_and( volatile int * const dest , const int val )
+{
+  int * const target = (int*) dest ;
+  return atomicAnd( target , val );
+}
+
+__inline__ __device__
+unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
+{
+  unsigned int * const target = (unsigned int*) dest ;
+  return atomicAnd( target , val );
+}
+
+// The 64-bit overload is compiled only when __CUDA_ARCH__ is at least 350.
+#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
+__inline__ __device__
+unsigned long long int atomic_fetch_and( volatile unsigned long long int * const dest ,
+                                         const unsigned long long int val )
+{
+  unsigned long long int * const target = (unsigned long long int*) dest ;
+  return atomicAnd( target , val );
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+// Native fetch-and-AND overloads built on __sync_fetch_and_and.  Each
+// returns the value the builtin reports for '*dest' prior to the operation.
+KOKKOS_INLINE_FUNCTION
+int atomic_fetch_and( volatile int * const dest , const int val )
+{
+  const int previous = __sync_fetch_and_and( dest , val );
+  return previous ;
+}
+
+KOKKOS_INLINE_FUNCTION
+long int atomic_fetch_and( volatile long int * const dest , const long int val )
+{
+  const long int previous = __sync_fetch_and_and( dest , val );
+  return previous ;
+}
+
+// The unsigned overloads are compiled only for the GCC backend.
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+KOKKOS_INLINE_FUNCTION
+unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
+{
+  const unsigned int previous = __sync_fetch_and_and( dest , val );
+  return previous ;
+}
+
+KOKKOS_INLINE_FUNCTION
+unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val )
+{
+  const unsigned long int previous = __sync_fetch_and_and( dest , val );
+  return previous ;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+// OpenMP 3.1 fallback fetch-and-AND: an 'omp atomic capture' block reads the
+// old value of '*dest' and ANDs 'val' into it as one atomic step.
+// Returns the value captured before the operation.
+// Marked KOKKOS_INLINE_FUNCTION so that the annotated wrapper atomic_and
+// calls a function carrying the same qualifiers.
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_and( volatile T * const dest , const T val )
+{
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] &= val;
+  }
+  return retval;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+// Atomic AND that does not report the previous value: forwards to
+// atomic_fetch_and and drops its result.
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_and(volatile T * const dest, const T src)
+{
+  (void) atomic_fetch_and( dest , src );
+}
+
+}
+
+#endif
+
+
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
@ -0,0 +1,125 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
+#define KOKKOS_ATOMIC_FETCH_OR_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Support for int, unsigned int, and (when __CUDA_ARCH__ >= 350) unsigned long long int
+
+// Native CUDA fetch-and-OR overloads: each forwards to atomicOr and
+// returns its result.  The cast removes the volatile qualifier from 'dest'.
+__inline__ __device__
+int atomic_fetch_or( volatile int * const dest , const int val )
+{
+  int * const target = (int*) dest ;
+  return atomicOr( target , val );
+}
+
+__inline__ __device__
+unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
+{
+  unsigned int * const target = (unsigned int*) dest ;
+  return atomicOr( target , val );
+}
+
+// The 64-bit overload is compiled only when __CUDA_ARCH__ is at least 350.
+#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
+__inline__ __device__
+unsigned long long int atomic_fetch_or( volatile unsigned long long int * const dest ,
+                                         const unsigned long long int val )
+{
+  unsigned long long int * const target = (unsigned long long int*) dest ;
+  return atomicOr( target , val );
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+// Native fetch-and-OR overloads built on __sync_fetch_and_or.  Each
+// returns the value the builtin reports for '*dest' prior to the operation.
+KOKKOS_INLINE_FUNCTION
+int atomic_fetch_or( volatile int * const dest , const int val )
+{
+  const int previous = __sync_fetch_and_or( dest , val );
+  return previous ;
+}
+
+KOKKOS_INLINE_FUNCTION
+long int atomic_fetch_or( volatile long int * const dest , const long int val )
+{
+  const long int previous = __sync_fetch_and_or( dest , val );
+  return previous ;
+}
+
+// The unsigned overloads are compiled only for the GCC backend.
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+KOKKOS_INLINE_FUNCTION
+unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
+{
+  const unsigned int previous = __sync_fetch_and_or( dest , val );
+  return previous ;
+}
+
+KOKKOS_INLINE_FUNCTION
+unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val )
+{
+  const unsigned long int previous = __sync_fetch_and_or( dest , val );
+  return previous ;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+// OpenMP 3.1 fallback fetch-and-OR: an 'omp atomic capture' block reads the
+// old value of '*dest' and ORs 'val' into it as one atomic step.
+// Returns the value captured before the operation.
+// Marked KOKKOS_INLINE_FUNCTION so that the annotated wrapper atomic_or
+// calls a function carrying the same qualifiers.
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_or( volatile T * const dest , const T val )
+{
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] |= val;
+  }
+  return retval;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+// Atomic OR that does not report the previous value: forwards to
+// atomic_fetch_or and drops its result.
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_or(volatile T * const dest, const T src)
+{
+  (void) atomic_fetch_or( dest , src );
+}
+
+}
+
+#endif
+
+
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
@ -0,0 +1,383 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_GENERIC_HPP )
+#define KOKKOS_ATOMIC_GENERIC_HPP
+#include <Kokkos_Macros.hpp>
+
+// Combination operators to be used in a compare-and-exchange based atomic operation
+namespace Kokkos {
+namespace Impl {
+
+// Binary operator functors.  Each exposes a static apply(val1, val2) that
+// evaluates the named operator on its two arguments and returns the result
+// converted to the first template type.
+
+// val1 + val2
+template< class Lhs , class Rhs >
+struct AddOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Lhs apply( const Lhs & val1 , const Rhs & val2 )
+  { return val1 + val2 ; }
+};
+
+// val1 - val2
+template< class Lhs , class Rhs >
+struct SubOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Lhs apply( const Lhs & val1 , const Rhs & val2 )
+  { return val1 - val2 ; }
+};
+
+// val1 * val2
+template< class Lhs , class Rhs >
+struct MulOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Lhs apply( const Lhs & val1 , const Rhs & val2 )
+  { return val1 * val2 ; }
+};
+
+// val1 / val2
+template< class Lhs , class Rhs >
+struct DivOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Lhs apply( const Lhs & val1 , const Rhs & val2 )
+  { return val1 / val2 ; }
+};
+
+// val1 % val2
+template< class Lhs , class Rhs >
+struct ModOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Lhs apply( const Lhs & val1 , const Rhs & val2 )
+  { return val1 % val2 ; }
+};
+
+// val1 & val2
+template< class Lhs , class Rhs >
+struct AndOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Lhs apply( const Lhs & val1 , const Rhs & val2 )
+  { return val1 & val2 ; }
+};
+
+// val1 | val2
+template< class Lhs , class Rhs >
+struct OrOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Lhs apply( const Lhs & val1 , const Rhs & val2 )
+  { return val1 | val2 ; }
+};
+
+// val1 ^ val2
+template< class Lhs , class Rhs >
+struct XorOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Lhs apply( const Lhs & val1 , const Rhs & val2 )
+  { return val1 ^ val2 ; }
+};
+
+// val1 << val2
+template< class Lhs , class Rhs >
+struct LShiftOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Lhs apply( const Lhs & val1 , const Rhs & val2 )
+  { return val1 << val2 ; }
+};
+
+// val1 >> val2
+template< class Lhs , class Rhs >
+struct RShiftOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Lhs apply( const Lhs & val1 , const Rhs & val2 )
+  { return val1 >> val2 ; }
+};
+
+// Generic fetch-then-operate for types the size of unsigned long long int
+// (and not the size of int).  Repeatedly computes Oper::apply(current, val)
+// and tries to install it with atomic_compare_exchange until the exchange
+// observes the value the result was based on.
+// Returns the value that was replaced.  'op' itself is not read; only the
+// static Oper::apply is used.
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  typedef unsigned long long int word_type ;
+
+  union { word_type i ; T t ; } observed , expected , desired ;
+
+  observed.t = *dest ;
+
+  for (;;) {
+    expected.i = observed.i ;
+    desired.t  = Oper::apply(expected.t, val) ;
+    observed.i = ::Kokkos::atomic_compare_exchange( (word_type*)dest , expected.i , desired.i );
+    if ( expected.i == observed.i ) break ;
+  }
+
+  return observed.t ;
+}
+
+// Generic operate-then-fetch for types the size of unsigned long long int
+// (and not the size of int).  Same retry loop as atomic_fetch_oper, but the
+// value returned is the newly installed result of Oper::apply.
+// 'op' itself is not read; only the static Oper::apply is used.
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  typedef unsigned long long int word_type ;
+
+  union { word_type i ; T t ; } observed , expected , desired ;
+
+  observed.t = *dest ;
+
+  for (;;) {
+    expected.i = observed.i ;
+    desired.t  = Oper::apply(expected.t, val) ;
+    observed.i = ::Kokkos::atomic_compare_exchange( (word_type*)dest , expected.i , desired.i );
+    if ( expected.i == observed.i ) break ;
+  }
+
+  return desired.t ;
+}
+
+// Generic fetch-then-operate for types the size of int.  Repeatedly computes
+// Oper::apply(current, val) and tries to install it with
+// atomic_compare_exchange until the exchange observes the value the result
+// was based on.  Returns the value that was replaced.
+// 'op' itself is not read; only the static Oper::apply is used.
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int i ; T t ; } observed , expected , desired ;
+
+  observed.t = *dest ;
+
+  for (;;) {
+    expected.i = observed.i ;
+    desired.t  = Oper::apply(expected.t, val) ;
+    observed.i = ::Kokkos::atomic_compare_exchange( (int*)dest , expected.i , desired.i );
+    if ( expected.i == observed.i ) break ;
+  }
+
+  return observed.t ;
+}
+
+// Generic operate-then-fetch for types the size of int.  Same retry loop as
+// atomic_fetch_oper, but the value returned is the newly installed result of
+// Oper::apply.  'op' itself is not read; only the static Oper::apply is used.
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int), const T >::type val )
+{
+  union { int i ; T t ; } observed , expected , desired ;
+
+  observed.t = *dest ;
+
+  for (;;) {
+    expected.i = observed.i ;
+    desired.t  = Oper::apply(expected.t, val) ;
+    observed.i = ::Kokkos::atomic_compare_exchange( (int*)dest , expected.i , desired.i );
+    if ( expected.i == observed.i ) break ;
+  }
+
+  return desired.t ;
+}
+
+/*template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(short) , const T >::type val )
+{
+  union { short i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = Oper::apply(assume.t, val) ;
+    oldval.i = ::Kokkos::atomic_compare_exchange( (short*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(short), const T >::type val )
+{
+  union { short i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = Oper::apply(assume.t, val) ;
+    oldval.i = ::Kokkos::atomic_compare_exchange( (short*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return newval.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(char) , const T >::type val )
+{
+  union { char i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = Oper::apply(assume.t, val) ;
+    oldval.i = ::Kokkos::atomic_compare_exchange( (char*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(char), const T >::type val )
+{
+  union { char i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = Oper::apply(assume.t, val) ;
+    oldval.i = ::Kokkos::atomic_compare_exchange( (char*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return newval.t ;
+}*/
+
+}
+}
+
+namespace Kokkos {
+
+// Fetch_Oper atomics: each applies the named operator to '*dest' through
+// Impl::atomic_fetch_oper and returns the value held before the operation.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mul(volatile T * const dest, const T val) {
+  typedef Impl::MulOper<T,const T> oper_type ;
+  return Impl::atomic_fetch_oper( oper_type() , dest , val );
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_div(volatile T * const dest, const T val) {
+  typedef Impl::DivOper<T,const T> oper_type ;
+  return Impl::atomic_fetch_oper( oper_type() , dest , val );
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mod(volatile T * const dest, const T val) {
+  typedef Impl::ModOper<T,const T> oper_type ;
+  return Impl::atomic_fetch_oper( oper_type() , dest , val );
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_and(volatile T * const dest, const T val) {
+  typedef Impl::AndOper<T,const T> oper_type ;
+  return Impl::atomic_fetch_oper( oper_type() , dest , val );
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_or(volatile T * const dest, const T val) {
+  typedef Impl::OrOper<T,const T> oper_type ;
+  return Impl::atomic_fetch_oper( oper_type() , dest , val );
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_xor(volatile T * const dest, const T val) {
+  typedef Impl::XorOper<T,const T> oper_type ;
+  return Impl::atomic_fetch_oper( oper_type() , dest , val );
+}
+
+// The shift amount is always an unsigned int, independent of T.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_lshift(volatile T * const dest, const unsigned int val) {
+  typedef Impl::LShiftOper<T,const unsigned int> oper_type ;
+  return Impl::atomic_fetch_oper( oper_type() , dest , val );
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {
+  typedef Impl::RShiftOper<T,const unsigned int> oper_type ;
+  return Impl::atomic_fetch_oper( oper_type() , dest , val );
+}
+
+
+// Oper_Fetch atomics: each applies the named operator to '*dest' through
+// Impl::atomic_oper_fetch and returns the value produced by the operation.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_mul_fetch(volatile T * const dest, const T val) {
+  typedef Impl::MulOper<T,const T> oper_type ;
+  return Impl::atomic_oper_fetch( oper_type() , dest , val );
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_div_fetch(volatile T * const dest, const T val) {
+  typedef Impl::DivOper<T,const T> oper_type ;
+  return Impl::atomic_oper_fetch( oper_type() , dest , val );
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_mod_fetch(volatile T * const dest, const T val) {
+  typedef Impl::ModOper<T,const T> oper_type ;
+  return Impl::atomic_oper_fetch( oper_type() , dest , val );
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_and_fetch(volatile T * const dest, const T val) {
+  typedef Impl::AndOper<T,const T> oper_type ;
+  return Impl::atomic_oper_fetch( oper_type() , dest , val );
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_or_fetch(volatile T * const dest, const T val) {
+  typedef Impl::OrOper<T,const T> oper_type ;
+  return Impl::atomic_oper_fetch( oper_type() , dest , val );
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_xor_fetch(volatile T * const dest, const T val) {
+  typedef Impl::XorOper<T,const T> oper_type ;
+  return Impl::atomic_oper_fetch( oper_type() , dest , val );
+}
+
+// The shift amount is always an unsigned int, independent of T.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_lshift_fetch(volatile T * const dest, const unsigned int val) {
+  typedef Impl::LShiftOper<T,const unsigned int> oper_type ;
+  return Impl::atomic_oper_fetch( oper_type() , dest , val );
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_rshift_fetch(volatile T * const dest, const unsigned int val) {
+  typedef Impl::RShiftOper<T,const unsigned int> oper_type ;
+  return Impl::atomic_oper_fetch( oper_type() , dest , val );
+}
+
+
+}
+#endif
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
@ -0,0 +1,442 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_ATOMIC_VIEW_HPP
+#define KOKKOS_ATOMIC_VIEW_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Atomic.hpp>
+namespace Kokkos {
+namespace Impl {
+
+// Proxy object standing in for one element of a view.
+//
+// The proxy stores a volatile pointer to the element.  Read-modify-write
+// operators (++, --, +=, -=, *=, /=, %=, &=, ^=, |=, <<=, >>=) and the
+// inc()/dec() helpers are forwarded to the Kokkos::atomic_* functions.
+// Plain assignment, the value-producing arithmetic/logical/bitwise
+// operators, the comparisons and the conversion operators read or write
+// through the volatile pointer directly (no Kokkos::atomic_* call).
+//
+// Every operator taking a value comes in two overloads, one for a plain
+// reference and one for a volatile reference.  All member functions are
+// const: they modify the pointed-to element, never the proxy itself.
+template<class ViewTraits>
+class AtomicDataElement {
+public:
+  typedef typename ViewTraits::value_type value_type;
+  typedef typename ViewTraits::const_value_type const_value_type;
+  typedef typename ViewTraits::non_const_value_type non_const_value_type;
+
+  // Address of the element this proxy refers to.
+  volatile value_type* const ptr;
+
+  // Wrap the element located at ptr_.
+  KOKKOS_INLINE_FUNCTION
+  AtomicDataElement(value_type* ptr_):ptr(ptr_){}
+
+  // ---- Assignment: direct volatile store, returns the stored value ----
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator = (const_value_type& val) const {
+    *ptr = val;
+    return val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator = (volatile const_value_type& val) const {
+    *ptr = val;
+    return val;
+  }
+
+  // ---- Increment / decrement without a result ----
+
+  KOKKOS_INLINE_FUNCTION
+  void inc() const {
+    Kokkos::atomic_increment(ptr);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void dec() const {
+    Kokkos::atomic_decrement(ptr);
+  }
+
+  // ---- Prefix ++/--: result is the fetched value adjusted by one ----
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ++ () const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,1);
+    return tmp+1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -- () const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-1);
+    return tmp-1;
+  }
+
+  // ---- Postfix ++/--: result is the value returned by atomic_fetch_add ----
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ++ (int) const {
+    return Kokkos::atomic_fetch_add(ptr,1);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -- (int) const {
+    return Kokkos::atomic_fetch_add(ptr,-1);
+  }
+
+  // ---- Compound assignment, forwarded to Kokkos::atomic_* ----
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator += (const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
+    return tmp+val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator += (volatile const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
+    return tmp+val;
+  }
+
+  // Subtraction is expressed as an atomic add of the negated operand.
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -= (const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val);
+    return tmp-val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -= (volatile const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val);
+    return tmp-val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator *= (const_value_type& val) const {
+    return Kokkos::atomic_mul_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator *= (volatile const_value_type& val) const {
+    return Kokkos::atomic_mul_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator /= (const_value_type& val) const {
+    return Kokkos::atomic_div_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator /= (volatile const_value_type& val) const {
+    return Kokkos::atomic_div_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator %= (const_value_type& val) const {
+    return Kokkos::atomic_mod_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator %= (volatile const_value_type& val) const {
+    return Kokkos::atomic_mod_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator &= (const_value_type& val) const {
+    return Kokkos::atomic_and_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator &= (volatile const_value_type& val) const {
+    return Kokkos::atomic_and_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^= (const_value_type& val) const {
+    return Kokkos::atomic_xor_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^= (volatile const_value_type& val) const {
+    return Kokkos::atomic_xor_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator |= (const_value_type& val) const {
+    return Kokkos::atomic_or_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator |= (volatile const_value_type& val) const {
+    return Kokkos::atomic_or_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator <<= (const_value_type& val) const {
+    return Kokkos::atomic_lshift_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator <<= (volatile const_value_type& val) const {
+    return Kokkos::atomic_lshift_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >>= (const_value_type& val) const {
+    return Kokkos::atomic_rshift_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >>= (volatile const_value_type& val) const {
+    return Kokkos::atomic_rshift_fetch(ptr,val);
+  }
+
+  // ---- Arithmetic: read the element once and combine it with val ----
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator + (const_value_type& val) const {
+    return *ptr+val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator + (volatile const_value_type& val) const {
+    return *ptr+val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator - (const_value_type& val) const {
+    return *ptr-val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator - (volatile const_value_type& val) const {
+    return *ptr-val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator * (const_value_type& val) const {
+    return *ptr*val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator * (volatile const_value_type& val) const {
+    return *ptr*val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator / (const_value_type& val) const {
+    return *ptr/val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator / (volatile const_value_type& val) const {
+    return *ptr/val;
+  }
+
+  // Remainder.  (Previously computed *ptr^val, i.e. XOR, by mistake.)
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator % (const_value_type& val) const {
+    return *ptr%val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator % (volatile const_value_type& val) const {
+    return *ptr%val;
+  }
+
+  // ---- Logical operators ----
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ! () const {
+    return !*ptr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator && (const_value_type& val) const {
+    return *ptr&&val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator && (volatile const_value_type& val) const {
+    return *ptr&&val;
+  }
+
+  // Logical OR.  (Previously computed the bitwise *ptr|val by mistake,
+  // which is inconsistent with operator && above.)
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator || (const_value_type& val) const {
+    return *ptr||val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator || (volatile const_value_type& val) const {
+    return *ptr||val;
+  }
+
+  // ---- Bitwise operators ----
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator & (const_value_type& val) const {
+    return *ptr&val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator & (volatile const_value_type& val) const {
+    return *ptr&val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator | (const_value_type& val) const {
+    return *ptr|val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator | (volatile const_value_type& val) const {
+    return *ptr|val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^ (const_value_type& val) const {
+    return *ptr^val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^ (volatile const_value_type& val) const {
+    return *ptr^val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ~ () const {
+    return ~*ptr;
+  }
+
+  // ---- Shifts: the shift count is always an unsigned int ----
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator << (const unsigned int& val) const {
+    return *ptr<<val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator << (volatile const unsigned int& val) const {
+    return *ptr<<val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >> (const unsigned int& val) const {
+    return *ptr>>val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >> (volatile const unsigned int& val) const {
+    return *ptr>>val;
+  }
+
+  // ---- Comparisons against a value ----
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (const_value_type& val) const {
+    return *ptr == val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (volatile const_value_type& val) const {
+    return *ptr == val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (const_value_type& val) const {
+    return *ptr != val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (volatile const_value_type& val) const {
+    return *ptr != val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator >= (const_value_type& val) const {
+    return *ptr >= val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator >= (volatile const_value_type& val) const {
+    return *ptr >= val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator <= (const_value_type& val) const {
+    return *ptr <= val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator <= (volatile const_value_type& val) const {
+    return *ptr <= val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator < (const_value_type& val) const {
+    return *ptr < val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator < (volatile const_value_type& val) const {
+    return *ptr < val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator > (const_value_type& val) const {
+    return *ptr > val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator > (volatile const_value_type& val) const {
+    return *ptr > val;
+  }
+
+  // ---- Conversions: plain volatile read of the element ----
+  // (An atomic_load based variant is kept commented out below.)
+
+  KOKKOS_INLINE_FUNCTION
+  operator const_value_type () const {
+    //return Kokkos::atomic_load(ptr);
+    return *ptr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  operator volatile non_const_value_type () volatile const {
+    //return Kokkos::atomic_load(ptr);
+    return *ptr;
+  }
+};
+
+// Data handle for atomic views: holds the base pointer of the allocation
+// and hands out an AtomicDataElement proxy for each indexed element.
+template<class ViewTraits>
+class AtomicViewDataHandle {
+public:
+  // Base address of the element array.
+  typename ViewTraits::value_type* ptr;
+
+  // Wrap the array starting at ptr_.
+  KOKKOS_INLINE_FUNCTION
+  AtomicViewDataHandle(typename ViewTraits::value_type* ptr_)
+    : ptr(ptr_)
+  {}
+
+  // Return a proxy referring to the element at offset i from the base.
+  template<class iType>
+  KOKKOS_INLINE_FUNCTION
+  AtomicDataElement<ViewTraits> operator[] (const iType& i) const {
+    typename ViewTraits::value_type* const element_address = ptr+i;
+    return AtomicDataElement<ViewTraits>(element_address);
+  }
+};
+
+// Compile-time size check: the primary template is only declared, so asking
+// for ::type with any Size other than the specialized 4 or 8 is a compile
+// error whose message contains this (deliberately descriptive) name.
+template<unsigned Size>
+struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars;
+
+// Size 4 is accepted.
+template<>
+struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<4>
+{ typedef int type; };
+
+// Size 8 is accepted.
+template<>
+struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8>
+{ typedef int64_t type; };
+
+// ViewDataHandle specialization selected when the view's value type is not
+// const and its memory traits request Atomic access.  Element access goes
+// through AtomicViewDataHandle / AtomicDataElement proxies, so elements are
+// not directly reference-able (ReferenceAble == 0).
+template<class ViewTraits>
+class ViewDataHandle<ViewTraits,
+typename enable_if<(!is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) &&
+                   (ViewTraits::memory_traits::Atomic) >::type> {
+  // Never used as a type: naming it forces instantiation of the size check,
+  // which only compiles when the value type occupies 4 or 8 bytes.
+  typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<
+            sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type;
+  typedef ViewDataHandle self_type;
+
+public:
+  enum {ReferenceAble = 0};
+  typedef Impl::AtomicViewDataHandle<ViewTraits> type;
+  typedef Impl::AtomicDataElement<ViewTraits> return_type;
+
+  // Allocate storage for 'count' elements in the view's memory space under
+  // the given label and wrap the resulting pointer in an atomic handle.
+  static type allocate(std::string label, size_t count) {
+    typedef typename ViewTraits::value_type element_type;
+    return type((element_type*)
+                ViewTraits::memory_space::allocate( label ,
+                                                    typeid(element_type) ,
+                                                    sizeof(element_type) ,
+                                                    count ));
+  }
+
+  // Extract the raw element pointer stored in the handle.
+  KOKKOS_INLINE_FUNCTION
+  static typename ViewTraits::value_type* get_raw_ptr(type handle) {
+    typename ViewTraits::value_type* const raw = handle.ptr;
+    return raw;
+  }
+};
+}
+}
+#endif
--- a/lib/kokkos/core/src/impl/Kokkos_Compiler_Macros.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Compiler_Macros.hpp
@ -0,0 +1,274 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_COMPILER_MACROS_HPP
+#define KOKKOS_COMPILER_MACROS_HPP
+
+/*--------------------------------------------------------------------------*/
+/* Language info: C++, CUDA, OPENMP */
+
+#if defined( __CUDA_ARCH__ )
+  // Compiling Cuda code to 'ptx'
+  //
+  // Function qualifier macros for this case: every variant marks the
+  // function as callable from both host and device; they differ only in the
+  // inlining request (__forceinline__, inline, or none).
+
+  #define KOKKOS_FORCEINLINE_FUNCTION  __device__  __host__  __forceinline__
+  #define KOKKOS_INLINE_FUNCTION       __device__  __host__  inline
+  #define KOKKOS_FUNCTION              __device__  __host__
+
+#endif /* #if defined( __CUDA_ARCH__ ) */
+
+#if defined( _OPENMP )
+
+  /*  Compiling with OpenMP.
+   *  The value of _OPENMP is an integer value YYYYMM
+   *  where YYYY and MM are the year and month designation
+   *  of the supported OpenMP API version.
+   */
+
+#endif /* #if defined( _OPENMP ) */
+
+/*--------------------------------------------------------------------------*/
+/* Mapping compiler built-ins to KOKKOS_COMPILER_*** macros */
+
+#if defined( __NVCC__ )
+  // NVIDIA compiler is being used.
+  // Code is parsed and separated into host and device code.
+  // Host code is compiled again with another compiler.
+  // Device code is compiled to 'ptx'.
+  #define KOKKOS_COMPILER_NVCC __NVCC__
+
+  // Requesting C++11 support together with NVCC is rejected at
+  // preprocessing time.
+  #if defined( KOKKOS_HAVE_CXX11 )
+  #error "NVCC does not support C++11"
+  #endif
+
+#endif /* #if defined( __NVCC__ ) */
+
+
+#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */
+
+/* Host compiler detection: map each vendor's built-in macro onto a
+ * KOKKOS_COMPILER_*** macro.  The version-number macros built from several
+ * components are parenthesized so that they expand safely inside larger
+ * expressions (e.g. 2 * KOKKOS_COMPILER_GNU).
+ */
+
+#if defined( __INTEL_COMPILER )
+  #define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
+#elif defined( __ICC )
+  // Old define
+  #define KOKKOS_COMPILER_INTEL __ICC
+#elif defined( __ECC ) 
+  // Very old define
+  #define KOKKOS_COMPILER_INTEL __ECC
+#endif
+
+#if defined( _CRAYC )
+  #define KOKKOS_COMPILER_CRAYC _CRAYC
+#endif
+
+#if defined( __IBMCPP__ )
+  // IBM C++
+  #define KOKKOS_COMPILER_IBM __IBMCPP__
+#elif defined( __IBMC__ )
+  #define KOKKOS_COMPILER_IBM __IBMC__
+#endif
+
+#if defined( __APPLE_CC__ )
+  #define KOKKOS_COMPILER_APPLECC __APPLE_CC__
+#endif
+
+#if defined( __clang__ )
+  #define KOKKOS_COMPILER_CLANG (__clang_major__*100+__clang_minor__*10+__clang_patchlevel__)
+#endif
+
+/* clang and the Intel compiler also define __GNUC__; only report GNU when
+ * neither of them has been detected.
+ */
+#if ! defined( __clang__ ) && ! defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
+  #define KOKKOS_COMPILER_GNU (__GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__)
+#endif
+
+#if defined( __PGIC__ ) && ! defined( __GNUC__ )
+  #define KOKKOS_COMPILER_PGI (__PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__)
+#endif
+
+#endif /* #if ! defined( __CUDA_ARCH__ ) */
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/* Intel compiler macros */
+
+#if defined( KOKKOS_COMPILER_INTEL )
+
+  // Loop pragmas flagged as available with the Intel compiler.
+  #define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  #define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  #define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  #define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  #define KOKKOS_HAVE_PRAGMA_SIMD 1
+
+  // Enable inline assembly by default when the compiler version value is at
+  // least 1200, unless KOKKOS_ENABLE_ASM was already set by the user.
+  #if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_ENABLE_ASM )
+    #define KOKKOS_ENABLE_ASM 1
+  #endif
+
+  #define KOKKOS_FORCEINLINE_FUNCTION  __forceinline
+
+  #if defined( __MIC__ )
+    // Compiling for Xeon Phi
+  #endif
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+/* Cray compiler macros */
+
+#if defined( KOKKOS_COMPILER_CRAYC )
+
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+/* IBM Compiler macros */
+
+#if defined( KOKKOS_COMPILER_IBM )
+
+  #define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  //#define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  //#define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_COMPILER_CLANG )
+
+  //#define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  //#define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  //#define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
+
+  #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_COMPILER_GNU ) 
+
+  // None of the loop pragma feature flags is enabled for GNU.
+  //#define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  //#define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  //#define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
+
+  #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
+
+  // Enable inline assembly by default, except on the PowerPC targets listed
+  // below or when KOKKOS_ENABLE_ASM was already set by the user.
+  #if ! defined( KOKKOS_ENABLE_ASM ) && \
+      ! ( defined( __powerpc) || \
+          defined(__powerpc__) || \
+          defined(__powerpc64__) || \
+          defined(__POWERPC__) || \
+          defined(__ppc__) || \
+          defined(__ppc64__) )
+    #define KOKKOS_ENABLE_ASM 1
+  #endif
+
+  // Prefetch hints via __builtin_prefetch(addr, rw, locality):
+  // rw is 0 for the load variant and 1 for the store variant; locality 0 is
+  // used for both (per the GCC built-in's documentation, no temporal locality).
+  #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr,0,0)
+  #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr,1,0)
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_COMPILER_PGI )
+
+  #define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  #define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  #define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_COMPILER_NVCC )
+
+  #if defined(__CUDA_ARCH__ )
+    #define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  #endif
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+/* Select compiler dependent interface for atomics */
+
+/* Only auto-select an atomics interface when NONE has been pre-defined.
+ * (The tests were previously joined with '||', which made the guard true
+ * whenever at least one macro was missing, so a user-selected backend could
+ * be joined by a second, automatically chosen one.)
+ * Both spellings of the OpenMP 3.1 macro are honored: the one this guard
+ * originally tested (..._OPENMP31) and the one defined below (..._OMP31).
+ */
+#if ! defined( KOKKOS_ATOMICS_USE_CUDA ) && \
+    ! defined( KOKKOS_ATOMICS_USE_GNU ) && \
+    ! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
+    ! defined( KOKKOS_ATOMICS_USE_OMP31 ) && \
+    ! defined( KOKKOS_ATOMICS_USE_OPENMP31 )
+
+/* Atomic selection is not pre-defined, choose from language and compiler. */
+
+#if defined( __CUDA_ARCH__ )
+
+  /* Compiling device code: use the Cuda atomics. */
+  #define KOKKOS_ATOMICS_USE_CUDA
+
+#elif defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG )
+
+  #define KOKKOS_ATOMICS_USE_GNU
+
+#elif defined( KOKKOS_COMPILER_INTEL ) || defined( KOKKOS_COMPILER_CRAYC )
+
+  #define KOKKOS_ATOMICS_USE_INTEL
+
+#elif defined( _OPENMP ) && ( 201107 <= _OPENMP )
+
+  /* Fall back to OpenMP atomics when _OPENMP reports at least 201107. */
+  #define KOKKOS_ATOMICS_USE_OMP31
+
+#else
+
+  #error "Compiler does not support atomic operations"
+
+#endif
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#endif /* #ifndef KOKKOS_COMPILER_MACROS_HPP */
+
--- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
@ -0,0 +1,189 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <cctype>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+
+namespace Kokkos {
+
+  typedef Kokkos::DefaultExecutionSpace::host_mirror_device_type  DefaultHostMirrorDeviceType ;
+
+  enum { DefaultIsNotHostSpace = ! Impl::is_same< Kokkos::DefaultExecutionSpace , DefaultHostMirrorDeviceType >::value };
+
+  // Initialize Kokkos with default settings: first the host mirror device
+  // (only when it differs from the default execution space), then the
+  // default execution space itself.
+  void initialize() {
+    const bool has_separate_host_space = DefaultIsNotHostSpace;
+
+    if ( has_separate_host_space )
+      DefaultHostMirrorDeviceType::initialize();
+
+    Kokkos::DefaultExecutionSpace::initialize();
+  }
+
+  // Initialize Kokkos from command line arguments.
+  //
+  // Recognized options (all others are skipped untouched):
+  //   --threads INT      thread count passed to the host-side initialize
+  //   --numa INT         NUMA region count, only used together with --threads
+  //   --device INT       device id passed to Kokkos::Cuda::initialize
+  //   --ngpus INT [INT]  derive the device id from the local MPI rank found
+  //                      in the environment (rank modulo INT); the optional
+  //                      second integer is a device id to skip
+  //   --help             print the option summary to std::cout
+  //
+  // narg : number of entries in arg
+  // arg  : argument strings
+  //
+  // Reports errors through Impl::throw_runtime_exception when an option is
+  // missing its value, or when --ngpus is given a non-positive count and a
+  // local rank is present in the environment.
+  void initialize(int narg, char* arg[]) {
+
+    int nthreads = -1;
+    int numa = -1;
+    int device = -1;
+
+    int iarg = 0;
+
+    while (iarg < narg) {
+      if (strcmp(arg[iarg],"--threads") == 0) {
+         if (iarg+2 > narg)
+            Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+         nthreads = atoi(arg[iarg+1]);
+         iarg+=2;
+      } else if (strcmp(arg[iarg],"--numa") == 0) {
+         if (iarg+2 > narg)
+            Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+         numa = atoi(arg[iarg+1]);
+         iarg+=2;
+      } else if (strcmp(arg[iarg],"--device") == 0) {
+         if (iarg+2 > narg)
+            Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+         device = atoi(arg[iarg+1]);
+         iarg+=2;
+      } else if (strcmp(arg[iarg],"--ngpus") == 0) {
+         if (iarg+2 > narg)
+            Impl::throw_runtime_exception("Error: expecting one or two integer numbers after command line argument '--ngpus'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+         const int ngpu = atoi(arg[iarg+1]);
+         iarg += 2;
+
+         // Optional second integer: id of a device to skip.  'iarg' already
+         // points past the device count, so the candidate is arg[iarg]
+         // (the previous code looked two slots further, at arg[iarg+2]).
+         // The cast keeps isdigit() well-defined for negative char values.
+         int skip_gpu = 9999;
+         if (iarg < narg && isdigit(static_cast<unsigned char>(arg[iarg][0]))) {
+           skip_gpu = atoi(arg[iarg]);
+           iarg++;
+         }
+
+         // Environment variables that may carry the node-local MPI rank.
+         // All of them are examined in this order, so a later variable
+         // overrides the device chosen from an earlier one.
+         static const char * const local_rank_vars[] = {
+           "SLURM_LOCALID" ,
+           "MV2_COMM_WORLD_LOCAL_RANK" ,
+           "OMPI_COMM_WORLD_LOCAL_RANK"
+         };
+         const int num_rank_vars =
+           int( sizeof(local_rank_vars) / sizeof(local_rank_vars[0]) );
+
+         for (int ivar = 0; ivar < num_rank_vars; ++ivar) {
+           const char * const str = getenv(local_rank_vars[ivar]);
+           if (str) {
+             // Guard the modulo below against a zero or negative count.
+             if (ngpu <= 0)
+               Impl::throw_runtime_exception("Error: expecting a positive integer number of devices after command line argument '--ngpus'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+             const int local_rank = atoi(str);
+             device = local_rank % ngpu;
+             if (device >= skip_gpu) device++;
+           }
+         }
+      } else if (strcmp(arg[iarg],"--help") == 0) {
+         std::cout << std::endl;
+         std::cout << "-------------------------------" << std::endl;
+         std::cout << "-Kokkos command line arguments-" << std::endl;
+         std::cout << "-------------------------------" << std::endl;
+         std::cout << std::endl;
+         std::cout << "--help               : print this message" << std::endl;
+         std::cout << "--threads INT        : specify total number of threads or" << std::endl;
+         std::cout << "                       number of threads per NUMA region if " << std::endl;
+         std::cout << "                       used in conjunction with '--numa' option. " << std::endl;
+         std::cout << "--numa INT           : specify number of NUMA regions used by process." << std::endl;
+         std::cout << "--device INT         : specify device id to be used by Kokkos. " << std::endl;
+         std::cout << "--ngpus INT [INT]    : used when running MPI jobs. Specify number of" << std::endl;
+         std::cout << "                       devices per node to be used. Process to device" << std::endl;
+         std::cout << "                       mapping happens by obtaining the local MPI rank" << std::endl;
+         std::cout << "                       and assigning devices round-robin. The optional" << std::endl;
+         std::cout << "                       second argument allows for an existing device" << std::endl;
+         std::cout << "                       to be ignored. This is most useful on workstations" << std::endl;
+         std::cout << "                       with multiple GPUs of which one is used to drive" << std::endl;
+         std::cout << "                       screen output." << std::endl;
+         std::cout << std::endl;
+         std::cout << "-------------------------------" << std::endl;
+         std::cout << std::endl;
+         iarg++;
+      } else
+      iarg++;
+    }
+
+
+    // Host mirror device first (only when it is a distinct space).
+    if(DefaultIsNotHostSpace) {
+      if(nthreads>0) {
+        if(numa>0)
+          DefaultHostMirrorDeviceType::initialize(nthreads,numa);
+        else
+          DefaultHostMirrorDeviceType::initialize(nthreads);
+      } else
+        DefaultHostMirrorDeviceType::initialize();
+    }
+
+    // Then the default execution space: Cuda takes the device id, any other
+    // space takes the thread / NUMA settings.
+    #ifdef KOKKOS_HAVE_CUDA
+    if(Impl::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Cuda>::value) {
+      if(device>-1)
+        Kokkos::Cuda::initialize(device);
+      else
+        Kokkos::Cuda::initialize();
+    } else
+    #endif
+    {
+      if(nthreads>0) {
+        if(numa>0)
+          Kokkos::DefaultExecutionSpace::initialize(nthreads,numa);
+        else
+          Kokkos::DefaultExecutionSpace::initialize(nthreads);
+      } else
+        Kokkos::DefaultExecutionSpace::initialize();
+    }
+  }
+
+  // Finalize Kokkos: the host mirror device (only when it differs from the
+  // default execution space) followed by the default execution space.
+  void finalize() {
+    const bool has_separate_host_space = DefaultIsNotHostSpace;
+
+    if ( has_separate_host_space )
+      DefaultHostMirrorDeviceType::finalize();
+
+    Kokkos::DefaultExecutionSpace::finalize();
+  }
+
+  // Fence the host mirror device (only when it differs from the default
+  // execution space) and then the default execution space.
+  void fence() {
+    const bool has_separate_host_space = DefaultIsNotHostSpace;
+
+    if ( has_separate_host_space )
+      DefaultHostMirrorDeviceType::fence();
+
+    Kokkos::DefaultExecutionSpace::fence();
+  }
+}
--- a/lib/kokkos/core/src/impl/Kokkos_CrsArray_factory.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_CrsArray_factory.hpp
@ -0,0 +1,223 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP
+#define KOKKOS_IMPL_CRSARRAY_FACTORY_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+// Build a HostMirror of a CrsArray: mirror allocations are created for both
+// the row map and the entries, and the contents of 'view' are deep-copied
+// into them before the mirror is returned.
+template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
+inline
+typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & view )
+{
+  typedef CrsArray< DataType , Arg1Type , Arg2Type , SizeType >  source_type ;
+  typedef typename source_type::HostMirror                       mirror_type ;
+  typedef typename source_type::row_map_type::HostMirror         row_map_mirror_type ;
+
+  // Keep a separate handle to the row map mirror: it is the destination
+  // of the deep copy below.
+  row_map_mirror_type host_row_map = create_mirror( view.row_map );
+
+  mirror_type mirror ;
+
+  mirror.row_map = host_row_map ; // Assignment of 'const' from 'non-const'
+  mirror.entries = create_mirror( view.entries );
+
+  // Copy the contents of the source into the freshly created mirrors.
+  deep_copy( host_row_map   , view.row_map );
+  deep_copy( mirror.entries , view.entries );
+
+  return mirror ;
+}
+
+// Overload selected (through enable_if) when
+// ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace is true:
+// returns 'view' itself, converted to the HostMirror type, without calling
+// create_mirror.
+template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
+inline
+typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror_view( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & view ,
+                    typename Impl::enable_if< ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 )
+{
+  return view ;
+}
+
+// Overload selected (through enable_if) when
+// ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace is false:
+// forwards to create_mirror, which allocates a host mirror and deep-copies
+// the row map and entries of 'view' into it.
+template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
+inline
+typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror_view( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & view ,
+                    typename Impl::enable_if< ! ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 )
+{
+  return create_mirror( view );
+}
+
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+// Create a CrsArray from per-row entry counts.
+//
+//   label : label given to the entries allocation
+//   input : input[i] is the number of entries in row i
+//
+// The row map receives the running sum of the counts (input.size() + 1
+// offsets, starting at zero).  The entries are allocated with the total
+// count; this function does not assign any value to them.
+template< class CrsArrayType , class InputSizeType >
+inline
+typename CrsArrayType::crsarray_type
+create_crsarray( const std::string & label ,
+                 const std::vector< InputSizeType > & input )
+{
+  typedef CrsArrayType                         result_type ;
+  typedef typename result_type::entries_type   entries_type ;
+
+  typedef View< typename result_type::size_type [] ,
+                typename result_type::array_layout ,
+                typename result_type::device_type > row_offsets_type ;
+
+  const size_t row_count = input.size();
+
+  result_type result ;
+
+  {
+    // Row offsets are assembled in a host mirror and then copied to the
+    // work view that becomes the row map.
+    row_offsets_type row_offsets( "tmp" , row_count + 1 );
+
+    typename row_offsets_type::HostMirror host_offsets =
+      create_mirror_view( row_offsets );
+
+    size_t running_total = 0 ;
+
+    host_offsets[0] = 0 ;
+
+    for ( size_t row = 0 ; row < row_count ; ++row ) {
+      running_total += input[row] ;
+      host_offsets[row+1] = running_total ;
+    }
+
+    deep_copy( row_offsets , host_offsets );
+
+    result.entries = entries_type( label , running_total );
+    result.row_map = row_offsets ;
+  }
+
+  return result ;
+}
+
+//----------------------------------------------------------------------------
+
+// Create a CrsArray from nested vectors.
+//
+//   label : label given to the entries allocation
+//   input : input[i] holds the entries of row i
+//
+// The row map receives the running sum of the row lengths
+// (input.size() + 1 offsets, starting at zero) and the entries receive the
+// rows' values concatenated in row order.
+template< class CrsArrayType , class InputSizeType >
+inline
+typename CrsArrayType::crsarray_type
+create_crsarray( const std::string & label ,
+                 const std::vector< std::vector< InputSizeType > > & input )
+{
+  typedef CrsArrayType                         result_type ;
+  typedef typename result_type::entries_type   entries_type ;
+
+  // Compile-time check that the entries have a rank-one shape.  The
+  // typedef is instantiated once, and the instance is cast to void, so
+  // that neither an "unused typedef" nor an "unused variable" warning is
+  // emitted.
+
+  typedef typename
+    Impl::assert_shape_is_rank_one< typename entries_type::shape_type >::type
+      rank_one_check ;
+  {
+    rank_one_check unused_instance ;
+    (void) unused_instance ;
+  }
+
+  typedef View< typename result_type::size_type [] ,
+                typename result_type::array_layout ,
+                typename result_type::device_type > row_offsets_type ;
+
+  const size_t row_count = input.size();
+
+  result_type result ;
+
+  // Row map: running sum of the row lengths, assembled in a host mirror.
+  {
+    row_offsets_type row_offsets( "tmp" , row_count + 1 );
+
+    typename row_offsets_type::HostMirror host_offsets =
+      create_mirror_view( row_offsets );
+
+    size_t running_total = 0 ;
+
+    host_offsets[0] = 0 ;
+
+    for ( size_t row = 0 ; row < row_count ; ++row ) {
+      running_total += input[row].size();
+      host_offsets[row+1] = running_total ;
+    }
+
+    deep_copy( row_offsets , host_offsets );
+
+    result.entries = entries_type( label , running_total );
+    result.row_map = row_offsets ;
+  }
+
+  // Entries: concatenate the rows through a host mirror.
+  {
+    typename entries_type::HostMirror host_entries =
+      create_mirror_view( result.entries );
+
+    size_t slot = 0 ;
+
+    for ( size_t row = 0 ; row < row_count ; ++row ) {
+      const std::vector< InputSizeType > & row_values = input[row] ;
+
+      for ( size_t k = 0 ; k < row_values.size() ; ++k ) {
+        host_entries( slot ) = row_values[k] ;
+        ++slot ;
+      }
+    }
+
+    deep_copy( result.entries , host_entries );
+  }
+
+  return result ;
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP */
+
--- a/lib/kokkos/core/src/impl/Kokkos_Error.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Error.cpp
@ -0,0 +1,184 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <ostream>
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+#include <impl/Kokkos_Error.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Throw std::runtime_error whose message is 'msg' followed by whatever
+// traceback_callstack() writes for the current call stack.
+void throw_runtime_exception( const std::string & msg )
+{
+  std::ostringstream text ;
+
+  text << msg ;
+  traceback_callstack( text );
+
+  const std::string what = text.str();
+
+  throw std::runtime_error( what );
+}
+
+
+// Format a byte count for display: the value is scaled by powers of 1024
+// to the largest of B, K, M, G that keeps it at or above one unit (G is
+// the largest unit used) and printed with four significant digits,
+// followed by a space and the unit letter.
+std::string human_memory_size(size_t arg_bytes)
+{
+  // Unit suffixes, indexed by how many times the value was divided by 1024.
+  static const char * const unit_suffix[] = { " B" , " K" , " M" , " G" };
+  enum { LARGEST_UNIT = 3 };
+
+  const double step = 1024 ;
+
+  double scaled = arg_bytes ;
+  int    unit   = 0 ;
+
+  // Division by a power of two is exact, so repeated scaling gives the
+  // same value as a single division by the unit size.
+  while ( unit < LARGEST_UNIT && ! ( scaled < step ) ) {
+    scaled /= step ;
+    ++unit ;
+  }
+
+  std::ostringstream text ;
+  text << std::setprecision(4) << scaled << unit_suffix[unit] ;
+  return text.str();
+}
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( __GNUC__ ) && defined( ENABLE_TRACEBACK )
+
+/*  This is only known to work with GNU C++
+ *  Must be compiled with '-rdynamic'
+ *  Must be linked with   '-ldl'
+ */
+
+/* Print call stack into an error stream,
+ * so one knows in which function the error occurred.
+ *
+ * Code copied from:
+ *   http://stupefydeveloper.blogspot.com/2008/10/cc-call-stack.html
+ *
+ * License on this site:
+ *   This blog is licensed under a
+ *   Creative Commons Attribution-Share Alike 3.0 Unported License.
+ *
+ *   http://creativecommons.org/licenses/by-sa/3.0/
+ *
+ * Modified to output to std::ostream.
+ */
+#include <signal.h>
+#include <execinfo.h>
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <stdlib.h>
+
+namespace Kokkos {
+namespace Impl {
+
+// Write the current call stack (at most MAX_DEPTH frames, skipping the
+// frame of this function) to 'msg'.  Each frame that dladdr() can resolve
+// and that has a non-empty symbol name is printed as
+// "  object: <file> function: <name>", with the name demangled when
+// __cxa_demangle succeeds.
+void traceback_callstack( std::ostream & msg )
+{
+  using namespace abi;
+
+  enum { MAX_DEPTH = 32 };
+
+  void * frames[MAX_DEPTH];
+
+  const int frame_count = backtrace(frames, MAX_DEPTH);
+
+  msg << std::endl << "Call stack {" << std::endl ;
+
+  // Frame 0 is this function itself, so start at frame 1.
+  for (int f = 1; f < frame_count; ++f)
+  {
+    Dl_info info;
+
+    if ( dladdr(frames[f], &info) ) {
+
+      int demangle_status;
+
+      // On success the demangled name is heap allocated and must be freed.
+      char * const pretty = __cxa_demangle(info.dli_sname, NULL, 0, &demangle_status);
+
+      const char * const name =
+        ( demangle_status == 0 && pretty ) ? pretty : info.dli_sname ;
+
+      if ( name && *name != 0 ) {
+        msg << "  object: " << info.dli_fname
+            << " function: " << name
+            << std::endl ;
+      }
+
+      if ( pretty ) {
+        free(pretty);
+      }
+    }
+  }
+  msg << "}" ;
+}
+
+}
+}
+
+#else
+
+namespace Kokkos {
+namespace Impl {
+
+// Fallback used when the GNU traceback implementation is not compiled in:
+// only writes a notice that no traceback is available.
+void traceback_callstack( std::ostream & msg )
+{
+  msg << std::endl ;
+  msg << "Traceback functionality not available" ;
+  msg << std::endl ;
+}
+
+}
+}
+
+#endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_Error.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Error.hpp
@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_ERROR_HPP
+#define KOKKOS_IMPL_ERROR_HPP
+
+#include <string>
+#include <iosfwd>
+
+namespace Kokkos {
+namespace Impl {
+
+// Throw std::runtime_error carrying the given message followed by the
+// output of traceback_callstack().
+void throw_runtime_exception( const std::string & );
+
+// Write a description of the current call stack to the stream, or a notice
+// that traceback is unavailable when that support is not compiled in.
+void traceback_callstack( std::ostream & );
+
+// Format a byte count with four significant digits and a unit suffix
+// (B, K, M or G; units are powers of 1024).
+std::string human_memory_size(size_t arg_bytes);
+
+}
+}
+
+#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */
+
--- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
@ -0,0 +1,290 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <memory.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <cstring>
+
+#include <Kokkos_HostSpace.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace {
+
+// Tracking record for one host allocation.  In addition to the data kept
+// by the base entry it remembers the pointer that was handed to the
+// constructor so that the destructor can release it.
+class HostMemoryTrackingEntry : public Impl::MemoryTrackingEntry
+{
+public:
+
+  // Pointer passed to the constructor; released by the destructor.
+  void * const ptr_alloc ;
+
+  // entry_label : label of the allocation
+  // entry_type  : type information of the allocated scalar
+  // alloc_ptr   : pointer to the allocated memory
+  // alloc_bytes : size passed to the base entry
+  HostMemoryTrackingEntry( const std::string    & entry_label ,
+                           const std::type_info & entry_type ,
+                           void * const           alloc_ptr ,
+                           const size_t           alloc_bytes )
+    : Impl::MemoryTrackingEntry( entry_label , entry_type , alloc_ptr , alloc_bytes )
+    , ptr_alloc( alloc_ptr )
+  {
+  }
+
+  ~HostMemoryTrackingEntry();
+};
+
+// Release the tracked allocation.  The deallocation call is selected by the
+// same preprocessor condition that host_allocate_not_thread_safe uses to
+// choose between _mm_malloc and the posix_memalign / malloc paths.
+HostMemoryTrackingEntry::~HostMemoryTrackingEntry()
+{
+#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
+   _mm_free( ptr_alloc );
+#else
+   free( ptr_alloc );
+#endif
+}
+
+// Access the single MemoryTracking instance used for Kokkos::HostSpace.
+// It is a function-local static, constructed on the first call.
+Impl::MemoryTracking & host_space_singleton()
+{
+  static Impl::MemoryTracking tracker( "Kokkos::HostSpace" );
+
+  return tracker ;
+}
+
+// Check whether the host space tracking may be modified right now.
+// Returns true when it may.  Otherwise a diagnostic naming 'label' (the
+// calling operation) is written to std::cerr and false is returned.
+// Modification is refused while HostSpace::in_parallel() reports true, or
+// when the tracking singleton reports that it no longer exists.
+bool host_space_verify_modifiable( const char * const label )
+{
+  static const char error_in_parallel[] = "Called with HostSpace::in_parallel()" ;
+  static const char error_not_exists[]  = "Called after return from main()" ;
+
+  const char * reason = 0 ;
+
+  if ( HostSpace::in_parallel() ) {
+    reason = error_in_parallel ;
+  }
+  else if ( ! host_space_singleton().exists() ) {
+    reason = error_not_exists ;
+  }
+
+  if ( 0 == reason ) {
+    return true ;
+  }
+
+  std::cerr << "Kokkos::HostSpace::" << label << " ERROR : " << reason << std::endl ;
+
+  return false ;
+}
+
+} // namespace <blank>
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+// Allocate host memory for 'scalar_count' objects of 'scalar_size' bytes,
+// aligned to MEMORY_ALIGNMENT, and register the allocation with the host
+// space memory tracking.
+//
+//   label        : label recorded with the allocation
+//   scalar_type  : type information recorded with the allocation
+//   scalar_size  : size in bytes of one object
+//   scalar_count : number of objects
+//
+// Returns the aligned pointer, or null when either size argument is zero.
+// Throws (through throw_runtime_exception) when the requested byte count
+// does not fit in size_t or when the aligned allocation fails.
+void * host_allocate_not_thread_safe(
+  const std::string    & label ,
+  const std::type_info & scalar_type ,
+  const size_t           scalar_size ,
+  const size_t           scalar_count )
+{
+  void * ptr = 0 ;
+
+  if ( 0 < scalar_size && 0 < scalar_count ) {
+    void * ptr_alloc = 0 ;
+    size_t count_alloc = scalar_count ;
+
+    // Largest object count whose total byte size still fits in size_t.
+    // Requests beyond it are never passed to the allocator, because the
+    // wrapped product would silently yield an undersized allocation.
+    const size_t count_max = ( ~ size_t(0) ) / scalar_size ;
+
+#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
+
+    if ( count_alloc <= count_max ) {
+      ptr = ptr_alloc = _mm_malloc( scalar_size * count_alloc , MEMORY_ALIGNMENT );
+    }
+
+#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
+      ( defined( _XOPEN_SOURCE )   && _XOPEN_SOURCE   >= 600 )
+
+    // posix_memalign reports failure through its return value; the output
+    // pointer is not reliable in that case, so it is reset explicitly.
+    if ( count_alloc <= count_max &&
+         0 != posix_memalign( & ptr_alloc , MEMORY_ALIGNMENT , scalar_size * count_alloc ) ) {
+      ptr_alloc = 0 ;
+    }
+    ptr = ptr_alloc ;
+
+#else
+
+    // Over-allocate to guarantee enough aligned space.
+
+    const size_t count_pad = ( MEMORY_ALIGNMENT + scalar_size - 1 ) / scalar_size ;
+
+    count_alloc += count_pad ;
+
+    // 'count_pad <= count_alloc' detects wrap-around of the addition.
+    if ( count_pad <= count_alloc && count_alloc <= count_max ) {
+      ptr_alloc = malloc( scalar_size * count_alloc );
+    }
+
+    if ( ptr_alloc ) {
+      ptr = static_cast<unsigned char *>(ptr_alloc) +
+            ( MEMORY_ALIGNMENT - reinterpret_cast<ptrdiff_t>(ptr_alloc) % MEMORY_ALIGNMENT );
+    }
+
+#endif
+
+    if ( ptr_alloc && ptr_alloc <= ptr &&
+         0 == ( reinterpret_cast<ptrdiff_t>(ptr) % MEMORY_ALIGNMENT ) ) {
+      // The tracking entry owns 'ptr_alloc' and releases it when destroyed.
+      host_space_singleton().insert(
+        new HostMemoryTrackingEntry( label , scalar_type , ptr_alloc , scalar_size * count_alloc ) );
+    }
+    else {
+      std::ostringstream msg ;
+      msg << "Kokkos::Impl::host_allocate_not_thread_safe( "
+          << label
+          << " , " << scalar_type.name()
+          << " , " << scalar_size
+          << " , " << scalar_count
+          << " ) FAILED aligned memory allocation" ;
+      Kokkos::Impl::throw_runtime_exception( msg.str() );
+    }
+  }
+
+  return ptr ;
+}
+
+// Forward a reference-count decrement for 'ptr' to the host space memory
+// tracking.
+void host_decrement_not_thread_safe( const void * ptr )
+{
+  MemoryTracking & tracker = host_space_singleton();
+
+  tracker.decrement( ptr );
+}
+
+// Host-to-host deep copy: copy 'n' bytes from 'src' to 'dst'.
+DeepCopy<HostSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
+{
+  // memcpy requires valid pointers even for a zero-byte copy, so an empty
+  // copy is skipped entirely.
+  if ( 0 < n ) {
+    memcpy( dst , src , n );
+  }
+}
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+// File-local registry of "in parallel" query functions.
+namespace {
+
+// Capacity of the registry array below.
+static const int QUERY_DEVICE_IN_PARALLEL_MAX = 16 ;
+
+// Signature of a registered query; HostSpace::in_parallel() treats a
+// nonzero return value as "currently in parallel".
+typedef int (* QueryDeviceInParallelPtr )();
+
+// Registered queries; the first s_in_parallel_query_count slots are in use.
+QueryDeviceInParallelPtr s_in_parallel_query[ QUERY_DEVICE_IN_PARALLEL_MAX ] ;
+int s_in_parallel_query_count = 0 ;
+
+} // namespace <empty>
+
+// Register a query function that HostSpace::in_parallel() will consult.
+//
+//   device_in_parallel : function returning nonzero while its device is
+//                        executing in parallel; must not be null.
+//
+// Throws (through throw_runtime_exception) when the pointer is null, when
+// the new query or any already registered query currently reports being
+// in parallel, or when a new query would exceed the registry capacity.
+// Registering a function that is already registered is a no-op, even when
+// the registry is full.
+void HostSpace::register_in_parallel( int (*device_in_parallel)() )
+{
+  if ( 0 == device_in_parallel ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) );
+  }
+
+  // 'i' stays at -1 when the new query itself reports in-parallel;
+  // otherwise it stops at the first registered query that does.
+  int i = -1 ;
+
+  if ( ! (device_in_parallel)() ) {
+    for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
+  }
+
+  if ( i < s_in_parallel_query_count ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) );
+
+  }
+
+  // Look for an existing registration of the same function.
+  for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
+
+  if ( i == s_in_parallel_query_count ) {
+
+    // Only a genuinely new entry can overflow the registry, so the
+    // capacity is checked here rather than before the duplicate search.
+    if ( QUERY_DEVICE_IN_PARALLEL_MAX <= s_in_parallel_query_count ) {
+      Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
+    }
+
+    s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
+  }
+}
+
+// Returns 1 when any registered query reports being in parallel, and 0
+// otherwise.  Queries are called in registration order and evaluation
+// stops at the first one that returns nonzero.
+int HostSpace::in_parallel()
+{
+  const int query_count = s_in_parallel_query_count ;
+
+  for ( int q = 0 ; q < query_count ; ++q ) {
+    if ( (*(s_in_parallel_query[q]))() ) {
+      return 1 ;
+    }
+  }
+
+  return 0 ;
+}
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+// Allocate tracked host memory.  Returns null, after
+// host_space_verify_modifiable has reported the reason on std::cerr, when
+// the host space may not be modified right now; otherwise forwards to
+// Impl::host_allocate_not_thread_safe.
+void * HostSpace::allocate(
+  const std::string    & label ,
+  const std::type_info & scalar_type ,
+  const size_t           scalar_size ,
+  const size_t           scalar_count )
+{
+  if ( ! host_space_verify_modifiable("allocate") ) {
+    return 0 ;
+  }
+
+  return Impl::host_allocate_not_thread_safe( label , scalar_type , scalar_size , scalar_count );
+}
+
+// Increment the reference count of the tracked allocation containing
+// 'ptr'.  Does nothing (beyond the diagnostic printed by
+// host_space_verify_modifiable) when the host space may not be modified.
+void HostSpace::increment( const void * ptr )
+{
+  if ( ! host_space_verify_modifiable("increment") ) {
+    return ;
+  }
+
+  host_space_singleton().increment( ptr );
+}
+
+// Decrement the reference count of the tracked allocation containing
+// 'ptr'.  Does nothing (beyond the diagnostic printed by
+// host_space_verify_modifiable) when the host space may not be modified.
+void HostSpace::decrement( const void * ptr )
+{
+  if ( ! host_space_verify_modifiable("decrement") ) {
+    return ;
+  }
+
+  Impl::host_decrement_not_thread_safe( ptr );
+}
+
+// Print every tracked host allocation to 'o', one per line, each line
+// indented by two spaces.
+void HostSpace::print_memory_view( std::ostream & o )
+{
+  const std::string indent( "  " );
+
+  host_space_singleton().print( o , indent );
+}
+
+// Return the label of the tracked allocation containing 'p', or the text
+// "ERROR NOT DEFINED" when 'p' is not inside a tracked allocation.
+std::string HostSpace::query_label( const void * p )
+{
+  const Impl::MemoryTrackingEntry * const entry =
+    host_space_singleton().query( p );
+
+  if ( 0 != entry ) {
+    return entry->label ;
+  }
+
+  return std::string("ERROR NOT DEFINED");
+}
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
--- a/lib/kokkos/core/src/impl/Kokkos_MemoryTracking.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_MemoryTracking.cpp
@ -0,0 +1,285 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stddef.h>
+#include <limits>
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_MemoryTracking.hpp>
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+//----------------------------------------------------------------------------
+// Fast search for result[-1] <= val < result[0].
+// Requires result[max] == upper_bound.
+// Start with a binary search until the search range is
+// less than LINEAR_LIMIT, then switch to linear search.
+
+// Return the index of the first element of begin[0..length) that is
+// strictly greater than 'val'.
+//
+// Precondition: the values are ascending and the last element is
+// std::numeric_limits<ptrdiff_t>::max(); the final scan relies on that
+// sentinel to terminate and performs no bounds check of its own.
+int upper_bound( const ptrdiff_t * const begin , unsigned length ,
+                 const ptrdiff_t val )
+{
+  enum { LINEAR_LIMIT = 32 };
+
+  const ptrdiff_t * cursor    = begin ;
+  unsigned          remaining = length ;
+
+  // Bisect until the candidate range is short enough for a linear scan.
+  while ( remaining > LINEAR_LIMIT ) {
+    const unsigned mid = remaining / 2 ;
+
+    if ( val < cursor[mid] ) {
+      // Answer is at or before 'mid': keep the lower half.
+      remaining = mid ;
+    }
+    else {
+      // Answer is after 'mid': drop the lower half and 'mid' itself.
+      cursor    += mid + 1 ;
+      remaining -= mid + 1 ;
+    }
+  }
+
+  // Linear scan; stops at the sentinel at the latest.
+  while ( *cursor <= val ) {
+    ++cursor ;
+  }
+
+  return cursor - begin ;
+}
+
+} // namespace
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// Create the tracking structure for the memory space named 'space'.
+// Both arrays start with a single sentinel element whose address is the
+// largest ptrdiff_t value.
+MemoryTracking::MemoryTracking( const std::string & space )
+  : m_space( space ), m_tracking(), m_tracking_end()
+{
+  enum { INITIAL_CAPACITY = 64 };
+
+  const ptrdiff_t sentinel_end = std::numeric_limits<ptrdiff_t>::max();
+  void * const    sentinel_ptr = reinterpret_cast<void*>( sentinel_end );
+
+  m_tracking.reserve( INITIAL_CAPACITY );
+  m_tracking_end.reserve( INITIAL_CAPACITY );
+
+  // Sentinel entry marking the end of the tracked ranges.
+
+  m_tracking.push_back( new MemoryTrackingEntry( "sentinal" , typeid(void) , sentinel_ptr , 0 ) );
+  m_tracking_end.push_back( sentinel_end );
+}
+
+// Report leaked allocations (or a corrupted structure) on std::cerr,
+// release the sentinel entry created by the constructor and clear all
+// members.  Never lets an exception escape.
+MemoryTracking::~MemoryTracking()
+{
+  const ptrdiff_t max = std::numeric_limits<ptrdiff_t>::max();
+
+  try {
+    if ( 1 < m_tracking.size() ) {
+      std::cerr << m_space << " destroyed with memory leaks:" << std::endl ;
+      print( std::cerr , std::string("  ") );
+    }
+    else if ( 1 != m_tracking_end.size() || m_tracking_end.back() != max ) {
+      std::cerr << m_space << " corrupted data structure" << std::endl ;
+    }
+
+    // The sentinel was allocated with 'new' in the constructor and is
+    // always the last entry of an intact structure; release it here.
+    // Leaked user entries are only reported, not deleted.
+    if ( ! m_tracking.empty() &&
+         m_tracking.size() == m_tracking_end.size() &&
+         m_tracking_end.back() == max ) {
+      delete m_tracking.back();
+    }
+
+    // Deallocate memory within the try-catch block:
+    m_space        = std::string();
+    m_tracking     = std::vector<MemoryTrackingEntry*>();
+    m_tracking_end = std::vector<ptrdiff_t>();
+
+  } catch( ... ) {}
+}
+
+// Start tracking 'entry' with a reference count of one.
+//
+// The entry must be non-null, describe a non-empty range with
+// 0 < begin < end < max(ptrdiff_t), and must not overlap any range that is
+// already tracked.  When any of these conditions fails, or when the
+// tracking structure is empty, nothing is inserted and
+// throw_runtime_exception is called with a description of the problem.
+// The entry is not deleted on failure.
+void MemoryTracking::insert( MemoryTrackingEntry * entry )
+{
+  const ptrdiff_t max = std::numeric_limits<ptrdiff_t>::max();
+
+  const bool ok_exists = ! m_tracking_end.empty();
+
+  const bool ok_range = entry &&
+                        0 < entry->begin &&
+                            entry->begin < entry->end &&
+                                           entry->end < max ;
+
+  int  i        = -1 ;
+  bool inserted = false ;
+
+  if ( ok_exists && ok_range ) {
+
+    i = upper_bound( & m_tracking_end[0] , m_tracking_end.size() , entry->begin );
+
+    // Guaranteed:
+    //   a) entry->begin < m_tracking_end[i]
+    //   b) i == 0 || m_tracking_end[i-1] <= entry->begin
+
+    if ( entry->end <= m_tracking[i]->begin ) {
+
+      // Non-overlapping range:
+      // m_tracking[i-1].end <= entry->begin < entry->end <= m_tracking[i].begin
+
+      entry->m_count = 1 ;
+
+      m_tracking.insert(     m_tracking.begin() + i , entry );
+      m_tracking_end.insert( m_tracking_end.begin() + i , entry->end );
+
+      inserted = true ;
+    }
+  }
+
+  // Every path that did not insert is an error, including an overlap with
+  // an existing range (where 'i' is a valid index rather than -1).
+  if ( ! inserted ) {
+    std::ostringstream msg ;
+    msg << "MemoryTracking(" << m_space << ")::insert( " ;
+    if ( entry ) {
+      entry->print( msg );
+    }
+    else {
+      msg << "NULL" ;
+    }
+    msg << " ) ERROR: " ;
+
+    if ( ! ok_range ) {
+      msg << "Invalid memory range" ;
+    }
+    else if ( ! ok_exists ) {
+      // No sentinel: there is no existing entry to index.
+      msg << "Tracking data structure does not exist" ;
+    }
+    else {
+      msg << "Overlapping memory range with " ;
+      m_tracking[i]->print( msg );
+    }
+    msg << " )" ;
+    throw_runtime_exception( msg.str() );
+  }
+}
+
+// Increment the reference count of the tracked range containing 'ptr'.
+// A null pointer is ignored.  When 'ptr' is not inside any tracked range
+// (or the structure is empty) throw_runtime_exception is called.
+void MemoryTracking::increment( const void * ptr )
+{
+  if ( ! ptr ) {
+    return ;
+  }
+
+  const ptrdiff_t address = reinterpret_cast<ptrdiff_t>( ptr );
+
+  MemoryTrackingEntry * tracked = 0 ;
+
+  if ( ! m_tracking_end.empty() ) {
+
+    // First entry whose end lies beyond the address; it contains the
+    // address only when its begin does not lie beyond it as well.
+    const int slot = upper_bound( & m_tracking_end[0] , m_tracking_end.size() , address );
+
+    if ( ! ( address < m_tracking[slot]->begin ) ) {
+      tracked = m_tracking[slot] ;
+    }
+  }
+
+  if ( tracked ) {
+    ++( tracked->m_count );
+    return ;
+  }
+
+  std::ostringstream msg ;
+  msg << "MemoryTracking(" << m_space
+      << ")::increment( " << address << " ) ERROR: Not being tracked" ;
+  throw_runtime_exception( msg.str() );
+}
+
+// Decrement the reference count of the tracked range containing 'ptr'.
+// When the count reaches zero the entry is deleted and removed from the
+// tracking arrays.  A null pointer is ignored.  An untracked pointer is
+// reported on std::cerr; no exception is thrown for it.
+void MemoryTracking::decrement( const void * ptr )
+{
+  if ( ! ptr ) {
+    return ;
+  }
+
+  const ptrdiff_t address = reinterpret_cast<ptrdiff_t>( ptr );
+
+  bool tracked = false ;
+
+  if ( ! m_tracking_end.empty() ) {
+
+    const int slot = upper_bound( & m_tracking_end[0] , m_tracking_end.size() , address );
+
+    MemoryTrackingEntry * const entry = m_tracking[slot] ;
+
+    tracked = ! ( address < entry->begin );
+
+    if ( tracked ) {
+      --( entry->m_count );
+
+      if ( 0 == entry->m_count ) {
+        // Last reference released: destroy the entry and forget it.
+        delete entry ;
+
+        m_tracking.erase(     m_tracking.begin() + slot );
+        m_tracking_end.erase( m_tracking_end.begin() + slot );
+      }
+    }
+  }
+
+  if ( ! tracked ) {
+    std::ostringstream msg ;
+    msg << "MemoryTracking(" << m_space
+        << ")::decrement( " << address << " ) ERROR: Not being tracked"
+        << std::endl ;
+    std::cerr << msg.str();
+  }
+}
+
+// Return the tracked entry whose range contains 'ptr', or null when 'ptr'
+// is null, the structure is empty, or no tracked range contains it.
+MemoryTrackingEntry *
+MemoryTracking::query( const void * ptr ) const
+{
+  if ( ! ptr || m_tracking_end.empty() ) {
+    return 0 ;
+  }
+
+  const ptrdiff_t address = reinterpret_cast<ptrdiff_t>( ptr );
+
+  const int slot = upper_bound( & m_tracking_end[0] , m_tracking_end.size() , address );
+
+  MemoryTrackingEntry * const candidate = m_tracking[slot] ;
+
+  if ( candidate->begin <= address ) {
+    return candidate ;
+  }
+
+  return 0 ;
+}
+
+// Print every tracked entry except the last one (the sentinel), one per
+// line, each line prefixed with 'lead'.
+void MemoryTracking::print( std::ostream & s , const std::string & lead ) const
+{
+  if ( m_tracking.empty() ) {
+    return ;
+  }
+
+  // The final element is the sentinel and is not printed.
+  const size_t printable = m_tracking.size() - 1 ;
+
+  for ( size_t k = 0 ; k < printable ; ++k ) {
+    s << lead ;
+    m_tracking[k]->print( s );
+    s << std::endl ;
+  }
+}
+
+// Out-of-line destructor; the body is empty, so it performs no explicit
+// release of its own.
+MemoryTrackingEntry::~MemoryTrackingEntry()
+{}
+
+// Write a one-line description of this entry to 's': its label, the name
+// of its type, its address range printed as pointers, and its reference
+// count.
+void MemoryTrackingEntry::print( std::ostream & s ) const
+{
+  s << "{ " ;
+  s << "label("  << label << ") " ;
+  s << "typeid(" << type.name() << ") " ;
+  s << "range[ " << ((void*)begin) << " : " << ((void*)end) << " ) " ;
+  s << "count("  << m_count << ") }" ;
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+
--- a/lib/kokkos/core/src/impl/Kokkos_MemoryTracking.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_MemoryTracking.hpp
@ -0,0 +1,151 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MEMORY_TRACKING_HPP
+#define KOKKOS_MEMORY_TRACKING_HPP
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+#include <string>
+#include <typeinfo>
+#include <iosfwd>
+
+namespace Kokkos {
+namespace Impl {
+
+class MemoryTracking ;
+
+/** \brief  Record describing one tracked memory range.
+ *
+ *  Holds a label, the std::type_info supplied at construction, the
+ *  address range stored as integers, and a count that is private to
+ *  this class and its friend MemoryTracking.
+ *  The constructor is protected and the default constructor, copy
+ *  constructor and assignment operator are declared private.
+ */
+class MemoryTrackingEntry {
+public:
+  const std::string      label ;  ///< Label supplied at construction
+  const std::type_info & type ;   ///< Type information supplied at construction
+  const ptrdiff_t        begin ;  ///< Address of 'arg_begin' as an integer
+  const ptrdiff_t        end ;    ///< Address of 'arg_begin + arg_bytes' as an integer
+private:
+  unsigned m_count ;              ///< Tracking count, initialized to zero
+protected:
+
+  /** \brief  Describe the range of 'arg_bytes' bytes starting at 'arg_begin'.
+   *          'end' is computed with byte-wise pointer arithmetic.
+   */
+  MemoryTrackingEntry( const std::string    & arg_label ,
+                       const std::type_info & arg_type ,
+                       const void * const     arg_begin ,
+                       const ptrdiff_t        arg_bytes )
+    : label( arg_label )
+    , type(  arg_type )
+    , begin( reinterpret_cast<ptrdiff_t>( arg_begin ) )
+    , end(   reinterpret_cast<ptrdiff_t>(
+               reinterpret_cast<const unsigned char *>( arg_begin ) + arg_bytes ) )
+    , m_count( 0 )
+    {}
+
+public:
+
+  /** \brief  Current tracking count. */
+  unsigned count() const { return m_count ; }
+
+  /** \brief  Write a description of this entry to the stream. */
+  virtual void print( std::ostream & ) const ;
+
+  virtual ~MemoryTrackingEntry();
+
+private:
+
+  // Declared private to forbid default construction and copying.
+  MemoryTrackingEntry();
+  MemoryTrackingEntry( const MemoryTrackingEntry & rhs );
+  MemoryTrackingEntry & operator = ( const MemoryTrackingEntry & rhs );
+
+  // MemoryTracking needs access to m_count.
+  friend class MemoryTracking ;
+};
+
+
+/** \brief  Table of tracked memory ranges for one named memory space.
+ *
+ *  Entries are held by pointer in 'm_tracking'; 'm_tracking_end' is a
+ *  parallel vector of integer addresses used for lookup.
+ *  The default constructor, copy constructor and assignment operator
+ *  are declared private.
+ */
+class MemoryTracking {
+public:
+
+  /** \brief  Track a memory range defined by the entry.
+   *          This entry must be allocated via 'new'.
+   */
+  void insert( MemoryTrackingEntry * entry );
+
+  /** \brief  Decrement the tracked memory range.
+   *          If the count is zero then the entry is deleted
+   *          via the 'delete' operator.
+   */
+  void decrement( const void * ptr );
+
+  /** \brief  Increment the tracking count.  */
+  void increment( const void * ptr );
+
+  /** \brief  Query a tracked memory range.
+   *          Returns null when no entry matches.
+   */
+  MemoryTrackingEntry * query( const void * ptr ) const ;
+
+  /** \brief  Call the 'print' method on all entries. */
+  void print( std::ostream & , const std::string & lead ) const ;
+
+  /** \brief  Number of stored entries.
+   *          NOTE(review): print() skips the last entry as a sentinel,
+   *          so this count presumably includes that sentinel -- confirm.
+   */
+  size_t size() const { return m_tracking.size(); }
+
+  /** \brief  Access the i-th stored entry (no bounds checking).
+   *
+   *  The elements of 'm_tracking' are MemoryTrackingEntry pointers, so
+   *  the result is a MemoryTrackingEntry reference.  The previous
+   *  declaration returned 'MemoryTracking &', which could not compile
+   *  once the template was instantiated.
+   */
+  template< typename iType >
+  MemoryTrackingEntry & operator[]( const iType & i ) const
+    { return *m_tracking[i]; }
+
+  /** \brief Construct with a name for error messages */
+  explicit MemoryTracking( const std::string & space );
+
+  /** \brief  Print memory leak warning for all entries. */
+  ~MemoryTracking();
+
+  /** \brief Query if constructed: true when 'm_tracking_end' is not empty. */
+  bool exists() const { return ! m_tracking_end.empty(); }
+
+private:
+  // Declared private to forbid default construction and copying.
+  MemoryTracking();
+  MemoryTracking( const MemoryTracking & );
+  MemoryTracking & operator = ( const MemoryTracking & );
+
+  std::string                        m_space ;         ///< Name used in error messages
+  std::vector<MemoryTrackingEntry*>  m_tracking ;      ///< Entries, held by pointer
+  std::vector<ptrdiff_t>             m_tracking_end ;  ///< Integer addresses searched during lookup
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif
+
--- a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@ -0,0 +1,72 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE )
+#define KOKKOS_MEMORY_FENCE
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+/** \brief  Issue a memory fence using the primitive that matches the
+ *          configured atomics implementation.
+ *
+ *  The first KOKKOS_ATOMICS_USE_* macro that is defined, in the order
+ *  tested below, selects the fence.  If none is defined, compilation
+ *  stops with an #error.
+ */
+KOKKOS_FORCEINLINE_FUNCTION
+void memory_fence()
+{
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+  // CUDA memory fence function
+  __threadfence();
+#elif defined( KOKKOS_ATOMICS_USE_GCC )
+  // GCC __sync builtin
+  __sync_synchronize();
+#elif defined( KOKKOS_ATOMICS_USE_INTEL )
+  // Intel intrinsic
+  _mm_mfence();
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+  // OpenMP flush directive
+  #pragma omp flush
+
+#else
+ #error "Error: memory_fence() not defined"
+#endif
+}
+
+} // namespace kokkos
+
+#endif
+
+
--- a/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
@ -0,0 +1,84 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP
+#define KOKKOS_PHYSICAL_LAYOUT_HPP
+
+
+#include <Kokkos_View.hpp>
+namespace Kokkos {
+namespace Impl {
+
+
+
+/** \brief  Run-time description of a View's layout: layout kind, rank
+ *          and per-dimension strides.
+ *
+ *  Each constructor classifies the view's array_layout as Left or Right
+ *  (anything else becomes Error), copies the view's Rank, zeroes all
+ *  eight stride slots and then calls view.stride( stride ).
+ */
+struct PhysicalLayout {
+  enum LayoutType {Left,Right,Scalar,Error};
+  LayoutType layout_type;
+  int rank;
+  long long int stride[8]; //distance between two neighboring elements in a given dimension
+
+  /// Build the description from a ViewDefault view.
+  template< class T , class L , class D , class M >
+  PhysicalLayout( const View<T,L,D,M,ViewDefault> & view )
+    : layout_type( ! is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value
+                   ? ( is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error )
+                   : Left )
+    , rank( view.Rank )
+    {
+      // Zero every slot before handing the array to the view.
+      for ( int dim = 0 ; dim != 8 ; ++dim ) { stride[dim] = 0 ; }
+      view.stride( stride );
+    }
+  #ifdef KOKKOS_HAVE_CUDA
+  /// Build the description from a ViewCudaTexture view.
+  template< class T , class L , class D , class M >
+  PhysicalLayout( const View<T,L,D,M,ViewCudaTexture> & view )
+    : layout_type( ! is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value
+                   ? ( is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error )
+                   : Left )
+    , rank( view.Rank )
+    {
+      // Zero every slot before handing the array to the view.
+      for ( int dim = 0 ; dim != 8 ; ++dim ) { stride[dim] = 0 ; }
+      view.stride( stride );
+    }
+  #endif
+};
+
+}
+}
+#endif
--- a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
@ -0,0 +1,123 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdlib.h>
+#include <sstream>
+#include <Kokkos_Serial.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+/** Owner of the scratch buffer used by this file.
+ *
+ *  'm_scratch' is a malloc'ed buffer (or null); 'm_reduce_end' and
+ *  'm_shared_end' are sizes recorded by Serial::scratch_memory_resize.
+ *  The destructor frees the buffer and zeroes all members.
+ */
+struct Sentinel {
+
+  void *   m_scratch ;
+  unsigned m_reduce_end ;
+  unsigned m_shared_end ;
+
+  Sentinel()
+    : m_scratch( 0 )
+    , m_reduce_end( 0 )
+    , m_shared_end( 0 )
+    {}
+
+  ~Sentinel()
+    {
+      if ( 0 != m_scratch ) {
+        free( m_scratch );
+        m_scratch = 0 ;
+      }
+      m_shared_end = 0 ;
+      m_reduce_end = 0 ;
+    }
+
+  /// Access the single shared instance.
+  static Sentinel & singleton();
+};
+
+// Return the one function-local static Sentinel, constructed on first use.
+Sentinel & Sentinel::singleton()
+{
+  static Sentinel instance ;
+  return instance ;
+}
+
+// Round 'n' up to the next multiple of 256 (unchanged if already a multiple).
+inline
+unsigned align( unsigned n )
+{
+  const unsigned alignment = 0x0100 ; /* 256 */
+  const unsigned low_bits  = alignment - 1 ;
+  return ( n + low_bits ) & ~low_bits ;
+}
+
+} // namespace
+
+// Construct a team member whose 'm_space' starts 'm_reduce_end' bytes into
+// the Sentinel's scratch buffer and is given 'arg_shared_size' as its size.
+SerialTeamMember::SerialTeamMember( int arg_league_rank
+                                  , int arg_league_size
+                                  , int arg_shared_size
+                                  )
+  : m_space( static_cast<char *>( Sentinel::singleton().m_scratch )
+               + Sentinel::singleton().m_reduce_end
+           , arg_shared_size )
+  , m_league_rank( arg_league_rank )
+  , m_league_size( arg_league_size )
+{}
+
+} // namespace Impl
+
+/** Ensure the shared scratch buffer is large enough and return it.
+ *
+ *  Both sizes are rounded up by Impl::align().  The buffer holds the
+ *  reduce region followed by the shared region, and the recorded sizes
+ *  only ever grow.  When growth is needed the old buffer is freed and a
+ *  new one is allocated with malloc; the old contents are not copied.
+ *
+ *  Returns the buffer, which is null if nothing has been allocated yet
+ *  or if malloc failed.  On malloc failure the recorded sizes are reset
+ *  to zero so they never describe a buffer that does not exist and a
+ *  later call attempts the allocation again.
+ */
+void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
+{
+  static Impl::Sentinel & s = Impl::Sentinel::singleton();
+
+  reduce_size = Impl::align( reduce_size );
+  shared_size = Impl::align( shared_size );
+
+  if ( ( s.m_reduce_end < reduce_size ) ||
+       ( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
+
+    if ( s.m_scratch ) { free( s.m_scratch ); }
+
+    if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
+    if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
+
+    s.m_scratch = malloc( s.m_shared_end );
+
+    if ( 0 == s.m_scratch ) {
+      // Allocation failed: keep the bookkeeping consistent with "no buffer".
+      s.m_reduce_end = 0 ;
+      s.m_shared_end = 0 ;
+    }
+  }
+
+  return s.m_scratch ;
+}
+
+} // namespace Kokkos
+
--- a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
@ -0,0 +1,213 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#include <stdlib.h>
+#include <stdexcept>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include <impl/Kokkos_Serial_TaskPolicy.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Short names used throughout this file.
+typedef TaskMember<  Kokkos::Serial > Task ;
+typedef TaskManager< Kokkos::Serial > Mgr ;
+
+// Namespace-scope task manager instance; TaskPolicy< Kokkos::Serial >
+// binds its m_task_manager reference to it.
+Mgr s_task_manager ;
+
+// Start with an empty ready list.  'm_denied' is a sentinel pointer value
+// (all bits of an unsigned long set) that is never dereferenced; wait()
+// stores it in a task's m_wait and schedule() compares against it.
+Mgr::TaskManager()
+  : m_ready(0)
+  , m_denied( reinterpret_cast<Task*>( ~((unsigned long)0) ) )
+{}
+
+/** Reference-counted pointer assignment:  *lhs = rhs.
+ *
+ *  Releases the reference held by '*lhs' (deallocating the task through
+ *  its m_dealloc function when the count reaches zero), then acquires a
+ *  reference on 'rhs' and stores it.
+ *
+ *  Throws std::runtime_error when the task about to be deallocated is
+ *  not STATE_COMPLETE, still has dependences, or when the reference
+ *  count becomes negative.
+ */
+void Mgr::assign( Task ** const lhs , Task * const rhs )
+{
+  // Self-assignment leaves the count unchanged.  Without this guard a task
+  // whose count is one would be deallocated below and its count then
+  // incremented through the dangling 'rhs' pointer.
+  if ( *lhs == rhs ) { return ; }
+
+  if ( *lhs ) {
+    const int count = --((**lhs).m_ref_count);
+
+    if ( 0 == count ) {
+      // Reference count at zero, delete it
+
+      // Should only be deallocating a completed task
+      if ( (**lhs).m_state != Task::STATE_COMPLETE ) {
+        throw std::runtime_error(
+          std::string("Kokkos::Impl::TaskManager<Kokkos::Serial>::decrement ERROR: not STATE_COMPLETE") );
+      }
+
+      // A completed task should not have dependences...
+      for ( int i = 0 ; i < MAX_DEPENDENCE ; ++i ) {
+        if ( (**lhs).m_dep[i] ) {
+          throw std::runtime_error(
+            std::string("Kokkos::Impl::TaskManager<Kokkos::Serial>::decrement ERROR: STATE_COMPLETE has dependences") );
+        }
+      }
+
+      // Get deletion function and apply it
+      const Task::function_type d = (**lhs).m_dealloc ;
+
+      (*d)( *lhs );
+    }
+    else if ( count < 0 ) {
+      // The count was already zero or negative before this release.
+      throw std::runtime_error(std::string("Kokkos::Impl::TaskManager<Kokkos::Serial>::assign ERROR: reference counting") );
+    }
+  }
+
+  if ( rhs ) { ++( rhs->m_ref_count ); }
+
+  *lhs = rhs ;
+}
+
+/** Check that dependences may be set on 't' and that 'n' fits.
+ *
+ *  Throws std::runtime_error unless 't' is in STATE_CONSTRUCTING or
+ *  STATE_EXECUTING, and unless 'n' is less than MAX_DEPENDENCE.
+ */
+void Mgr::verify_set_dependence( Task * t , int n )
+{
+  // Allowed while constructing (original spawn) or executing (respawn).
+  const bool state_ok = ( Task::STATE_CONSTRUCTING == t->m_state ) ||
+                        ( Task::STATE_EXECUTING    == t->m_state );
+
+  if ( ! state_ok ) {
+    throw std::runtime_error(std::string("Kokkos::Impl::TaskManager<Kokkos::Serial> spawn or respawn state error"));
+  }
+
+  if ( ! ( n < MAX_DEPENDENCE ) ) {
+    throw std::runtime_error(std::string("Kokkos::Impl::TaskManager<Kokkos::Serial> spawn or respawn dependence count error"));
+  }
+}
+
+/** Put task 't' into STATE_WAITING and queue it.
+ *
+ *  The task is linked onto the wait list of the first dependence whose
+ *  m_wait is not m_denied, or onto the ready list when there is no such
+ *  dependence.  Throws std::runtime_error if 't->m_next' is not null.
+ */
+void Mgr::schedule( Task * t )
+{
+  // Must not be in a dependence linked list:  0 == t->m_next
+
+  if ( 0 != t->m_next ) {
+    throw std::runtime_error(std::string("Kokkos::Impl::Task spawn or respawn state error"));
+  }
+
+  // Is waiting for execution
+
+  t->m_state = Task::STATE_WAITING ;
+
+  // Insert this task into another dependence that is not complete.
+  // The condition both copies y->m_wait into t->m_next and tests it:
+  // wait() sets m_wait to m_denied once a task has completed.
+
+  int i = 0 ;
+  for ( ; i < MAX_DEPENDENCE ; ++i ) {
+    Task * const y = t->m_dep[i] ;
+    if ( y && m_denied != ( t->m_next = y->m_wait ) ) {
+      y->m_wait = t ; // CAS( & y->m_wait , m_next , this );
+      break ;
+    }
+  }
+  if ( i == MAX_DEPENDENCE ) {
+    // All dependences are complete, insert into the ready list.
+    // This also overwrites any m_denied value the loop left in t->m_next.
+    t->m_next = m_ready ;
+    m_ready = t ; // CAS( & s_ready , m_next = s_ready , this );
+  }
+}
+
+/** Execute tasks until the ready list is empty.
+ *
+ *  The task argument is unnamed and unused: every ready task is run,
+ *  not only the one passed in.  After a task's apply function returns
+ *  with the state still STATE_EXECUTING, the task is marked complete,
+ *  its dependences are released, and the tasks waiting on it are
+ *  rescheduled.
+ */
+void Mgr::wait( Task * )
+{
+  while ( m_ready ) {
+
+    // Remove this task from the ready list
+
+    // Task * task ;
+    // while ( ! CAS( & s_ready , task = s_ready , s_ready->m_next ) );
+
+    Task * const task = m_ready ;
+    m_ready = task->m_next ;
+
+    task->m_next = 0 ;
+
+    // precondition: task->m_state = STATE_WAITING
+    // precondition: task->m_dep[i]->m_state == STATE_COMPLETE  for all i
+    // precondition: does not exist T such that T->m_wait = task
+    // precondition: does not exist T such that T->m_next = task
+
+    task->m_state = Task::STATE_EXECUTING ;
+
+    (*task->m_apply)( task );
+
+    // A state other than STATE_EXECUTING here means the state was changed
+    // during apply; completion handling is skipped in that case.
+    if ( task->m_state == Task::STATE_EXECUTING ) {
+      // task did not respawn itself
+      task->m_state = Task::STATE_COMPLETE ;
+
+      // release dependences:
+      for ( int i = 0 ; i < MAX_DEPENDENCE ; ++i ) {
+        assign( & task->m_dep[i] , 0 );
+      }
+
+      // Stop other tasks from adding themselves to 'task->m_wait' ;
+
+      Task * x ;
+      // CAS( & task->m_wait , x = task->m_wait , s_denied );
+      x = task->m_wait ; task->m_wait = (Task*) m_denied ;
+
+      // update tasks waiting on this task
+      while ( x ) {
+        // Save the successor first: schedule() requires x->m_next to be null.
+        Task * const next = x->m_next ;
+
+        x->m_next = 0 ;
+
+        schedule( x );
+
+        x = next ;
+      }
+    }
+  }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+namespace Kokkos {
+
+// Default construction binds the policy to the namespace-scope
+// Impl::s_task_manager instance defined earlier in this file.
+TaskPolicy< Kokkos::Serial >::TaskPolicy()
+  : m_task_manager( Impl::s_task_manager )
+{}
+
+} // namespace Kokkos
+
--- a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp
@ -0,0 +1,648 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_SERIAL_TASKPOLICY_HPP
+#define KOKKOS_SERIAL_TASKPOLICY_HPP
+
+#include <string>
+#include <typeinfo>
+#include <stdexcept>
+
+#include <Kokkos_Serial.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_View.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Root task type for the Serial execution space.
+ *
+ *  Stores the task state, a reference count, the links used by the
+ *  task manager's lists, and a fixed-size array of dependences.
+ *  The private members are manipulated by the friend TaskManager.
+ */
+template<>
+class TaskMember< Kokkos::Serial , void , void >
+{
+public:
+
+  friend class TaskManager< Kokkos::Serial > ;
+
+  /**\brief  Capacity of the fixed-size 'm_dep' array. */
+  enum { MAX_DEPENDENCE = 13 };
+
+  /**\brief  States of a task */
+  enum { STATE_CONSTRUCTING = 0 , STATE_WAITING = 1 , STATE_EXECUTING = 2 , STATE_COMPLETE = 4 };
+
+  /**\brief  Signature shared by the deallocation and apply functions:
+   *         each receives a pointer to the root task.
+   */
+
+  typedef void (* function_type)( TaskMember * );
+
+  const std::type_info & m_typeid ;   ///< Type information passed to the constructor
+  const function_type    m_dealloc ;  ///< Function that deallocates the task
+  const function_type    m_apply ;    ///< Function that executes the task
+
+private:
+
+  int            m_state ;     ///< One of the STATE_* values
+  int            m_ref_count ; ///< Reference count
+  TaskMember   * m_wait ;      ///< Linked list of tasks waiting on this task.
+  TaskMember   * m_next ;      ///< This task is a member of a linked list of
+                               ///< tasks waiting on another task.
+  TaskMember   * m_dep[ MAX_DEPENDENCE ]; ///< Dependences of this task
+
+  // Declared private to forbid copying.
+  TaskMember( const TaskMember & );
+  TaskMember & operator = ( const TaskMember & );
+
+protected :
+
+  /**\brief  Construct in STATE_CONSTRUCTING with a zero reference count,
+   *         null list links and all dependences null.
+   *         'arg_type' defaults to typeid(void).
+   */
+  inline
+  TaskMember( const function_type    arg_dealloc
+            , const function_type    arg_apply
+            , const std::type_info & arg_type = typeid(void)
+            )
+    : m_typeid(  arg_type )
+    , m_dealloc( arg_dealloc )
+    , m_apply(   arg_apply )
+    , m_state( STATE_CONSTRUCTING )
+    , m_ref_count(0)
+    , m_wait(0)
+    , m_next(0)
+    { for ( int i = 0 ; i < MAX_DEPENDENCE ; ++i ) m_dep[i] = 0 ; }
+
+public:
+
+  /**\brief  Delete 't' as its most-derived type 'DerivedTaskMember'.
+   *         Instantiations of this function are used as 'm_dealloc' values.
+   */
+  template < class DerivedTaskMember >
+  static
+  void deallocate( TaskMember * t )
+    { delete static_cast< DerivedTaskMember * >( t ); }
+
+  /**\brief  For the root type there is nothing to verify; returns 't'. */
+  inline static
+  TaskMember * verify_type( TaskMember * t ) { return t ; }
+
+  // get() on a task without a result type returns this error type,
+  // which is declared elsewhere.
+  typedef FutureValueTypeIsVoidError get_result_type ;
+
+  get_result_type get() const { return get_result_type() ; }
+
+  /**\brief  The i-th dependence, or null unless the task is in
+   *         STATE_EXECUTING and 0 <= i < MAX_DEPENDENCE.
+   */
+  inline
+  TaskMember * get_dependence( int i ) const
+    { return ( STATE_EXECUTING == m_state && 0 <= i && i < MAX_DEPENDENCE ) ? m_dep[i] : (TaskMember*) 0 ; }
+
+  /**\brief  Number of leading non-null dependences while in
+   *         STATE_EXECUTING; zero in any other state.
+   */
+  inline
+  int get_dependence() const
+    {
+      int i = 0 ;
+      if ( STATE_EXECUTING == m_state ) { for ( ; i < MAX_DEPENDENCE && m_dep[i] != 0 ; ++i ); }
+      return i ;
+    }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Task manager for the Serial execution space.
+ *
+ *  Keeps a linked list of ready tasks ('m_ready') and the sentinel
+ *  pointer value 'm_denied'.  The non-template member functions are
+ *  declared here and defined out of line.
+ */
+template<>
+class TaskManager< Kokkos::Serial >
+{
+public:
+
+  typedef TaskMember< Kokkos::Serial > task_root_type ;
+
+  enum { MAX_DEPENDENCE = task_root_type::MAX_DEPENDENCE };
+
+  /** \brief  Queue a task for execution. */
+  void schedule( task_root_type * );
+
+  /** \brief  Check the task and dependence count before dependences are set. */
+  static void verify_set_dependence( task_root_type * , int );
+
+  /** \brief  Assign the second argument to the pointer addressed by the first. */
+  static void assign( task_root_type ** const , task_root_type * );
+
+  /** \brief  Wait on a task. */
+  void wait( task_root_type * );
+
+  TaskManager();
+  TaskManager( const TaskManager & );
+  TaskManager & operator = ( const TaskManager & );
+
+
+  /** \brief  Wait on the task held by 'future'. */
+  template< class A1 , class A2 >
+  void wait( const Future<A1,A2> & future ) { wait( future.m_task ); }
+
+  /** \brief  Set the dependences of task 't' from the first 'n' futures
+   *          in 'dep' and clear the remaining slots.
+   *
+   *  The enable_if on the type of 'n' restricts this overload to futures
+   *  whose execution_space is Kokkos::Serial.
+   */
+  template< class A1 , class A2 >
+  void set_dependence( task_root_type * t
+                     , const Future<A1,A2> * const dep
+                     , typename Impl::enable_if
+                        < Impl::is_same< typename Future<A1,A2>::execution_space , Kokkos::Serial >::value
+                        , const int >::type n
+                      )
+    {
+      verify_set_dependence( t , n );
+      int i = 0 ;
+      // Slots [0,n) take the tasks of the given futures ...
+      for ( ; i < n ; ++i ) assign( & t->m_dep[i] , dep[i].m_task );
+      // ... and slots [n,MAX_DEPENDENCE) are assigned null.
+      for ( ; i < MAX_DEPENDENCE ; ++i ) assign( & t->m_dep[i] , 0 );
+    }
+
+private:
+
+  task_root_type * m_ready ;         ///< Head of the linked list of ready tasks
+  task_root_type * const m_denied ;  ///< Sentinel pointer value set by the constructor
+
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Serial task that carries a result of type 'ResultType'.
+ *
+ *  Extends the root task with a value-initialized 'm_result' member and
+ *  registers typeid(ResultType) with the root so verify_type() can
+ *  check a root pointer before casting it.
+ */
+template < class ResultType >
+class TaskMember< Kokkos::Serial , ResultType , void > : public TaskMember< Kokkos::Serial >
+{
+protected:
+
+  typedef TaskMember< Kokkos::Serial >::function_type  function_type ;
+
+  /// Forward the two functions to the root together with typeid(ResultType).
+  inline
+  TaskMember( const function_type    arg_dealloc
+            , const function_type    arg_apply
+            )
+    : TaskMember< Kokkos::Serial >( arg_dealloc , arg_apply , typeid(ResultType) )
+    , m_result()
+    {}
+
+public:
+
+  ResultType  m_result ;
+
+  /// Cast a root pointer to this type.  A null pointer passes through;
+  /// a non-null task with a different recorded type throws std::runtime_error.
+  inline static
+  TaskMember *
+  verify_type( TaskMember< Kokkos::Serial > * t )
+    {
+      const bool wrong_type = ( 0 != t ) && ( t->m_typeid != typeid(ResultType) );
+
+      if ( wrong_type ) {
+        throw std::runtime_error( std::string("Kokkos::Future bad cast for result type"));
+      }
+
+      return static_cast< TaskMember *>( t );
+    }
+
+  typedef const ResultType & get_result_type ;
+
+  /// Read-only access to the stored result.
+  inline
+  get_result_type get() const { return m_result ; }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Serial task that combines a result of type 'ResultType' with
+ *          a copy of the user's functor.
+ *
+ *  Derives from both the result-carrying task and 'FunctorType', so the
+ *  functor's apply( result ) can be called on the task object itself.
+ */
+template< class ResultType , class FunctorType >
+class TaskMember< Kokkos::Serial , ResultType , FunctorType >
+  : public TaskMember< Kokkos::Serial , ResultType >
+  , public FunctorType
+{
+private:
+
+  typedef TaskMember< Kokkos::Serial >              member_root_type ;
+  typedef TaskMember< Kokkos::Serial , ResultType > member_base_type ;
+
+  /** \brief  Function stored as 'm_apply': downcast the root pointer to
+   *          this type and call the functor's apply() with the task's
+   *          result member.
+   */
+  static
+  void apply( member_root_type * t )
+    {
+      member_base_type * m = static_cast< member_base_type * >(t);
+      static_cast< TaskMember * >(m)->FunctorType::apply( m->m_result );
+    }
+
+protected:
+
+  /** \brief  Construct with explicitly supplied deallocation and apply
+   *          functions.
+   */
+  inline 
+  TaskMember( const typename member_root_type::function_type  arg_dealloc
+            , const typename member_root_type::function_type  arg_apply
+            , const FunctorType &  arg_functor
+            )
+    : member_base_type( arg_dealloc , arg_apply )
+    , FunctorType( arg_functor )
+    {}
+
+public:
+
+  /** \brief  Construct from a functor, using this type's own deallocate
+   *          instantiation and apply function.
+   */
+  inline 
+  TaskMember( const FunctorType &  arg_functor )
+    : member_base_type( & member_root_type::template deallocate< TaskMember >
+                      , & TaskMember::apply )
+    , FunctorType( arg_functor )
+    {}
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Serial task without a result value that holds a copy of the
+ *          user's functor.
+ *
+ *  Derives from both the root task and 'FunctorType'; the functor's
+ *  apply() is called with no arguments.
+ */
+template< class FunctorType >
+class TaskMember< Kokkos::Serial , void , FunctorType >
+  : public TaskMember< Kokkos::Serial >
+  , public FunctorType
+{
+private:
+
+  typedef TaskMember< Kokkos::Serial > member_root_type ;
+
+  /** \brief  Function stored as 'm_apply': downcast the root pointer to
+   *          this type and call the functor's apply().
+   */
+  static
+  void apply( member_root_type * t )
+    { static_cast< TaskMember * >(t)->FunctorType::apply(); }
+
+protected:
+
+  /** \brief  Construct with explicitly supplied deallocation and apply
+   *          functions.
+   */
+  inline 
+  TaskMember( const typename member_root_type::function_type  arg_dealloc
+            , const typename member_root_type::function_type  arg_apply
+            , const FunctorType &  arg_functor
+            )
+    : member_root_type( arg_dealloc , arg_apply )
+    , FunctorType( arg_functor )
+    {}
+
+public:
+
+  /** \brief  Construct from a functor, using this type's own deallocate
+   *          instantiation and apply function.
+   */
+  inline 
+  TaskMember( const FunctorType &  arg_functor )
+    : member_root_type( & member_root_type::template deallocate< TaskMember >
+                      , & TaskMember::apply )
+    , FunctorType( arg_functor )
+    {}
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+/** \brief  Serial task policy that carries a fixed-capacity set of
+ *          dependences to attach to every task it spawns.
+ *
+ *  Objects are created from a task manager and an array of futures;
+ *  default construction and assignment are disabled.
+ */
+template<>
+class TaskPolicy< Impl::TaskDepends< Kokkos::Serial > >
+{
+public:
+
+  typedef Kokkos::Serial execution_space ;
+
+private:
+
+  enum { MAX_DEPENDENCE = Impl::TaskMember< execution_space >::MAX_DEPENDENCE };
+
+  Kokkos::Impl::TaskManager< execution_space >  & m_task_manager ;
+  Kokkos::Future< execution_space >               m_depends[ MAX_DEPENDENCE ];
+
+  TaskPolicy();
+  TaskPolicy & operator = ( const TaskPolicy & );
+
+public:
+
+  /** \brief  Capture the first 'n' futures of 'dep' as dependences.
+   *
+   *  \param manager  Task manager used to schedule spawned tasks.
+   *  \param n        Number of entries in 'dep'.
+   *  \param dep      Array of at least 'n' futures.
+   *
+   *  Only MAX_DEPENDENCE futures can be stored.  Entries beyond that
+   *  capacity are ignored instead of being written past the end of
+   *  m_depends; unused slots are reset to a default-constructed future.
+   */
+  template< typename A1 , typename A2 >
+  TaskPolicy( Kokkos::Impl::TaskManager< execution_space > & manager
+            , const size_t n
+            , const Future< A1 , A2 > * const dep )
+    : m_task_manager( manager )
+    {
+      // Unsigned index: avoids the signed/unsigned comparison against 'n'.
+      const size_t capacity = MAX_DEPENDENCE ;
+      const size_t count    = n < capacity ? n : capacity ;
+
+      size_t i = 0 ;
+      for ( ; i < count    ; ++i ) m_depends[i] = dep[i] ;
+      for ( ; i < capacity ; ++i ) m_depends[i] = Future< execution_space >();
+    }
+
+  // Spawn a serial task:
+  // allocate a task holding a copy of the functor, attach the stored
+  // dependences, and hand it to the task manager.
+  // 'ValueType' cannot be deduced and must be given explicitly by the caller.
+  template< class FunctorType , class ValueType >
+  Future< ValueType , execution_space >
+  spawn( const FunctorType & functor ) const
+    {
+      // Allocate a copy functor and insert into queue
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > member_type ;
+      member_type * m = new member_type( functor );
+      m_task_manager.set_dependence( m , m_depends );
+      m_task_manager.schedule( m );
+      return Future< ValueType , execution_space >( m );
+    }
+
+  // Construct a task policy for foreach-range tasks:
+  // spawn( task_policy.depends(N,d).foreach(RangePolicy) , functor );
+  // spawn( task_policy.foreach(RangePolicy) , functor );
+  // NOTE(review): this forwards m_depends as a third constructor argument;
+  // confirm the targeted TaskForEach policy provides such a constructor.
+  template< class ExecPolicy >
+  TaskPolicy< Impl::TaskForEach< ExecPolicy > >
+  foreach( const ExecPolicy & arg_policy )
+    { return TaskPolicy< Impl::TaskForEach< ExecPolicy > >( m_task_manager , arg_policy , m_depends ); }
+
+  // Construct a task policy for reduce-range tasks.
+  // The declared return type matches the TaskReduce policy that is
+  // actually constructed (it was previously declared as TaskForEach).
+  template< class ExecPolicy >
+  TaskPolicy< Impl::TaskReduce< ExecPolicy > >
+  reduce( const ExecPolicy & arg_policy )
+    { return TaskPolicy< Impl::TaskReduce< ExecPolicy > >( m_task_manager , arg_policy , m_depends ); }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Basic task policy for the Serial execution space.
+ *
+ *  Holds a reference to a task manager and uses it to spawn, respawn and
+ *  wait on tasks.  It is also the starting point for building policies
+ *  with dependences ( depends ), foreach-range tasks ( foreach ) and
+ *  reduce-range tasks ( reduce ).
+ */
+template<>
+class TaskPolicy< Kokkos::Serial >
+{
+public:
+
+  typedef Kokkos::Serial execution_space ;
+
+private:
+
+  typedef Impl::TaskMember< execution_space , void , void > task_base_type ;
+
+  Kokkos::Impl::TaskManager< execution_space > & m_task_manager ;
+
+  // Downcast a base task pointer to the task type that wraps 'FunctorType'
+  // and invoke the functor's apply().
+  template< class FunctorType >
+  static
+  void apply( task_base_type * t )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType >  member_type ;
+      static_cast< member_type * >(t)->FunctorType::apply();
+    }
+
+  // Assignment is declared private and not defined here (disabled).
+  TaskPolicy & operator = ( const TaskPolicy & );
+
+public:
+
+  // Default constructor is only declared here; its definition is not in this header chunk.
+  TaskPolicy();
+
+  // Copies share the same task manager reference.
+  TaskPolicy( const TaskPolicy & rhs )
+    : m_task_manager( rhs.m_task_manager ) {}
+
+  // Query dependence 'i' of the task that owns 'task_functor'.
+  // Requires:
+  // class DerivedMemberType : public TaskMember< execution_space , typename FunctorType::value_type , FunctorType > ...
+  // i.e. the functor pointer must address the functor base of a task object,
+  // because it is cast to the task type before the query.
+  template< class FunctorType >
+  Future< void , execution_space >
+  get_dependence( const FunctorType * task_functor , int i ) const
+    {
+      typedef const Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType >  member_type ;
+      return Future<void,execution_space>( static_cast< member_type * >(task_functor)->task_base_type::get_dependence(i) );
+    }
+
+  // Forward to the task's argument-free get_dependence();
+  // presumably the number of dependences -- confirm against TaskMember.
+  // The same requirement on 'task_functor' as above applies.
+  template< class FunctorType >
+  int get_dependence( const FunctorType * task_functor ) const
+    {
+      typedef const Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType >  member_type ;
+      return static_cast< member_type * >(task_functor)->task_base_type::get_dependence();
+    }
+
+  // Forward the wait on 'future' to the task manager.
+  template< class A1 , class A2 >
+  void wait( const Future<A1,A2> & future ) const { m_task_manager.wait( future ); }
+
+  // Give an existing task a new set of 'n' dependences and schedule it again.
+  // The enable_if on the type of 'n' removes this overload when the futures
+  // belong to a different execution space.
+  template< class FunctorType , class A1 , class A2 >
+  void respawn( FunctorType * task_functor
+              , const Future<A1,A2> * const dep
+              , typename Impl::enable_if
+                  < Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
+                  , const int
+                  >::type n
+              ) const
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType >  member_type ;
+      m_task_manager.set_dependence( static_cast< member_type * >( task_functor ) , dep , n );
+      m_task_manager.schedule( static_cast< member_type * >( task_functor ) );
+    }
+
+  // Allocate a copy functor and insert into queue
+  // Returns a future referring to the newly allocated task.
+  template< class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  spawn( const FunctorType & functor ) const
+    {
+      typedef typename FunctorType::value_type value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  member_type ;
+      // NOTE(review): the task is heap-allocated here and not deleted in this
+      // function; presumably released through the deallocate function the
+      // task registers with its root -- confirm.
+      member_type * m = new member_type( functor );
+      m_task_manager.schedule( m );
+      return Future< value_type , execution_space >( m );
+    }
+
+  // Construct a task policy with dependences:
+  // spawn( task_policy.depends(N,d) , functor );
+  // As with respawn, the enable_if restricts 'd' to futures of this
+  // execution space.
+  template< class A1 , class A2 >
+  TaskPolicy< Impl::TaskDepends< execution_space > >
+  depends( const Future< A1 , A2 > * const d
+         , typename Impl::enable_if<
+             ( Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
+             ), const int >::type n 
+         )
+    { return TaskPolicy< Impl::TaskDepends< execution_space > >( m_task_manager , n , d ); }
+
+  // Construct a task policy for foreach-range tasks:
+  // spawn( task_policy.depends(N,d).foreach(RangePolicy) , functor );
+  // spawn( task_policy.foreach(RangePolicy) , functor );
+  template< class ExecPolicy >
+  TaskPolicy< Impl::TaskForEach< ExecPolicy > >
+  foreach( const ExecPolicy & arg_policy )
+    { return TaskPolicy< Impl::TaskForEach< ExecPolicy > >( m_task_manager , arg_policy ); }
+
+  // Construct a task policy for reduce-range tasks:
+  template< class ExecPolicy >
+  TaskPolicy< Impl::TaskReduce< ExecPolicy > >
+  reduce( const ExecPolicy & arg_policy )
+    { return TaskPolicy< Impl::TaskReduce< ExecPolicy > >( m_task_manager , arg_policy ); }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Task policy that spawns "foreach" tasks over a Serial range.
+ *
+ *  A spawned task calls the functor once per index of the range policy
+ *  and then calls the functor's apply().
+ */
+template< typename IntType , unsigned P >
+class TaskPolicy< Impl::TaskForEach< Kokkos::RangePolicy< Kokkos::Serial , void , IntType , P >  >  >
+{
+public:
+
+  typedef Kokkos::Serial execution_space ;
+
+private:
+
+  typedef RangePolicy< execution_space , void , IntType , P > range_policy ;
+  typedef Impl::TaskManager< execution_space >  task_manager ;
+  typedef Impl::TaskMember<  execution_space >  task_root_type ;
+
+  task_manager & m_task_manager ;
+  range_policy   m_range_policy ;
+
+  // Task type: a void-valued functor task that also stores the range.
+  template< class FunctorType >
+  class member_type : public Impl::TaskMember< Kokkos::Serial , void , FunctorType >
+  {
+  private:
+
+    typedef Impl::TaskMember< Kokkos::Serial , void , FunctorType >    task_base_type ;
+
+    range_policy  m_policy ;
+
+    // Entry point registered with the root task by the constructor below.
+    static
+    void apply( task_root_type * t )
+      {
+        // Walk down the hierarchy: root task -> functor task -> this type.
+        task_base_type * const base    = static_cast< task_base_type * >( t );
+        member_type    * const self    = static_cast< member_type * >( base );
+        FunctorType    * const functor = static_cast< FunctorType * >( base );
+
+        // The per-index call goes through a const view of the functor.
+        const FunctorType  & const_functor = *functor ;
+        const range_policy & range         = self->m_policy ;
+
+        const IntType last = range.end();
+        for ( IntType i = range.begin() ; i < last ; ++i ) { const_functor(i); }
+
+        // The final apply() uses the non-const functor.
+        functor->apply();
+      }
+
+  public:
+
+    member_type( const FunctorType  & arg_func 
+               , const range_policy & arg_policy
+               )
+      : task_base_type( & task_base_type::template deallocate< member_type >
+                      , & member_type::apply
+                      , arg_func
+                      )
+      , m_policy( arg_policy )
+      {}
+  };
+
+
+  // Default construction and assignment are disabled.
+  TaskPolicy();
+  TaskPolicy & operator = ( const TaskPolicy & );
+
+public:
+
+  TaskPolicy( task_manager & manager , const range_policy & policy )
+    : m_task_manager( manager )
+    , m_range_policy( policy )
+    {}
+
+  // Allocate a task holding a copy of the functor and the range, schedule
+  // it, and return a future that refers to it.
+  template< class FunctorType , class ValueType >
+  Future< ValueType , execution_space >
+  spawn( const FunctorType & functor ) const
+    {
+      task_root_type * const task = new member_type< FunctorType >( functor , m_range_policy );
+
+      m_task_manager.schedule( task );
+
+      return Future< ValueType , execution_space >( task );
+    }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Task policy that spawns "reduce" tasks over a Serial range.
+ *
+ *  A spawned task initializes its result with the functor's init(),
+ *  calls the functor once per index with that result, and finally calls
+ *  the functor's apply() with the result.
+ */
+template< typename IntType , unsigned P >
+class TaskPolicy< Impl::TaskReduce< Kokkos::RangePolicy< Kokkos::Serial , void , IntType , P >  >  >
+{
+public:
+
+  typedef Kokkos::Serial execution_space ;
+
+private:
+
+  typedef RangePolicy< execution_space , void , IntType , P >  range_policy ;
+  typedef Impl::TaskManager< execution_space >  task_manager ;
+  typedef Impl::TaskMember<  execution_space >  task_root_type ;
+
+  task_manager & m_task_manager ;
+  range_policy   m_range_policy ;
+
+  // Reduce task: a value-producing functor task that also stores the range.
+  template< class FunctorType >
+  class member_type : public Impl::TaskMember< Kokkos::Serial , typename FunctorType::value_type , FunctorType >
+  {
+  private:
+    typedef typename FunctorType::value_type value_type ;
+
+    typedef Impl::TaskMember< Kokkos::Serial , value_type , FunctorType >    task_base_type ;
+    typedef Impl::TaskMember< Kokkos::Serial , value_type >    task_value_type ;
+
+    range_policy  m_policy ;
+
+    // Entry point registered with the root task by the constructor below.
+    static
+    void apply( task_root_type * t )
+      {
+        // Walk down the hierarchy: root task -> functor task -> this type.
+        task_base_type * const base    = static_cast< task_base_type * >( t );
+        member_type    * const self    = static_cast< member_type * >( base );
+        FunctorType    * const functor = static_cast< FunctorType * >( base );
+
+        // init() and the per-index calls go through a const view of the functor.
+        const FunctorType  & const_functor = *functor ;
+        const range_policy & range         = self->m_policy ;
+
+        const_functor.init( base->m_result );
+
+        const IntType last = range.end();
+        for ( IntType i = range.begin() ; i < last ; ++i ) { const_functor( i , base->m_result ); }
+
+        // The final apply() uses the non-const functor.
+        functor->apply( base->m_result );
+      }
+
+  public:
+
+    member_type( const FunctorType  & arg_func 
+               , const range_policy & arg_policy
+               )
+      : task_base_type( & task_base_type::template deallocate< member_type >
+                      , & member_type::apply
+                      , arg_func
+                      )
+      , m_policy( arg_policy )
+      {}
+  };
+
+  // Default construction and assignment are disabled.
+  TaskPolicy();
+  TaskPolicy & operator = ( const TaskPolicy & );
+
+public:
+
+  TaskPolicy( task_manager & manager , const range_policy & policy )
+    : m_task_manager( manager )
+    , m_range_policy( policy )
+    {}
+
+  // Allocate a task holding a copy of the functor and the range, schedule
+  // it, and return a future for the functor's value type.
+  template< class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  spawn( const FunctorType & functor ) const
+    {
+      task_root_type * const task = new member_type<FunctorType>( functor , m_range_policy );
+
+      m_task_manager.schedule( task );
+
+      return Future< typename FunctorType::value_type , execution_space >( task );
+    }
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_SERIAL_TASK_HPP */
+
--- a/lib/kokkos/core/src/impl/Kokkos_Shape.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Shape.cpp
@ -0,0 +1,178 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+
+#include <sstream>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_Shape.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Report two counts that were expected to be equal: format both values
+// into a message and pass it to throw_runtime_exception.
+void assert_counts_are_equal_throw(
+  const size_t x_count ,
+  const size_t y_count )
+{
+  std::ostringstream text ;
+
+  text << "Kokkos::Impl::assert_counts_are_equal_throw( " ;
+  text << x_count ;
+  text << " != " ;
+  text << y_count ;
+  text << " )" ;
+
+  throw_runtime_exception( text.str() );
+}
+
+// Report two shapes that were expected to be equal.
+//
+// For each shape the message lists the scalar size, the rank, and the
+// first 'rank' dimensions (at most eight), then the message is passed to
+// throw_runtime_exception.
+//
+//   x_scalar_size , y_scalar_size : scalar sizes to print
+//   x_rank , y_rank               : number of dimensions to print
+//   x_N0..x_N7 , y_N0..y_N7       : dimensions of each shape
+void assert_shapes_are_equal_throw(
+  const unsigned x_scalar_size ,
+  const unsigned x_rank ,
+  const size_t   x_N0 , const unsigned x_N1 ,
+  const unsigned x_N2 , const unsigned x_N3 ,
+  const unsigned x_N4 , const unsigned x_N5 ,
+  const unsigned x_N6 , const unsigned x_N7 ,
+
+  const unsigned y_scalar_size ,
+  const unsigned y_rank ,
+  const size_t   y_N0 , const unsigned y_N1 ,
+  const unsigned y_N2 , const unsigned y_N3 ,
+  const unsigned y_N4 , const unsigned y_N5 ,
+  const unsigned y_N6 , const unsigned y_N7 )
+{
+  enum { MAX_RANK = 8 };
+
+  // Collect the dimensions so each side is printed by one loop; widening
+  // the unsigned values to size_t does not change the printed text.
+  const size_t x_dim[ MAX_RANK ] =
+    { x_N0 , x_N1 , x_N2 , x_N3 , x_N4 , x_N5 , x_N6 , x_N7 };
+  const size_t y_dim[ MAX_RANK ] =
+    { y_N0 , y_N1 , y_N2 , y_N3 , y_N4 , y_N5 , y_N6 , y_N7 };
+
+  std::ostringstream msg ;
+
+  // The prefix now carries this function's actual name
+  // (it previously read "assert_shape_are_equal_throw").
+  msg << "Kokkos::Impl::assert_shapes_are_equal_throw( {"
+      << " scalar_size(" << x_scalar_size
+      << ") rank(" << x_rank
+      << ") dimension(" ;
+  for ( unsigned i = 0 ; i < unsigned(MAX_RANK) && i < x_rank ; ++i ) {
+    msg << " " << x_dim[i] ;
+  }
+  msg << " ) } != { "
+      << " scalar_size(" << y_scalar_size
+      << ") rank(" << y_rank
+      << ") dimension(" ;
+  for ( unsigned i = 0 ; i < unsigned(MAX_RANK) && i < y_rank ; ++i ) {
+    msg << " " << y_dim[i] ;
+  }
+  msg << " ) } )" ;
+
+  throw_runtime_exception( msg.str() );
+}
+
+// Host-side report of an out-of-bounds array access: print the first
+// 'rank' shape dimensions and the first 'arg_rank' indices (at most eight
+// of each) and pass the message to throw_runtime_exception.
+void AssertShapeBoundsAbort< Kokkos::HostSpace >::apply(
+  const size_t rank ,
+  const size_t n0 , const size_t n1 , 
+  const size_t n2 , const size_t n3 ,
+  const size_t n4 , const size_t n5 ,
+  const size_t n6 , const size_t n7 ,
+
+  const size_t arg_rank ,
+  const size_t i0 , const size_t i1 ,
+  const size_t i2 , const size_t i3 ,
+  const size_t i4 , const size_t i5 ,
+  const size_t i6 , const size_t i7 )
+{
+  enum { MAX_RANK = 8 };
+
+  const size_t dimension[ MAX_RANK ] = { n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 };
+  const size_t index[ MAX_RANK ]     = { i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 };
+
+  std::ostringstream text ;
+
+  text << "Kokkos::Impl::AssertShapeBoundsAbort( shape = {" ;
+  for ( size_t k = 0 ; k < size_t(MAX_RANK) && k < rank ; ++k ) {
+    text << " " << dimension[k] ;
+  }
+
+  text << " } index = {" ;
+  for ( size_t k = 0 ; k < size_t(MAX_RANK) && k < arg_rank ; ++k ) {
+    text << " " << index[k] ;
+  }
+
+  text << " } )" ;
+
+  throw_runtime_exception( text.str() );
+}
+
+// Report a failed "effective rank one with at least N entries" check:
+// print the first 'x_rank' dimensions (at most eight) followed by N0 and
+// pass the message to throw_runtime_exception.
+void assert_shape_effective_rank1_at_leastN_throw(
+  const size_t x_rank , const size_t x_N0 ,
+  const size_t x_N1 ,   const size_t x_N2 ,
+  const size_t x_N3 ,   const size_t x_N4 ,
+  const size_t x_N5 ,   const size_t x_N6 ,
+  const size_t x_N7 ,
+  const size_t N0 )
+{
+  enum { MAX_RANK = 8 };
+
+  const size_t dimension[ MAX_RANK ] =
+    { x_N0 , x_N1 , x_N2 , x_N3 , x_N4 , x_N5 , x_N6 , x_N7 };
+
+  std::ostringstream text ;
+
+  text << "Kokkos::Impl::assert_shape_effective_rank1_at_leastN_throw( shape = {" ;
+  for ( size_t k = 0 ; k < size_t(MAX_RANK) && k < x_rank ; ++k ) {
+    text << " " << dimension[k] ;
+  }
+  text << " } N = " << N0 << " )" ;
+
+  throw_runtime_exception( text.str() );
+}
+
+
+
+}
+}
+
--- a/lib/kokkos/core/src/impl/Kokkos_Shape.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Shape.hpp
@ -0,0 +1,895 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SHAPE_HPP
+#define KOKKOS_SHAPE_HPP
+
+#include <typeinfo>
+#include <utility>
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_StaticAssert.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  The shape of a Kokkos array with dynamic and static dimensions.
+ *          Dynamic dimensions are member values and static dimensions are
+ *          compile-time constants.
+ *
+ *  The upper bound on the array rank is eight.
+ *
+ *  \tparam ScalarSize  Exposed by the shape as 'scalar_size'.
+ *  \tparam Rank        Exposed by the shape as 'rank'.
+ *  \tparam s0 ... s7   Dimensions; each defaults to one.  A zero value
+ *                      selects a specialization in which that dimension
+ *                      is a runtime member instead of a constant.
+ */
+template< unsigned ScalarSize ,
+          unsigned Rank ,
+          unsigned s0  = 1 ,
+          unsigned s1  = 1 ,
+          unsigned s2  = 1 ,
+          unsigned s3  = 1 ,
+          unsigned s4  = 1 ,
+          unsigned s5  = 1 ,
+          unsigned s6  = 1 ,
+          unsigned s7  = 1 >
+struct Shape ;
+
+// Forward declaration only: templated on a shape type and a layout.
+// Its definition is not part of this section of the header.
+template< class ShapeType , class Layout >
+struct ShapeMap ;
+
+//----------------------------------------------------------------------------
+/** \brief  Two shapes are equal when their scalar sizes, ranks, and all
+ *          eight dimensions are equal.
+ *
+ *  Scalar size and rank are template arguments; the dimensions are read
+ *  from the shape objects.
+ */
+template< unsigned xSize , unsigned xRank ,
+          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
+          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
+
+          unsigned ySize , unsigned yRank ,
+          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
+          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
+KOKKOS_INLINE_FUNCTION
+bool operator == ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
+                   const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
+{
+  // The compile-time properties decide first.
+  const bool same_static = ( xSize == ySize ) && ( xRank == yRank );
+
+  if ( ! same_static ) { return false ; }
+
+  // N0 is compared as size_t, the remaining dimensions as unsigned.
+  if ( size_t( x.N0 )   != size_t( y.N0 ) )   { return false ; }
+  if ( unsigned( x.N1 ) != unsigned( y.N1 ) ) { return false ; }
+  if ( unsigned( x.N2 ) != unsigned( y.N2 ) ) { return false ; }
+  if ( unsigned( x.N3 ) != unsigned( y.N3 ) ) { return false ; }
+  if ( unsigned( x.N4 ) != unsigned( y.N4 ) ) { return false ; }
+  if ( unsigned( x.N5 ) != unsigned( y.N5 ) ) { return false ; }
+  if ( unsigned( x.N6 ) != unsigned( y.N6 ) ) { return false ; }
+
+  return unsigned( x.N7 ) == unsigned( y.N7 ) ;
+}
+
+/** \brief  Shape inequality: the negation of the equality operator. */
+template< unsigned xSize , unsigned xRank ,
+          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
+          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
+
+          unsigned ySize ,unsigned yRank ,
+          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
+          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
+KOKKOS_INLINE_FUNCTION
+bool operator != ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
+                   const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
+{
+  const bool shapes_equal = operator == ( x , y );
+  return ! shapes_equal ;
+}
+
+//----------------------------------------------------------------------------
+
+// Out-of-line failure path: formats a message from both counts.
+void assert_counts_are_equal_throw(
+  const size_t x_count ,
+  const size_t y_count );
+
+// Require two counts to be equal; on mismatch hand both values to
+// assert_counts_are_equal_throw.
+inline
+void assert_counts_are_equal(
+  const size_t x_count ,
+  const size_t y_count )
+{
+  if ( x_count == y_count ) { return ; }
+
+  assert_counts_are_equal_throw( x_count , y_count );
+}
+
+// Out-of-line failure path: formats a message from the scalar size, rank
+// and dimensions of both shapes.
+void assert_shapes_are_equal_throw(
+  const unsigned x_scalar_size ,
+  const unsigned x_rank ,
+  const size_t   x_N0 , const unsigned x_N1 ,
+  const unsigned x_N2 , const unsigned x_N3 ,
+  const unsigned x_N4 , const unsigned x_N5 ,
+  const unsigned x_N6 , const unsigned x_N7 ,
+
+  const unsigned y_scalar_size ,
+  const unsigned y_rank ,
+  const size_t   y_N0 , const unsigned y_N1 ,
+  const unsigned y_N2 , const unsigned y_N3 ,
+  const unsigned y_N4 , const unsigned y_N5 ,
+  const unsigned y_N6 , const unsigned y_N7 );
+
+// Require two shapes to compare equal (scalar size, rank and dimensions);
+// on mismatch hand the properties of both shapes to
+// assert_shapes_are_equal_throw.
+template< unsigned xSize , unsigned xRank ,
+          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
+          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
+
+          unsigned ySize , unsigned yRank ,
+          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
+          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
+inline
+void assert_shapes_are_equal(
+  const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
+  const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
+{
+  typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> lhs_shape ;
+  typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> rhs_shape ;
+
+  // Nothing to report when the shapes agree.
+  if ( ! ( x != y ) ) { return ; }
+
+  assert_shapes_are_equal_throw(
+    lhs_shape::scalar_size , lhs_shape::rank ,
+    x.N0 , x.N1 , x.N2 , x.N3 , x.N4 , x.N5 , x.N6 , x.N7 ,
+    rhs_shape::scalar_size , rhs_shape::rank ,
+    y.N0 , y.N1 , y.N2 , y.N3 , y.N4 , y.N5 , y.N6 , y.N7 );
+}
+
+// Require two shapes to have the same rank and the same eight dimensions.
+// Unlike assert_shapes_are_equal the scalar size is not compared.
+// On mismatch the properties of both shapes are handed to
+// assert_shapes_are_equal_throw.
+template< unsigned xSize , unsigned xRank ,
+          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
+          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
+
+          unsigned ySize , unsigned yRank ,
+          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
+          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
+void assert_shapes_equal_dimension(
+  const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
+  const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
+{
+  typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> lhs_shape ;
+  typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> rhs_shape ;
+
+  // N0 is compared as size_t, everything else as unsigned.
+  const bool same_dimensions =
+    unsigned( x.rank ) == unsigned( y.rank ) &&
+    size_t( x.N0 )     == size_t( y.N0 )     &&
+    unsigned( x.N1 )   == unsigned( y.N1 )   &&
+    unsigned( x.N2 )   == unsigned( y.N2 )   &&
+    unsigned( x.N3 )   == unsigned( y.N3 )   &&
+    unsigned( x.N4 )   == unsigned( y.N4 )   &&
+    unsigned( x.N5 )   == unsigned( y.N5 )   &&
+    unsigned( x.N6 )   == unsigned( y.N6 )   &&
+    unsigned( x.N7 )   == unsigned( y.N7 )   ;
+
+  if ( same_dimensions ) { return ; }
+
+  assert_shapes_are_equal_throw(
+    lhs_shape::scalar_size , lhs_shape::rank ,
+    x.N0 , x.N1 , x.N2 , x.N3 , x.N4 , x.N5 , x.N6 , x.N7 ,
+    rhs_shape::scalar_size , rhs_shape::rank ,
+    y.N0 , y.N1 , y.N2 , y.N3 , y.N4 , y.N5 , y.N6 , y.N7 );
+}
+
+//----------------------------------------------------------------------------
+
+// Compile-time rank checks.  The primary templates are declared but never
+// defined in this section, so only the specializations below are complete
+// types.
+template< class ShapeType > struct assert_shape_is_rank_zero ;
+template< class ShapeType > struct assert_shape_is_rank_one ;
+
+// Complete (derived from true_type) only for a rank-zero shape.
+template< unsigned Size >
+struct assert_shape_is_rank_zero< Shape<Size,0> >
+  : public true_type {};
+
+// Complete (derived from true_type) only for a rank-one shape whose
+// remaining dimensions keep their default values.
+template< unsigned Size , unsigned s0 >
+struct assert_shape_is_rank_one< Shape<Size,1,s0> >
+  : public true_type {};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Array bounds assertion templated on the execution space
+ *          to allow device-specific abort code.
+ *
+ *  apply() receives the shape ( rank and dimensions n0..n7 ) followed by
+ *  the number of supplied indices ( arg_rank ) and the indices i0..i7.
+ */
+template< class Space >
+struct AssertShapeBoundsAbort ;
+
+// Host specialization: apply() is only declared here and is defined
+// out of line.
+template<>
+struct AssertShapeBoundsAbort< Kokkos::HostSpace >
+{
+  static void apply( const size_t rank ,
+                     const size_t n0 , const size_t n1 ,
+                     const size_t n2 , const size_t n3 ,
+                     const size_t n4 , const size_t n5 ,
+                     const size_t n6 , const size_t n7 ,
+                     const size_t arg_rank ,
+                     const size_t i0 , const size_t i1 ,
+                     const size_t i2 , const size_t i3 ,
+                     const size_t i4 , const size_t i5 ,
+                     const size_t i6 , const size_t i7 );
+};
+
+// Primary template: any space without its own specialization forwards
+// all arguments unchanged to the HostSpace implementation.
+template< class ExecutionDevice >
+struct AssertShapeBoundsAbort
+{
+  KOKKOS_INLINE_FUNCTION
+  static void apply( const size_t rank ,
+                     const size_t n0 , const size_t n1 ,
+                     const size_t n2 , const size_t n3 ,
+                     const size_t n4 , const size_t n5 ,
+                     const size_t n6 , const size_t n7 ,
+                     const size_t arg_rank ,
+                     const size_t i0 , const size_t i1 ,
+                     const size_t i2 , const size_t i3 ,
+                     const size_t i4 , const size_t i5 ,
+                     const size_t i6 , const size_t i7 )
+    {
+      AssertShapeBoundsAbort< Kokkos::HostSpace >
+        ::apply( rank ,    n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 ,
+                 arg_rank, i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
+    }
+};
+
+// Check an index tuple against a shape.
+//
+//   shape     : shape providing 'rank' and the dimensions N0..N7
+//   arg_rank  : number of indices the caller supplied
+//   i0 .. i7  : indices; those not supplied default to zero
+//
+// The check passes when at least 'rank' indices were supplied and every
+// index is smaller than the matching dimension.  Otherwise the shape and
+// the indices are handed to AssertShapeBoundsAbort for the active
+// execution memory space.
+template< class ShapeType >
+KOKKOS_INLINE_FUNCTION
+void assert_shape_bounds( const ShapeType & shape ,
+                          const size_t arg_rank ,
+                          const size_t i0 ,
+                          const size_t i1 = 0 ,
+                          const size_t i2 = 0 ,
+                          const size_t i3 = 0 ,
+                          const size_t i4 = 0 ,
+                          const size_t i5 = 0 ,
+                          const size_t i6 = 0 ,
+                          const size_t i7 = 0 )
+{
+  // Must supply at least as many indices as ranks.
+  const bool enough_indices = ShapeType::rank <= arg_rank ;
+
+  // Every index must be within bounds.
+  const bool within_bounds = i0 < shape.N0 &&
+                             i1 < shape.N1 &&
+                             i2 < shape.N2 &&
+                             i3 < shape.N3 &&
+                             i4 < shape.N4 &&
+                             i5 < shape.N5 &&
+                             i6 < shape.N6 &&
+                             i7 < shape.N7 ;
+
+  if ( enough_indices && within_bounds ) { return ; }
+
+  AssertShapeBoundsAbort< Kokkos::Impl::ActiveExecutionMemorySpace >
+    ::apply( ShapeType::rank ,
+             shape.N0 , shape.N1 , shape.N2 , shape.N3 ,
+             shape.N4 , shape.N5 , shape.N6 , shape.N7 ,
+             arg_rank , i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
+}
+
+// Bounds-check macros for one to eight indices.
+// When KOKKOS_EXPRESSION_CHECK is defined each macro expands to a call of
+// assert_shape_bounds with the matching index count, already terminated by
+// a semicolon; otherwise each macro expands to nothing.
+// The call is unqualified, so assert_shape_bounds must be resolvable at the
+// point of use -- presumably through the enclosing namespace or
+// argument-dependent lookup on the shape type; confirm for new call sites.
+#if defined( KOKKOS_EXPRESSION_CHECK )
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) assert_shape_bounds(S,1,I0);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) assert_shape_bounds(S,2,I0,I1);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) assert_shape_bounds(S,3,I0,I1,I2);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) assert_shape_bounds(S,4,I0,I1,I2,I3);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) assert_shape_bounds(S,5,I0,I1,I2,I3,I4);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) assert_shape_bounds(S,6,I0,I1,I2,I3,I4,I5);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) assert_shape_bounds(S,7,I0,I1,I2,I3,I4,I5,I6);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) assert_shape_bounds(S,8,I0,I1,I2,I3,I4,I5,I6,I7);
+#else
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) /* */
+#endif
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Specialization and optimization for the Rank 0 shape.
+// Every property is a compile-time constant: rank and rank_dynamic are
+// zero and all eight dimensions are one, so the type has no data members.
+
+template < unsigned ScalarSize >
+struct Shape< ScalarSize , 0, 1,1,1,1, 1,1,1,1 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 0 };
+  enum { rank         = 0 };
+
+  enum { N0 = 1 };
+  enum { N1 = 1 };
+  enum { N2 = 1 };
+  enum { N3 = 1 };
+  enum { N4 = 1 };
+  enum { N5 = 1 };
+  enum { N6 = 1 };
+  enum { N7 = 1 };
+
+  // No runtime dimensions to store: all arguments are ignored.
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  {}
+};
+
+//----------------------------------------------------------------------------
+// All-static dimension array
+
+// Primary template, used when none of the specializations matches.
+// All eight extents are compile-time constants taken from s0..s7, so the
+// shape carries no data members.
+template< unsigned ScalarSize , unsigned Rank ,
+          unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
+          unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 >
+struct Shape
+{
+  enum { scalar_size  = ScalarSize };
+  enum { rank_dynamic = 0 };
+  enum { rank         = Rank };
+
+  enum { N0 = s0 }; enum { N1 = s1 }; enum { N2 = s2 }; enum { N3 = s3 };
+  enum { N4 = s4 }; enum { N5 = s5 }; enum { N6 = s6 }; enum { N7 = s7 };
+
+  // Nothing is stored: up to eight extent arguments are accepted and ignored.
+  KOKKOS_INLINE_FUNCTION
+  static void assign( Shape & ,
+                      unsigned /* n0 */ = 0 , unsigned /* n1 */ = 0 ,
+                      unsigned /* n2 */ = 0 , unsigned /* n3 */ = 0 ,
+                      unsigned /* n4 */ = 0 , unsigned /* n5 */ = 0 ,
+                      unsigned /* n6 */ = 0 , unsigned /* n7 */ = 0 )
+  {}
+};
+
+// 1 == dynamic_rank <= rank <= 8
+// One runtime extent (N0); N1..N7 are the compile-time values s1..s7.
+template< unsigned ScalarSize , unsigned Rank ,
+          unsigned s1 , unsigned s2 , unsigned s3 ,
+          unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,s1,s2,s3 , s4,s5,s6,s7 >
+{
+  enum { scalar_size  = ScalarSize };
+  enum { rank_dynamic = 1 };
+  enum { rank         = Rank };
+
+  // Runtime extent.  Held as size_t so that a single dynamic dimension
+  // may be larger than 2^32.
+  size_t N0 ;
+
+  // Compile-time extents.
+  enum { N1 = s1 }; enum { N2 = s2 }; enum { N3 = s3 }; enum { N4 = s4 };
+  enum { N5 = s5 }; enum { N6 = s6 }; enum { N7 = s7 };
+
+  // Record the runtime extent in 'shape'; the other arguments are ignored.
+  KOKKOS_INLINE_FUNCTION
+  static void assign( Shape & shape ,
+                      size_t n0 ,
+                      unsigned /* n1 */ = 0 , unsigned /* n2 */ = 0 ,
+                      unsigned /* n3 */ = 0 , unsigned /* n4 */ = 0 ,
+                      unsigned /* n5 */ = 0 , unsigned /* n6 */ = 0 ,
+                      unsigned /* n7 */ = 0 )
+  {
+    shape.N0 = n0 ;
+  }
+};
+
+// 2 == dynamic_rank <= rank <= 8
+// Two runtime extents (N0,N1); N2..N7 are the compile-time values s2..s7.
+template< unsigned ScalarSize , unsigned Rank ,
+          unsigned s2 , unsigned s3 , unsigned s4 ,
+          unsigned s5 , unsigned s6 , unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,s2,s3 , s4,s5,s6,s7 >
+{
+  enum { scalar_size  = ScalarSize };
+  enum { rank_dynamic = 2 };
+  enum { rank         = Rank };
+
+  // Runtime extents.
+  unsigned N0 , N1 ;
+
+  // Compile-time extents.
+  enum { N2 = s2 }; enum { N3 = s3 }; enum { N4 = s4 };
+  enum { N5 = s5 }; enum { N6 = s6 }; enum { N7 = s7 };
+
+  // Record the runtime extents in 'shape'; the other arguments are ignored.
+  KOKKOS_INLINE_FUNCTION
+  static void assign( Shape & shape ,
+                      unsigned n0 , unsigned n1 ,
+                      unsigned /* n2 */ = 0 , unsigned /* n3 */ = 0 ,
+                      unsigned /* n4 */ = 0 , unsigned /* n5 */ = 0 ,
+                      unsigned /* n6 */ = 0 , unsigned /* n7 */ = 0 )
+  {
+    shape.N0 = n0 ;
+    shape.N1 = n1 ;
+  }
+};
+
+// 3 == dynamic_rank <= rank <= 8
+// Three runtime extents (N0,N1,N2); N3..N7 are the compile-time values
+// s3..s7.
+//
+// The template parameters are listed as ( ScalarSize , Rank , ... ), the same
+// order used by the sibling specializations and by the argument list below.
+// For a partial specialization the declaration order of its own parameters
+// does not affect matching, so this ordering is purely for readability.
+template < unsigned ScalarSize , unsigned Rank ,
+           unsigned s3 ,
+           unsigned s4 ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,s3, s4,s5,s6,s7>
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 3 };
+  enum { rank         = Rank };
+
+  // Runtime extents.
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+
+  // Compile-time extents.
+  enum { N3 = s3 };
+  enum { N4 = s4 };
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  // Record the three runtime extents in 's'; the remaining (unnamed)
+  // arguments are accepted and ignored.
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s ,
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned = 0 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; }
+};
+
+// 4 == dynamic_rank <= rank <= 8
+// Four runtime extents (N0..N3); N4..N7 are the compile-time values s4..s7.
+template< unsigned ScalarSize , unsigned Rank ,
+          unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,0 , s4,s5,s6,s7 >
+{
+  enum { scalar_size  = ScalarSize };
+  enum { rank_dynamic = 4 };
+  enum { rank         = Rank };
+
+  // Runtime extents.
+  unsigned N0 , N1 , N2 , N3 ;
+
+  // Compile-time extents.
+  enum { N4 = s4 }; enum { N5 = s5 }; enum { N6 = s6 }; enum { N7 = s7 };
+
+  // Record the runtime extents in 'shape'; the other arguments are ignored.
+  KOKKOS_INLINE_FUNCTION
+  static void assign( Shape & shape ,
+                      unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+                      unsigned /* n4 */ = 0 , unsigned /* n5 */ = 0 ,
+                      unsigned /* n6 */ = 0 , unsigned /* n7 */ = 0 )
+  {
+    shape.N0 = n0 ;
+    shape.N1 = n1 ;
+    shape.N2 = n2 ;
+    shape.N3 = n3 ;
+  }
+};
+
+// 5 == dynamic_rank <= rank <= 8
+// Five runtime extents (N0..N4); N5..N7 are the compile-time values s5..s7.
+template< unsigned ScalarSize , unsigned Rank ,
+          unsigned s5 , unsigned s6 , unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,0 , 0,s5,s6,s7 >
+{
+  enum { scalar_size  = ScalarSize };
+  enum { rank_dynamic = 5 };
+  enum { rank         = Rank };
+
+  // Runtime extents.
+  unsigned N0 , N1 , N2 , N3 , N4 ;
+
+  // Compile-time extents.
+  enum { N5 = s5 }; enum { N6 = s6 }; enum { N7 = s7 };
+
+  // Record the runtime extents in 'shape'; the other arguments are ignored.
+  KOKKOS_INLINE_FUNCTION
+  static void assign( Shape & shape ,
+                      unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+                      unsigned n4 ,
+                      unsigned /* n5 */ = 0 , unsigned /* n6 */ = 0 ,
+                      unsigned /* n7 */ = 0 )
+  {
+    shape.N0 = n0 ;
+    shape.N1 = n1 ;
+    shape.N2 = n2 ;
+    shape.N3 = n3 ;
+    shape.N4 = n4 ;
+  }
+};
+
+// 6 == dynamic_rank <= rank <= 8
+// Six runtime extents (N0..N5); N6 and N7 are the compile-time values s6, s7.
+template< unsigned ScalarSize , unsigned Rank ,
+          unsigned s6 , unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,0 , 0,0,s6,s7 >
+{
+  enum { scalar_size  = ScalarSize };
+  enum { rank_dynamic = 6 };
+  enum { rank         = Rank };
+
+  // Runtime extents.
+  unsigned N0 , N1 , N2 , N3 , N4 , N5 ;
+
+  // Compile-time extents.
+  enum { N6 = s6 }; enum { N7 = s7 };
+
+  // Record the runtime extents in 'shape'.  Note that n5 has a default of 0,
+  // so it may be omitted by the caller; the last two arguments are ignored.
+  KOKKOS_INLINE_FUNCTION
+  static void assign( Shape & shape ,
+                      unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+                      unsigned n4 , unsigned n5 = 0 ,
+                      unsigned /* n6 */ = 0 , unsigned /* n7 */ = 0 )
+  {
+    shape.N0 = n0 ;
+    shape.N1 = n1 ;
+    shape.N2 = n2 ;
+    shape.N3 = n3 ;
+    shape.N4 = n4 ;
+    shape.N5 = n5 ;
+  }
+};
+
+// 7 == dynamic_rank <= rank <= 8
+// Seven runtime extents (N0..N6); N7 is the compile-time value s7.
+template< unsigned ScalarSize , unsigned Rank , unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,0 , 0,0,0,s7 >
+{
+  enum { scalar_size  = ScalarSize };
+  enum { rank_dynamic = 7 };
+  enum { rank         = Rank };
+
+  // Runtime extents.
+  unsigned N0 , N1 , N2 , N3 , N4 , N5 , N6 ;
+
+  // Compile-time extent.
+  enum { N7 = s7 };
+
+  // Record the runtime extents in 'shape'; the last argument is ignored.
+  KOKKOS_INLINE_FUNCTION
+  static void assign( Shape & shape ,
+                      unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+                      unsigned n4 , unsigned n5 , unsigned n6 ,
+                      unsigned /* n7 */ = 0 )
+  {
+    shape.N0 = n0 ;
+    shape.N1 = n1 ;
+    shape.N2 = n2 ;
+    shape.N3 = n3 ;
+    shape.N4 = n4 ;
+    shape.N5 = n5 ;
+    shape.N6 = n6 ;
+  }
+};
+
+// 8 == dynamic_rank <= rank <= 8
+// Rank 8 with all eight extents (N0..N7) supplied at run time.
+template< unsigned ScalarSize >
+struct Shape< ScalarSize , 8 , 0,0,0,0 , 0,0,0,0 >
+{
+  enum { scalar_size  = ScalarSize };
+  enum { rank_dynamic = 8 };
+  enum { rank         = 8 };
+
+  // Runtime extents.
+  unsigned N0 , N1 , N2 , N3 , N4 , N5 , N6 , N7 ;
+
+  // Record all eight runtime extents in 'shape'; none has a default.
+  KOKKOS_INLINE_FUNCTION
+  static void assign( Shape & shape ,
+                      unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+                      unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 )
+  {
+    shape.N0 = n0 ;
+    shape.N1 = n1 ;
+    shape.N2 = n2 ;
+    shape.N3 = n3 ;
+    shape.N4 = n4 ;
+    shape.N5 = n5 ;
+    shape.N6 = n6 ;
+    shape.N7 = n7 ;
+  }
+};
+
+//----------------------------------------------------------------------------
+
+// ShapeInsert< ShapeType , N >::type is a Shape of rank ( ShapeType::rank + 1 )
+// in which the compile-time extent N is placed directly after ShapeType's
+// dynamic extents.  The static extents that follow move up by one slot and
+// ShapeType::N7 is not carried over.  The third parameter defaults to
+// ShapeType::rank_dynamic and selects the specialization.
+template< class ShapeType ,
+          unsigned N ,
+          unsigned R = ShapeType::rank_dynamic >
+struct ShapeInsert ;
+
+// No dynamic extents: N becomes extent 0, followed by ShapeType::N0..N6.
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 0 >
+{
+  typedef Shape< ShapeType::scalar_size , ShapeType::rank + 1 ,
+                 N ,
+                 ShapeType::N0 , ShapeType::N1 , ShapeType::N2 ,
+                 ShapeType::N3 , ShapeType::N4 , ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+// One dynamic extent: slot 0 stays dynamic (0), N takes slot 1 and
+// ShapeType::N1..N6 fill slots 2..7.
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 1 >
+{
+  typedef Shape< ShapeType::scalar_size , ShapeType::rank + 1 ,
+                 0 ,
+                 N ,
+                 ShapeType::N1 , ShapeType::N2 , ShapeType::N3 ,
+                 ShapeType::N4 , ShapeType::N5 , ShapeType::N6 > type ;
+};
+
+// Two dynamic extents: slots 0-1 stay dynamic (0), N takes slot 2 and
+// ShapeType::N2..N6 fill slots 3..7.
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 2 >
+{
+  typedef Shape< ShapeType::scalar_size , ShapeType::rank + 1 ,
+                 0 , 0 ,
+                 N ,
+                 ShapeType::N2 , ShapeType::N3 , ShapeType::N4 ,
+                 ShapeType::N5 , ShapeType::N6 > type ;
+};
+
+// Three dynamic extents: slots 0-2 stay dynamic (0), N takes slot 3 and
+// ShapeType::N3..N6 fill slots 4..7.
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 3 >
+{
+  typedef Shape< ShapeType::scalar_size , ShapeType::rank + 1 ,
+                 0 , 0 , 0 ,
+                 N ,
+                 ShapeType::N3 , ShapeType::N4 ,
+                 ShapeType::N5 , ShapeType::N6 > type ;
+};
+
+// Four dynamic extents: slots 0-3 stay dynamic (0), N takes slot 4 and
+// ShapeType::N4..N6 fill slots 5..7.
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 4 >
+{
+  typedef Shape< ShapeType::scalar_size , ShapeType::rank + 1 ,
+                 0 , 0 , 0 , 0 ,
+                 N ,
+                 ShapeType::N4 , ShapeType::N5 , ShapeType::N6 > type ;
+};
+
+// Five dynamic extents: slots 0-4 stay dynamic (0), N takes slot 5 and
+// ShapeType::N5, N6 fill slots 6 and 7.
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 5 >
+{
+  typedef Shape< ShapeType::scalar_size , ShapeType::rank + 1 ,
+                 0 , 0 , 0 , 0 , 0 ,
+                 N ,
+                 ShapeType::N5 , ShapeType::N6 > type ;
+};
+
+// Six dynamic extents: slots 0-5 stay dynamic (0), N takes slot 6 and
+// ShapeType::N6 fills slot 7.
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 6 >
+{
+  typedef Shape< ShapeType::scalar_size , ShapeType::rank + 1 ,
+                 0 , 0 , 0 , 0 , 0 , 0 ,
+                 N ,
+                 ShapeType::N6 > type ;
+};
+
+// Seven dynamic extents: slots 0-6 stay dynamic (0) and N takes the last
+// slot.
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 7 >
+{
+  typedef Shape< ShapeType::scalar_size , ShapeType::rank + 1 ,
+                 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+                 N > type ;
+};
+
+//----------------------------------------------------------------------------
+
+// Compile-time test of whether two shape types are compatible.
+// The primary template yields false; it is what remains when the
+// destination has fewer dynamic extents than the source
+// ( DstRankDynamicOK == false ).  The < ... , K , true > specializations
+// compare the scalar size and the extents N(K)..N7.
+template< class DstShape , class SrcShape ,
+          unsigned DstRankDynamic   = DstShape::rank_dynamic ,
+          bool     DstRankDynamicOK =
+            ( unsigned(DstShape::rank_dynamic) >= unsigned(SrcShape::rank_dynamic) ) >
+struct ShapeCompatible
+{
+  enum { value = false };
+};
+
+// Destination has 8 dynamic extents: only the scalar sizes must agree.
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 8 , true >
+{
+  enum { value =
+    ( unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) ) };
+};
+
+// Destination has 7 dynamic extents: scalar size and N7 must agree.
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 7 , true >
+{
+  enum { value =
+    ( unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) ) &&
+    ( unsigned(DstShape::N7)          == unsigned(SrcShape::N7) ) };
+};
+
+// Destination has 6 dynamic extents: scalar size and N6..N7 must agree.
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 6 , true >
+{
+  enum { value =
+    ( unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) ) &&
+    ( unsigned(DstShape::N6)          == unsigned(SrcShape::N6) ) &&
+    ( unsigned(DstShape::N7)          == unsigned(SrcShape::N7) ) };
+};
+
+// Destination has 5 dynamic extents: scalar size and N5..N7 must agree.
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 5 , true >
+{
+  enum { value =
+    ( unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) ) &&
+    ( unsigned(DstShape::N5)          == unsigned(SrcShape::N5) ) &&
+    ( unsigned(DstShape::N6)          == unsigned(SrcShape::N6) ) &&
+    ( unsigned(DstShape::N7)          == unsigned(SrcShape::N7) ) };
+};
+
+// Destination has 4 dynamic extents: scalar size and N4..N7 must agree.
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 4 , true >
+{
+  enum { value =
+    ( unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) ) &&
+    ( unsigned(DstShape::N4)          == unsigned(SrcShape::N4) ) &&
+    ( unsigned(DstShape::N5)          == unsigned(SrcShape::N5) ) &&
+    ( unsigned(DstShape::N6)          == unsigned(SrcShape::N6) ) &&
+    ( unsigned(DstShape::N7)          == unsigned(SrcShape::N7) ) };
+};
+
+// Destination has 3 dynamic extents: scalar size and N3..N7 must agree.
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 3 , true >
+{
+  enum { value =
+    ( unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) ) &&
+    ( unsigned(DstShape::N3)          == unsigned(SrcShape::N3) ) &&
+    ( unsigned(DstShape::N4)          == unsigned(SrcShape::N4) ) &&
+    ( unsigned(DstShape::N5)          == unsigned(SrcShape::N5) ) &&
+    ( unsigned(DstShape::N6)          == unsigned(SrcShape::N6) ) &&
+    ( unsigned(DstShape::N7)          == unsigned(SrcShape::N7) ) };
+};
+
+// Destination has 2 dynamic extents: scalar size and N2..N7 must agree.
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 2 , true >
+{
+  enum { value =
+    ( unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) ) &&
+    ( unsigned(DstShape::N2)          == unsigned(SrcShape::N2) ) &&
+    ( unsigned(DstShape::N3)          == unsigned(SrcShape::N3) ) &&
+    ( unsigned(DstShape::N4)          == unsigned(SrcShape::N4) ) &&
+    ( unsigned(DstShape::N5)          == unsigned(SrcShape::N5) ) &&
+    ( unsigned(DstShape::N6)          == unsigned(SrcShape::N6) ) &&
+    ( unsigned(DstShape::N7)          == unsigned(SrcShape::N7) ) };
+};
+
+// Destination has 1 dynamic extent: scalar size and N1..N7 must agree
+// (N0 is not compared).
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 1 , true >
+{
+  enum { value =
+    ( unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) ) &&
+    ( unsigned(DstShape::N1)          == unsigned(SrcShape::N1) ) &&
+    ( unsigned(DstShape::N2)          == unsigned(SrcShape::N2) ) &&
+    ( unsigned(DstShape::N3)          == unsigned(SrcShape::N3) ) &&
+    ( unsigned(DstShape::N4)          == unsigned(SrcShape::N4) ) &&
+    ( unsigned(DstShape::N5)          == unsigned(SrcShape::N5) ) &&
+    ( unsigned(DstShape::N6)          == unsigned(SrcShape::N6) ) &&
+    ( unsigned(DstShape::N7)          == unsigned(SrcShape::N7) ) };
+};
+
+// Destination is fully static: scalar size and all of N0..N7 must agree.
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 0 , true >
+{
+  enum { value =
+    ( unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) ) &&
+    ( unsigned(DstShape::N0)          == unsigned(SrcShape::N0) ) &&
+    ( unsigned(DstShape::N1)          == unsigned(SrcShape::N1) ) &&
+    ( unsigned(DstShape::N2)          == unsigned(SrcShape::N2) ) &&
+    ( unsigned(DstShape::N3)          == unsigned(SrcShape::N3) ) &&
+    ( unsigned(DstShape::N4)          == unsigned(SrcShape::N4) ) &&
+    ( unsigned(DstShape::N5)          == unsigned(SrcShape::N5) ) &&
+    ( unsigned(DstShape::N6)          == unsigned(SrcShape::N6) ) &&
+    ( unsigned(DstShape::N7)          == unsigned(SrcShape::N7) ) };
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Return extent number 'r' of 'shape' (0 -> N0 , ... , 7 -> N7).
+// Any other value of 'r' yields 1.
+template< unsigned ScalarSize , unsigned Rank ,
+          unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
+          unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 ,
+          typename iType >
+KOKKOS_INLINE_FUNCTION
+size_t dimension(
+  const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape ,
+  const iType & r )
+{
+  if ( 0 == r ) return shape.N0 ;
+  if ( 1 == r ) return shape.N1 ;
+  if ( 2 == r ) return shape.N2 ;
+  if ( 3 == r ) return shape.N3 ;
+  if ( 4 == r ) return shape.N4 ;
+  if ( 5 == r ) return shape.N5 ;
+  if ( 6 == r ) return shape.N6 ;
+  if ( 7 == r ) return shape.N7 ;
+  return 1 ;
+}
+
+// Product of all eight extents N0..N7 of 'shape', accumulated in size_t.
+template< unsigned ScalarSize , unsigned Rank ,
+          unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
+          unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 >
+KOKKOS_INLINE_FUNCTION
+size_t cardinality_count(
+  const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape )
+{
+  size_t count = size_t(shape.N0);
+  count = count * shape.N1 ;
+  count = count * shape.N2 ;
+  count = count * shape.N3 ;
+  count = count * shape.N4 ;
+  count = count * shape.N5 ;
+  count = count * shape.N6 ;
+  count = count * shape.N7 ;
+  return count ;
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_CORESHAPE_HPP */
+
--- a/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp
@ -0,0 +1,79 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STATICASSERT_HPP
+#define KOKKOS_STATICASSERT_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+// Compile-time assertion helper.  Only the < true , T > case is defined,
+// so referring to a member of StaticAssert< false , T > does not compile.
+template< bool Condition , class T = void >
+struct StaticAssert ;
+
+template< class T >
+struct StaticAssert< true , T >
+{
+  static const bool value = true ;
+  typedef T type ;
+};
+
+// Defined only when both type arguments are the same type, in which case
+// 'type' names that type.
+template< class A , class B >
+struct StaticAssertSame ;
+
+template< class A >
+struct StaticAssertSame< A , A >
+{
+  typedef A type ;
+};
+
+// Defined only for < A , A > and for < const A , A >; 'type' names the
+// first template argument.
+template< class A , class B >
+struct StaticAssertAssignable ;
+
+template< class A >
+struct StaticAssertAssignable< A , A >
+{
+  typedef A type ;
+};
+
+template< class A >
+struct StaticAssertAssignable< const A , A >
+{
+  typedef const A type ;
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* KOKKOS_STATICASSERT_HPP */
+
+
--- a/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
@ -0,0 +1,110 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                             Kokkos
+//         Manycore Performance-Portable Multidimensional Arrays
+//
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TAGS_HPP
+#define KOKKOS_TAGS_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+struct LayoutTag {};          // is_layout tests C::kokkos_tag against this type
+
+struct MemorySpaceTag {};     // is_memory_space tests C::kokkos_tag against this type
+struct MemoryTraitsTag {};    // is_memorytraits tests C::kokkos_tag against this type
+
+struct ExecutionPolicyTag {}; // is_execution_policy tests C::kokkos_tag against this type
+struct ExecutionSpaceTag {};  // is_execution_space tests C::kokkos_tag against this type
+
+
+// Trait: derives from bool_< true > only when C has a nested type
+// 'kokkos_tag' that is the same type as MemorySpaceTag.
+// NOTE(review): presumably enable_if_type< T >::type is 'void' for any
+// type T, so the specialization is picked whenever C::kokkos_tag exists;
+// confirm against Kokkos_Traits.hpp.
+template< class C , class Enable = void >
+struct is_memory_space
+  : public bool_< false >
+{};
+
+template< class C >
+struct is_memory_space< C ,
+                        typename Impl::enable_if_type< typename C::kokkos_tag >::type >
+  : public bool_< Impl::is_same< typename C::kokkos_tag ,
+                                 Impl::MemorySpaceTag >::value >
+{};
+
+
+// Trait: derives from bool_< true > only when C has a nested type
+// 'kokkos_tag' that is the same type as ExecutionSpaceTag.
+// NOTE(review): presumably enable_if_type< T >::type is 'void' for any
+// type T, so the specialization is picked whenever C::kokkos_tag exists;
+// confirm against Kokkos_Traits.hpp.
+template< class C , class Enable = void >
+struct is_execution_space
+  : public bool_< false >
+{};
+
+template< class C >
+struct is_execution_space< C ,
+                           typename Impl::enable_if_type< typename C::kokkos_tag >::type >
+  : public bool_< Impl::is_same< typename C::kokkos_tag ,
+                                 Impl::ExecutionSpaceTag >::value >
+{};
+
+
+// Trait: derives from bool_< true > only when C has a nested type
+// 'kokkos_tag' that is the same type as ExecutionPolicyTag.
+// NOTE(review): presumably enable_if_type< T >::type is 'void' for any
+// type T, so the specialization is picked whenever C::kokkos_tag exists;
+// confirm against Kokkos_Traits.hpp.
+template< class C , class Enable = void >
+struct is_execution_policy
+  : public bool_< false >
+{};
+
+template< class C >
+struct is_execution_policy< C ,
+                            typename Impl::enable_if_type< typename C::kokkos_tag >::type >
+  : public bool_< Impl::is_same< typename C::kokkos_tag ,
+                                 Impl::ExecutionPolicyTag >::value >
+{};
+
+
+// Trait: true only when C has a nested type 'kokkos_tag' that is the same
+// type as LayoutTag.
+//
+// The specialization derives from bool_< ... > exactly as is_memory_space,
+// is_execution_space and is_execution_policy do, instead of declaring a bare
+// 'enum { value }'.  That way the positive and negative cases of the trait
+// are both bool_/false_type-style classes rather than two unrelated class
+// layouts.
+// NOTE(review): presumably enable_if_type< T >::type is 'void' for any
+// type T, so the specialization is picked whenever C::kokkos_tag exists;
+// confirm against Kokkos_Traits.hpp.
+template< class C , class Enable = void >
+struct is_layout : public Impl::false_type {};
+
+template<class C>
+struct is_layout<C,typename Impl::enable_if_type< typename C::kokkos_tag >::type >
+  : public bool_< Impl::is_same< typename C::kokkos_tag , Impl::LayoutTag >::value > {};
+
+
+// Trait: true only when C has a nested type 'kokkos_tag' that is the same
+// type as MemoryTraitsTag.
+//
+// The specialization derives from bool_< ... > exactly as is_memory_space,
+// is_execution_space and is_execution_policy do, instead of declaring a bare
+// 'enum { value }'.  That way the positive and negative cases of the trait
+// are both bool_/false_type-style classes rather than two unrelated class
+// layouts.
+// NOTE(review): presumably enable_if_type< T >::type is 'void' for any
+// type T, so the specialization is picked whenever C::kokkos_tag exists;
+// confirm against Kokkos_Traits.hpp.
+template< class C , class Enable = void >
+struct is_memorytraits : public Impl::false_type {};
+
+template<class C>
+struct is_memorytraits<C,typename Impl::enable_if_type< typename C::kokkos_tag >::type >
+  : public bool_< Impl::is_same< typename C::kokkos_tag , Impl::MemoryTraitsTag >::value > {};
+
+}
+}
+
+#endif
--- a/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
@ -0,0 +1,115 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPLWALLTIME_HPP
+#define KOKKOS_IMPLWALLTIME_HPP
+
+#include <stddef.h>
+
+#ifdef _MSC_VER
+#undef KOKKOS_USE_LIBRT
+#include <gettimeofday.c>
+#else
+#ifdef KOKKOS_USE_LIBRT
+#include <ctime>
+#else
+#include <sys/time.h>
+#endif
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Time since construction, or since the last call to reset().
+ *
+ *  With KOKKOS_USE_LIBRT defined the time is read with clock_gettime(),
+ *  otherwise with gettimeofday().  The copy constructor and the copy
+ *  assignment operator are private and only declared here.
+ */
+
+class Timer {
+private:
+  #ifdef KOKKOS_USE_LIBRT
+  struct timespec m_old ;
+  #else
+  struct timeval m_old ;
+  #endif
+
+  Timer( const Timer & );
+  Timer & operator = ( const Timer & );
+
+  #ifdef KOKKOS_USE_LIBRT
+  // Clock used for both the start stamp and the current reading.
+  // An elapsed-time measurement should not change when the system's
+  // wall-clock time is adjusted, so the monotonic clock is used whenever
+  // the platform's headers provide it; otherwise fall back to the
+  // real-time clock.
+  static clockid_t clock_id()
+  {
+    #ifdef CLOCK_MONOTONIC
+    return CLOCK_MONOTONIC ;
+    #else
+    return CLOCK_REALTIME ;
+    #endif
+  }
+  #endif
+
+public:
+
+  /** \brief  Restart the measurement: store the current time as the start. */
+  inline
+  void reset() {
+    #ifdef KOKKOS_USE_LIBRT
+    ::clock_gettime( clock_id() , & m_old );
+    #else
+    // Qualified with '::' to match the call made in seconds().
+    ::gettimeofday( & m_old , ((struct timezone *) NULL ) );
+    #endif
+  }
+
+  inline
+  ~Timer() {}
+
+  /** \brief  Start timing at construction. */
+  inline
+  Timer() { reset(); }
+
+  /** \brief  Seconds elapsed since construction or the last reset(). */
+  inline
+  double seconds() const
+  {
+    #ifdef KOKKOS_USE_LIBRT
+      struct timespec m_new;
+      ::clock_gettime( clock_id() , & m_new );
+
+      // Whole seconds plus the (possibly negative) nanosecond remainder.
+      return ( (double) ( m_new.tv_sec  - m_old.tv_sec ) ) +
+             ( (double) ( m_new.tv_nsec - m_old.tv_nsec ) * 1.0e-9 );
+    #else
+      struct timeval m_new ;
+
+      ::gettimeofday( & m_new , ((struct timezone *) NULL ) );
+
+      // Whole seconds plus the (possibly negative) microsecond remainder.
+      return ( (double) ( m_new.tv_sec  - m_old.tv_sec ) ) +
+             ( (double) ( m_new.tv_usec - m_old.tv_usec ) * 1.0e-6 );
+    #endif
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */
+
--- a/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
@ -0,0 +1,332 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSTRAITS_HPP
+#define KOKKOSTRAITS_HPP
+
+#include <stddef.h>
+#include <Kokkos_Macros.hpp>
+#include <stdint.h>
+
+namespace Kokkos {
+namespace Impl {
+
+/* C++11 conformal compile-time type traits utilities.
+ * Prefer to use C++11 when portably available.
+ */
+//----------------------------------------------------------------------------
+// C++11 Helpers:
+
+/** \brief  Wraps the compile-time constant 'v' of type 'T' as a type.
+ *
+ *  'value' is an enumerator holding T(v), 'value_type' is T and
+ *  'type' names this instantiation.  An object of this type converts
+ *  to T and yields 'v'.
+ */
+template < class T , T v >
+struct integral_constant
+{
+  // Declaration of 'static const' causes an unresolved linker symbol in debug
+  // static const T value = v ;
+  enum { value = T(v) };
+  typedef T value_type;
+  typedef integral_constant<T,v> type;
+
+  // The conversion is const-qualified so that it can also be applied to
+  // const objects and through const references; it never modifies state.
+  KOKKOS_INLINE_FUNCTION operator T() const { return v ; }
+};
+
+// Boolean constants used as base classes by the traits below.
+typedef integral_constant<bool,false> false_type ;
+typedef integral_constant<bool,true>  true_type ;
+
+//----------------------------------------------------------------------------
+// C++11 Type relationships:
+
+// 'value' is true only when both template arguments name the same type.
+template< class Lhs , class Rhs >
+struct is_same : public false_type {};
+
+template< class Same >
+struct is_same< Same , Same > : public true_type {};
+
+//----------------------------------------------------------------------------
+// C++11 Type properties:
+
+// 'value' is true for a const-qualified type and also for a reference
+// to a const-qualified type; false for every other type.
+template < class Type > struct is_const                 : public false_type {};
+template < class Type > struct is_const< const Type >   : public true_type {};
+template < class Type > struct is_const< const Type & > : public true_type {};
+
+//----------------------------------------------------------------------------
+// C++11 Type transformations:
+
+// remove_const: strips 'const' from a type and from the referee of a reference.
+//   T -> T ,  const T -> T ,  const T & -> T &
+template < class Type > struct remove_const                 { typedef Type   type ; };
+template < class Type > struct remove_const< const Type >   { typedef Type   type ; };
+template < class Type > struct remove_const< const Type & > { typedef Type & type ; };
+
+// add_const: adds 'const' to a type or to the referee of a reference.
+//   T -> const T ,  T & -> const T & ,  already-const forms are unchanged
+template < class Type > struct add_const                 { typedef const Type   type ; };
+template < class Type > struct add_const< Type & >       { typedef const Type & type ; };
+template < class Type > struct add_const< const Type >   { typedef const Type   type ; };
+template < class Type > struct add_const< const Type & > { typedef const Type & type ; };
+
+// remove_reference: strips '&' while keeping any 'const' on the referee.
+//   T -> T ,  T & -> T ,  const T & -> const T
+template < class Type > struct remove_reference                 { typedef Type       type ; };
+template < class Type > struct remove_reference< Type & >       { typedef Type       type ; };
+template < class Type > struct remove_reference< const Type & > { typedef const Type type ; };
+
+//----------------------------------------------------------------------------
+// C++11 Other type generators:
+
+// condition<B,T,F>::type is T when B is true and F when B is false.
+template< bool , class IfTrue , class IfFalse >
+struct condition { typedef IfFalse type ; };
+
+template< class IfTrue , class IfFalse >
+struct condition< true , IfTrue , IfFalse > { typedef IfTrue type ; };
+
+// enable_if<B,T>::type exists (and is T, defaulting to void) only when
+// B is true; the primary template is declared but never defined.
+template< bool , class = void >
+struct enable_if ;
+
+template< class Type >
+struct enable_if< true , Type > { typedef Type type ; };
+
+//----------------------------------------------------------------------------
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Other traits
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+// enable_if_type<X,T>::type is T (void by default) for any first
+// argument X; the first argument itself is not used.
+template< class , class Result = void >
+struct enable_if_type { typedef Result type ; };
+
+//----------------------------------------------------------------------------
+
+// Shorthand wrappers deriving from integral_constant for one fixed value type.
+
+template< bool Flag >
+struct bool_ : public integral_constant< bool , Flag > {};
+
+template< unsigned Number >
+struct unsigned_ : public integral_constant< unsigned , Number > {};
+
+template< int Number >
+struct int_ : public integral_constant< int , Number > {};
+
+// Named boolean constants built on bool_.
+typedef bool_<true>  true_ ;
+typedef bool_<false> false_ ;
+//----------------------------------------------------------------------------
+// if_
+
+// Primary template of if_c: provides the members for the case where
+// 'type' is FalseType (the 'true' case has its own specialization).
+//
+//   value            : Cond
+//   type             : FalseType
+//   value_type       : 'type' with reference and const removed
+//   const_value_type : const-qualified value_type
+//
+// The two-argument select() overloads return their SECOND argument.
+template < bool Cond , typename TrueType , typename FalseType >
+struct if_c
+{
+  enum { value = Cond };
+
+  typedef FalseType type ;
+
+  typedef typename remove_reference< type >::type  referee_removed_type_ ;
+
+  typedef typename remove_const< referee_removed_type_ >::type  value_type ;
+
+  typedef typename add_const< value_type >::type  const_value_type ;
+
+  //----------------------------------------
+  // One argument already of the selected value type: pass it through.
+
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( value_type & arg ) { return arg ; }
+
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const_value_type & arg ) { return arg ; }
+
+  // One argument of some other type: the returned reference is formed by
+  // dereferencing a null pointer, so the result must not be accessed.
+  template< class Other >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( const Other & )
+    {
+      value_type * null_ptr(0);
+      return *null_ptr ;
+    }
+
+  //----------------------------------------
+  // Two arguments: the first is ignored, the second is returned.
+
+  template< class Other >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( const Other & , value_type & arg ) { return arg ; }
+
+  template< class Other >
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const Other & , const_value_type & arg ) { return arg ; }
+};
+
+// Specialization of if_c for a true condition: 'type' is TrueType.
+//
+//   value            : true
+//   type             : TrueType
+//   value_type       : 'type' with reference and const removed
+//   const_value_type : const-qualified value_type
+//
+// The two-argument select() overloads return their FIRST argument.
+template < typename TrueType , typename FalseType >
+struct if_c< true , TrueType , FalseType >
+{
+  enum { value = true };
+
+  typedef TrueType type ;
+
+  typedef typename remove_const<
+          typename remove_reference< type >::type >::type  value_type ;
+
+  typedef typename add_const< value_type >::type  const_value_type ;
+
+  //----------------------------------------
+  // One argument already of the selected value type: pass it through.
+
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( value_type & arg ) { return arg ; }
+
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const_value_type & arg ) { return arg ; }
+
+  // One argument of some other type: the returned reference is formed by
+  // dereferencing a null pointer, so the result must not be accessed.
+  template< class Other >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( const Other & )
+    {
+      value_type * null_ptr(0);
+      return *null_ptr ;
+    }
+
+  //----------------------------------------
+  // Two arguments: the first is returned, the second is ignored.
+
+  template< class Other >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( value_type & arg , const Other & ) { return arg ; }
+
+  template< class Other >
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const_value_type & arg , const Other & ) { return arg ; }
+};
+
+// Specializations for the case where the selected alternative is 'void':
+// only 'value', 'type' and 'value_type' are provided, and no select().
+
+// Condition is false and the false alternative is void.
+template< typename TrueType >
+struct if_c< false , TrueType , void >
+{
+  typedef void type ;
+  typedef void value_type ;
+
+  enum { value = false };
+};
+
+// Condition is true and the true alternative is void.
+template< typename FalseType >
+struct if_c< true , void , FalseType >
+{
+  typedef void type ;
+  typedef void value_type ;
+
+  enum { value = true };
+};
+
+// Same as if_c, but the condition is supplied as a type whose nested
+// 'value' is used as the boolean.
+template < typename Cond , typename TrueType , typename FalseType >
+struct if_
+  : public if_c< Cond::value , TrueType , FalseType >
+{};
+
+//----------------------------------------------------------------------------
+
+// 'value' is nonzero when N is nonzero and has exactly one bit set.
+template < size_t N >
+struct is_power_of_two
+{
+  enum type { value = ( N != 0 ) && ( ( N & ( N - 1 ) ) == 0 ) };
+};
+
+// power_of_two<N>::value is the exponent p with N == 2^p.
+// Only the 'true' case is defined, so using it with an N that is not a
+// power of two refers to an incomplete type.
+template < size_t N , bool OK = is_power_of_two<N>::value >
+struct power_of_two ;
+
+// Recursive step: one more than the exponent of N/2.
+template < size_t N >
+struct power_of_two< N , true >
+{
+  enum type { value = 1 + power_of_two< ( N >> 1 ) , true >::value };
+};
+
+// Recursion ends at 2 ...
+template <>
+struct power_of_two< 2 , true >
+{
+  enum type { value = 1 };
+};
+
+// ... and at 1.
+template <>
+struct power_of_two< 1 , true >
+{
+  enum type { value = 0 };
+};
+
+/** \brief  Runtime exponent of a power of two.
+ *
+ *  Returns p such that N == 2^p when N is nonzero with a single bit set;
+ *  returns ~0u for every other N (including zero).
+ */
+static KOKKOS_FORCEINLINE_FUNCTION
+unsigned power_of_two_if_valid( const unsigned N )
+{
+  // Reject zero and anything with more than one bit set.
+  if ( ! N || ( N & ( N - 1 ) ) ) { return ~0u ; }
+
+  // Locate the single set bit.  The two 'ffs' results are reduced by
+  // one; the '_bit_scan_forward' result is used as returned.
+#if defined( __CUDA_ARCH__ )
+  return __ffs(N) - 1 ;
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_ffs(N) - 1 ;
+#elif defined( __INTEL_COMPILER )
+  return _bit_scan_forward(N);
+#else
+  // Portable fallback: count shifts until the probe bit meets the set bit.
+  unsigned shift = 0 ;
+  for ( unsigned probe = 1 ; ! ( N & probe ) ; probe <<= 1 ) { ++shift ; }
+  return shift ;
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+// A value that is a compile-time constant when the template argument is
+// nonzero and a runtime data member when the template argument is zero.
+
+// Nonzero case: 'value' is an enumerator and the constructor argument
+// is ignored.
+template< typename T , T v , bool NonZero = ( v != T(0) ) >
+struct integral_nonzero_constant
+{
+  typedef T value_type ;
+  typedef integral_nonzero_constant< T , v > type ;
+
+  // Declaration of 'static const' causes an unresolved linker symbol in debug
+  // static const T value = v ;
+  enum { value = T(v) };
+
+  KOKKOS_INLINE_FUNCTION
+  integral_nonzero_constant( const T & ) {}
+};
+
+// Zero case: 'value' is a const member initialized from the
+// constructor argument.
+template< typename T , T zero >
+struct integral_nonzero_constant< T , zero , false >
+{
+  typedef T value_type ;
+  typedef integral_nonzero_constant< T , 0 > type ;
+
+  const T value ;
+
+  KOKKOS_INLINE_FUNCTION
+  integral_nonzero_constant( const T & arg ) : value( arg ) {}
+};
+
+//----------------------------------------------------------------------------
+
+// 'value' is true only for the eight fixed-width <stdint.h> integer
+// types listed below; every other type derives from false_.
+template < class Type > struct is_integral : public false_ {};
+
+// 8-bit
+template <> struct is_integral< int8_t  > : public true_ {};
+template <> struct is_integral< uint8_t > : public true_ {};
+
+// 16-bit
+template <> struct is_integral< int16_t  > : public true_ {};
+template <> struct is_integral< uint16_t > : public true_ {};
+
+// 32-bit
+template <> struct is_integral< int32_t  > : public true_ {};
+template <> struct is_integral< uint32_t > : public true_ {};
+
+// 64-bit
+template <> struct is_integral< int64_t  > : public true_ {};
+template <> struct is_integral< uint64_t > : public true_ {};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOSTRAITS_HPP */
+
--- a/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp
--- a/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp
--- a/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp
@ -0,0 +1,317 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEWSUPPORT_HPP
+#define KOKKOS_VIEWSUPPORT_HPP
+
+#include <impl/Kokkos_Shape.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Compile-time test of whether a view described by ViewRHS
+ *          may be assigned to a view described by ViewLHS.
+ *
+ *  'value' is nonzero only when both 'assignable_value' and
+ *  'assignable_shape' are nonzero.
+ */
+template< class ViewLHS , class ViewRHS >
+struct ViewAssignable
+{
+  // All three requirements must hold:
+  //   1. The LHS value type is the RHS value type or the RHS const value type.
+  //   2. Both views have the same memory space.
+  //   3. A managed LHS is not assigned from an unmanaged RHS.
+  enum { assignable_value =
+    ( is_same< typename ViewLHS::value_type , typename ViewRHS::value_type >::value
+      ||
+      is_same< typename ViewLHS::value_type , typename ViewRHS::const_value_type >::value )
+    &&
+    is_same< typename ViewLHS::memory_space , typename ViewRHS::memory_space >::value
+    &&
+    ( ! ViewLHS::is_managed || ViewRHS::is_managed )
+  };
+
+  // Any one of the four alternatives is sufficient:
+  enum { assignable_shape =
+    // (a) same layout and compatible shape
+    ( ShapeCompatible< typename ViewLHS::shape_type , typename ViewRHS::shape_type >::value
+      &&
+      is_same< typename ViewLHS::array_layout , typename ViewRHS::array_layout >::value )
+    ||
+    // (b) same layout, same rank, and every LHS dimension is dynamic
+    ( is_same< typename ViewLHS::array_layout , typename ViewRHS::array_layout >::value
+      &&
+      int(ViewLHS::rank) == int(ViewRHS::rank)
+      &&
+      int(ViewLHS::rank) == int(ViewLHS::rank_dynamic) )
+    ||
+    // (c) both rank zero, regardless of shape and layout
+    ( int(ViewLHS::rank) == 0 && int(ViewRHS::rank) == 0 )
+    ||
+    // (d) both rank one with a dynamic LHS dimension, regardless of shape and layout
+    ( int(ViewLHS::rank) == 1 && int(ViewRHS::rank) == 1 &&
+      int(ViewLHS::rank_dynamic) == 1 )
+    };
+
+  enum { value = assignable_value && assignable_shape };
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Reference-count tracking hook for a view.
+ *
+ *  This primary template is selected whenever the 'is_managed'
+ *  specialization does not apply.  It holds no state: increment and
+ *  decrement do nothing, every assignment is ignored, and conversion
+ *  to bool always yields false.
+ */
+template< class ViewTraits , class Enable = void >
+struct ViewTracking {
+
+  // Tracking is never reported as enabled.
+  KOKKOS_INLINE_FUNCTION
+  operator bool() const { return false ; }
+
+  // Assignment from a flag, from the same type, or from the tracking
+  // object of other traits: nothing is stored.
+  KOKKOS_INLINE_FUNCTION
+  ViewTracking & operator = ( const bool ) { return *this ; }
+
+  KOKKOS_INLINE_FUNCTION
+  ViewTracking & operator = ( const ViewTracking & ) { return *this ; }
+
+  template< class OtherTraits >
+  KOKKOS_INLINE_FUNCTION
+  ViewTracking & operator = ( const ViewTracking<OtherTraits> & ) { return *this ; }
+
+  // Count updates are no-ops.
+  KOKKOS_INLINE_FUNCTION void increment( const void * ) const {}
+  KOKKOS_INLINE_FUNCTION void decrement( const void * ) const {}
+};
+
+/** \brief  Tracking for managed views (ViewTraits::is_managed is true).
+ *
+ *  Holds a flag, initially true, that enables or disables forwarding of
+ *  increment / decrement to ViewTraits::memory_space.  Forwarding only
+ *  happens when Kokkos::Impl::ActiveExecutionMemorySpace is
+ *  Kokkos::HostSpace; otherwise increment / decrement are no-ops.
+ */
+template< class ViewTraits >
+struct ViewTracking< ViewTraits , typename enable_if< ViewTraits::is_managed >::type >
+{
+private:
+
+  enum { is_host_space = is_same< Kokkos::HostSpace , Kokkos::Impl::ActiveExecutionMemorySpace >::value };
+
+  bool m_flag ;
+
+  // Never passed as 'T'; comparing against it only serves to make the
+  // enable_if conditions below depend on the template parameter.
+  struct NoType {};
+
+public:
+
+  typedef typename ViewTraits::memory_space memory_space ;
+
+  // Host execution: forward to the memory space when tracking is enabled.
+  template< class T >
+  KOKKOS_INLINE_FUNCTION
+  void increment( const T * ptr
+                , typename enable_if<( ! is_same<T,NoType>::value && is_host_space )>::type * = 0 ) const
+    { if ( m_flag ) memory_space::increment( ptr ); }
+
+  // Non-host execution: nothing to do.
+  template< class T >
+  KOKKOS_INLINE_FUNCTION
+  void increment( const T *
+                , typename enable_if<( ! is_same<T,NoType>::value && ! is_host_space )>::type * = 0 ) const
+    {}
+
+  // Host execution: forward to the memory space when tracking is enabled.
+  template< class T >
+  KOKKOS_INLINE_FUNCTION
+  void decrement( const T * ptr
+                , typename enable_if<( ! is_same<T,NoType>::value && is_host_space )>::type * = 0 ) const
+    { if ( m_flag ) memory_space::decrement( ptr ); }
+
+  // Non-host execution: nothing to do.
+  template< class T >
+  KOKKOS_INLINE_FUNCTION
+  void decrement( const T *
+                , typename enable_if<( ! is_same<T,NoType>::value && ! is_host_space )>::type * = 0 ) const
+    {}
+
+  // Tracking starts enabled.
+  KOKKOS_INLINE_FUNCTION
+  ViewTracking() : m_flag( true ) {}
+
+  // Copy assignment.  This must be a non-template member: a template
+  // parameter that appears nowhere in the signature cannot be deduced,
+  // so a templated version is never selected by 'lhs = rhs'.
+  KOKKOS_INLINE_FUNCTION
+  ViewTracking & operator = ( const ViewTracking & rhs ) { m_flag = rhs.m_flag ; return *this ; }
+
+  // Assignment from the tracking object of other view traits: adopt
+  // whatever its conversion to bool reports.
+  template< class T >
+  KOKKOS_INLINE_FUNCTION
+  ViewTracking & operator = ( const ViewTracking<T> & rhs ) { m_flag = rhs.operator bool(); return *this ; }
+
+  // Enable or disable tracking explicitly.
+  KOKKOS_INLINE_FUNCTION
+  ViewTracking & operator = ( const bool rhs ) { m_flag = rhs ; return *this ; }
+
+  // True while tracking is enabled.
+  KOKKOS_INLINE_FUNCTION
+  operator bool() const { return m_flag ; }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Functor that copies the index range common to both views, element by
+// element.  Each n<k> is the smaller of the two views' dimension_<k>()
+// extents.  Constructing the object launches the copy through
+// parallel_for over the first index.
+template< class OutputView , class InputView  , unsigned Rank = OutputView::Rank >
+struct ViewRemap
+{
+  typedef typename OutputView::device_type device_type ;
+  typedef typename device_type::size_type  size_type ;
+
+  const OutputView output ;
+  const InputView  input ;
+
+  // Common extent of each of the eight index positions.
+  const size_type n0 ;
+  const size_type n1 ;
+  const size_type n2 ;
+  const size_type n3 ;
+  const size_type n4 ;
+  const size_type n5 ;
+  const size_type n6 ;
+  const size_type n7 ;
+
+  ViewRemap( const OutputView & arg_out , const InputView & arg_in )
+    : output( arg_out )
+    , input( arg_in )
+    , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) )
+    , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) )
+    , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) )
+    , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) )
+    , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) )
+    , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) )
+    , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) )
+    , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) )
+    {
+      // One work item per value of the first index.
+      parallel_for( n0 , *this );
+    }
+
+  // Work item: for a fixed first index, sweep the remaining seven.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i0 ) const
+  {
+    for ( size_type j1 = 0 ; j1 < n1 ; ++j1 ) {
+      for ( size_type j2 = 0 ; j2 < n2 ; ++j2 ) {
+        for ( size_type j3 = 0 ; j3 < n3 ; ++j3 ) {
+          for ( size_type j4 = 0 ; j4 < n4 ; ++j4 ) {
+            for ( size_type j5 = 0 ; j5 < n5 ; ++j5 ) {
+              for ( size_type j6 = 0 ; j6 < n6 ; ++j6 ) {
+                for ( size_type j7 = 0 ; j7 < n7 ; ++j7 ) {
+                  output.at(i0,j1,j2,j3,j4,j5,j6,j7) = input.at(i0,j1,j2,j3,j4,j5,j6,j7);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+// Rank-0 case: there is a single value, so it is transferred with one
+// DeepCopy of sizeof(value_type) bytes between the two memory spaces.
+template< class OutputView , class InputView  >
+struct ViewRemap< OutputView ,  InputView , 0 >
+{
+  typedef typename OutputView::value_type   value_type ;
+  typedef typename OutputView::memory_space dst_space ;
+  typedef typename InputView ::memory_space src_space ;
+
+  ViewRemap( const OutputView & arg_out , const InputView & arg_in )
+  {
+    DeepCopy< dst_space , src_space >(
+      arg_out.ptr_on_device() ,
+      arg_in.ptr_on_device() ,
+      sizeof(value_type) );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+// Functor that stores one value into every element of a view.
+// Constructing the object launches the fill through parallel_for over
+// the first index and then calls device_type::fence().
+template< class OutputView , unsigned Rank = OutputView::Rank >
+struct ViewFill
+{
+  typedef typename OutputView::device_type       device_type ;
+  typedef typename OutputView::const_value_type  const_value_type ;
+  typedef typename device_type::size_type        size_type ;
+
+  const OutputView output ;
+  const_value_type input ;   // held by value
+
+  ViewFill( const OutputView & arg_out , const_value_type & arg_in )
+    : output( arg_out )
+    , input( arg_in )
+    {
+      // One work item per value of the first index.
+      parallel_for( output.dimension_0() , *this );
+      device_type::fence();
+    }
+
+  // Work item: for a fixed first index, sweep the remaining seven.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i0 ) const
+  {
+    for ( size_type j1 = 0 ; j1 < output.dimension_1() ; ++j1 ) {
+      for ( size_type j2 = 0 ; j2 < output.dimension_2() ; ++j2 ) {
+        for ( size_type j3 = 0 ; j3 < output.dimension_3() ; ++j3 ) {
+          for ( size_type j4 = 0 ; j4 < output.dimension_4() ; ++j4 ) {
+            for ( size_type j5 = 0 ; j5 < output.dimension_5() ; ++j5 ) {
+              for ( size_type j6 = 0 ; j6 < output.dimension_6() ; ++j6 ) {
+                for ( size_type j7 = 0 ; j7 < output.dimension_7() ; ++j7 ) {
+                  output.at(i0,j1,j2,j3,j4,j5,j6,j7) = input ;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+// Rank-0 case: the single value is written with one DeepCopy of
+// sizeof(const_value_type) bytes, with the view's memory space given
+// as both the destination and the source space.
+template< class OutputView >
+struct ViewFill< OutputView , 0 >
+{
+  typedef typename OutputView::device_type       device_type ;
+  typedef typename OutputView::const_value_type  const_value_type ;
+  typedef typename OutputView::memory_space      dst_space ;
+
+  ViewFill( const OutputView & arg_out , const_value_type & arg_in )
+  {
+    DeepCopy< dst_space , dst_space >(
+      arg_out.ptr_on_device() ,
+      & arg_in ,
+      sizeof(const_value_type) );
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_VIEWSUPPORT_HPP */
+
+
--- a/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp
@ -0,0 +1,409 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEWTILELEFT_HPP
+#define KOKKOS_VIEWTILELEFT_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Tag types naming the two view specializations for LayoutTileLeft.
+struct ViewTileLeftFast ;
+struct ViewTileLeftSlow ;
+
+// Choose the specialization tag for a LayoutTileLeft view from the
+// layout's boolean argument: true -> ViewTileLeftFast,
+// false -> ViewTileLeftSlow.
+template< class ValueType ,
+          unsigned TileN0 , unsigned TileN1 , bool Flag ,
+          class MemorySpace , class MemoryTraits >
+struct ViewSpecialize< ValueType , void ,
+                       LayoutTileLeft< TileN0 , TileN1 , Flag > ,
+                       MemorySpace , MemoryTraits >
+{
+  typedef typename if_c< Flag , ViewTileLeftFast , ViewTileLeftSlow >::type type ;
+};
+
+//----------------------------------------------------------------------------
+
+// Allocating constructors for ViewTileLeftFast views: one creates a
+// labeled allocation from explicit dimensions, the other creates a
+// "mirror" allocation with the shape of an existing view.
+template<>
+struct ViewAssignment< ViewTileLeftFast , void , void >
+{
+private:
+
+  // Release dst's current tracked pointer, allocate dst.capacity()
+  // elements from dst's memory space under 'label', then fill the new
+  // allocation with a value-initialized element via ViewFill.
+  // dst's shape members must already be set, since capacity() is queried here.
+  template< class DT , class DL , class DD , class DM >
+  inline
+  void allocate( View<DT,DL,DD,DM,ViewTileLeftFast> & dst , const std::string label )
+  {
+    typedef View<DT,DL,DD,DM,ViewTileLeftFast>  DstViewType ;
+    typedef typename DstViewType::memory_space  memory_space ;
+
+    // Drop the reference to whatever dst pointed to before.
+    dst.m_tracking.decrement( dst.m_ptr_on_device );
+
+    dst.m_ptr_on_device = (typename DstViewType::value_type *)
+      memory_space::allocate( label ,
+                              typeid(typename DstViewType::value_type) ,
+                              sizeof(typename DstViewType::value_type) ,
+                              dst.capacity() );
+
+    // Constructing the functor performs the fill.
+    ViewFill< DstViewType > init( dst , typename DstViewType::value_type() );
+  }
+
+public:
+
+  // Allocate a managed view of n0 x n1 elements; the remaining size
+  // arguments are accepted and ignored.  The enable_if on 'label'
+  // removes this constructor for unmanaged view traits.
+  template< class DT , class DL , class DD , class DM >
+  inline
+  ViewAssignment( View<DT,DL,DD,DM,ViewTileLeftFast> & dst ,
+                  const typename enable_if< ViewTraits<DT,DL,DD,DM>::is_managed , std::string >::type & label ,
+                  const size_t n0 ,
+                  const size_t n1 ,
+                  const size_t = 0 ,
+                  const size_t = 0 ,
+                  const size_t = 0 ,
+                  const size_t = 0 ,
+                  const size_t = 0 ,
+                  const size_t = 0 )
+  {
+    typedef View<DT,DL,DD,DM,ViewTileLeftFast>  DstViewType ;
+
+    dst.m_shape.N0 = n0 ;
+    dst.m_shape.N1 = n1 ;
+    // Number of tiles along dimension 0, rounded up.
+    dst.m_tile_N0  = ( n0 + DstViewType::MASK_0 ) >> DstViewType::SHIFT_0 ;
+
+    allocate( dst , label );
+  }
+
+
+  // Allocate dst with the shape and tile count of src.  Enabled only
+  // when dst's type is exactly src's HostMirror type.  No data is
+  // copied from src; the new allocation is labeled "mirror".
+  template< class DT , class DL , class DD , class DM ,
+            class ST , class SL , class SD , class SM >
+  ViewAssignment(       View<DT,DL,DD,DM,ViewTileLeftFast> & dst ,
+                  const View<ST,SL,SD,SM,ViewTileLeftFast> & src ,
+                  typename enable_if<
+                    is_same< View<DT,DL,DD,DM,ViewTileLeftFast> ,
+                             typename View<ST,SL,SD,SM,ViewTileLeftFast>::HostMirror >::value
+                  >::type * = 0 )
+  {
+    dst.m_shape   = src.m_shape ;
+    dst.m_tile_N0 = src.m_tile_N0 ;
+    allocate( dst , "mirror" );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+// Shallow assignment and deep copy between two ViewTileLeftFast views.
+template<>
+struct ViewAssignment< ViewTileLeftFast , ViewTileLeftFast, void >
+{
+  /** \brief Assign compatible views */
+
+  // Shallow assignment: dst takes over src's shape, tile count, pointer
+  // and tracking state.  Enabled only when ViewAssignable reports the
+  // two sets of view traits as compatible.
+  template< class DT , class DL , class DD , class DM ,
+            class ST , class SL , class SD , class SM >
+  KOKKOS_INLINE_FUNCTION
+  ViewAssignment(       View<DT,DL,DD,DM,ViewTileLeftFast> & dst ,
+                  const View<ST,SL,SD,SM,ViewTileLeftFast> & src ,
+                  const typename enable_if<(
+                    ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::value
+                  )>::type * = 0 )
+  {
+    typedef View<DT,DL,DD,DM,ViewTileLeftFast> DstViewType ;
+    typedef typename DstViewType::shape_type    shape_type ;
+    //typedef typename DstViewType::memory_space  memory_space ; // unused
+    //typedef typename DstViewType::memory_traits memory_traits ; // unused
+
+    // Release dst's previous pointer before it is overwritten.
+    dst.m_tracking.decrement( dst.m_ptr_on_device );
+
+    shape_type::assign( dst.m_shape, src.m_shape.N0 , src.m_shape.N1 );
+
+    dst.m_tracking       = src.m_tracking ;
+    dst.m_tile_N0       = src.m_tile_N0 ;
+    dst.m_ptr_on_device = src.m_ptr_on_device ;
+
+    // Register dst as an additional holder of the shared pointer.
+    dst.m_tracking.increment( dst.m_ptr_on_device );
+  }
+
+  //------------------------------------
+  /** \brief  Deep copy data from compatible value type, layout, rank, and specialization.
+   *          Check the dimensions and allocation lengths at runtime.
+   */
+  // Does nothing when both views already refer to the same pointer.
+  // Otherwise asserts equal shapes and equal byte counts, then copies
+  // n_dst bytes with DeepCopy between the two memory spaces.
+  template< class DT , class DL , class DD , class DM ,
+            class ST , class SL , class SD , class SM >
+  inline static
+  void deep_copy( const View<DT,DL,DD,DM,Impl::ViewTileLeftFast> & dst ,
+                  const View<ST,SL,SD,SM,Impl::ViewTileLeftFast> & src ,
+                  const typename Impl::enable_if<(
+                    Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::value_type ,
+                                   typename ViewTraits<ST,SL,SD,SM>::non_const_value_type >::value
+                    &&
+                    Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout ,
+                                   typename ViewTraits<ST,SL,SD,SM>::array_layout >::value
+                    &&
+                    ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == unsigned(ViewTraits<ST,SL,SD,SM>::rank) )
+                  )>::type * = 0 )
+  {
+    typedef ViewTraits<DT,DL,DD,DM> dst_traits ;
+    typedef ViewTraits<ST,SL,SD,SM> src_traits ;
+
+    if ( dst.m_ptr_on_device != src.m_ptr_on_device ) {
+
+      Impl::assert_shapes_are_equal( dst.m_shape , src.m_shape );
+
+      // Allocation sizes in bytes: element size times capacity().
+      const size_t n_dst = sizeof(typename dst_traits::value_type) * dst.capacity();
+      const size_t n_src = sizeof(typename src_traits::value_type) * src.capacity();
+
+      Impl::assert_counts_are_equal( n_dst , n_src );
+
+      DeepCopy< typename dst_traits::memory_space ,
+                typename src_traits::memory_space >( dst.m_ptr_on_device , src.m_ptr_on_device , n_dst );
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+// Assignment of a ViewDefault view from one tile of a ViewTileLeftFast view.
+template<>
+struct ViewAssignment< ViewDefault , ViewTileLeftFast, void >
+{
+  /** \brief Extracting a single tile from a tiled view */
+
+  // Points dst at tile (i0,i1) of src and shares src's tracking state.
+  // Enabled only when dst's type is exactly src's tile_type.
+  // Only dst's pointer and tracking are written here; dst's shape is
+  // not modified.
+  template< class DT , class DL , class DD , class DM ,
+            class ST , class SL , class SD , class SM >
+  KOKKOS_INLINE_FUNCTION
+  ViewAssignment(       View<DT,DL,DD,DM,ViewDefault> & dst ,
+                  const View<ST,SL,SD,SM,ViewTileLeftFast> & src ,
+                  const unsigned i0 ,
+                  const typename enable_if<(
+                    is_same< View<DT,DL,DD,DM,ViewDefault> ,
+                             typename View<ST,SL,SD,SM,ViewTileLeftFast>::tile_type >::value
+                  ), unsigned >::type i1 )
+  {
+    //typedef View<DT,DL,DD,DM,ViewDefault> DstViewType ; // unused
+    //typedef typename DstViewType::shape_type    shape_type ; // unused
+    //typedef typename DstViewType::memory_space  memory_space ; // unused
+    //typedef typename DstViewType::memory_traits memory_traits ; // unused
+
+    // Release dst's previous pointer before it is overwritten.
+    dst.m_tracking.decrement( dst.m_ptr_on_device );
+
+    // Tile extents come from the source layout; the shifts are their
+    // base-2 exponents and MASK_0 is used to round up below.
+    enum { N0 = SL::N0 };
+    enum { N1 = SL::N1 };
+    enum { SHIFT_0 = power_of_two<N0>::value };
+    enum { MASK_0 = N0 - 1 };
+    enum { SHIFT_1 = power_of_two<N1>::value };
+
+    // Number of tiles along dimension 0, rounded up.
+    const unsigned NT0 = ( src.dimension_0() + MASK_0 ) >> SHIFT_0 ;
+
+    dst.m_tracking      = src.m_tracking ;
+    // Tile index (i0 + i1 * NT0) times the N0*N1 elements of one tile.
+    dst.m_ptr_on_device = src.m_ptr_on_device + (( i0 + i1 * NT0 ) << ( SHIFT_0 + SHIFT_1 ));
+
+    // Register dst as an additional holder of the shared allocation.
+    dst.m_tracking.increment( dst.m_ptr_on_device );
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief View specialization for a rank-2 array stored as tiles.
+ *
+ *  The tile dimensions ( layout::N0 x layout::N1 ) are compile-time values
+ *  that are passed through Impl::power_of_two, so that an index is split
+ *  into its tile coordinate and its offset within the tile by shifts
+ *  and masks.  Tiles are ordered with the dimension 0 tile index varying
+ *  fastest, and members within a tile are ordered with the dimension 0
+ *  offset varying fastest.
+ */
+template< class DataType , class Arg1Type , class Arg2Type , class Arg3Type >
+class View< DataType , Arg1Type , Arg2Type , Arg3Type , Impl::ViewTileLeftFast >
+  : public ViewTraits< DataType , Arg1Type , Arg2Type , Arg3Type >
+{
+private:
+  template< class , class , class > friend struct Impl::ViewAssignment ;
+
+  typedef ViewTraits< DataType , Arg1Type , Arg2Type , Arg3Type > traits ;
+
+  typedef Impl::ViewAssignment<Impl::ViewTileLeftFast> alloc ;
+
+  typedef Impl::ViewAssignment<Impl::ViewTileLeftFast,
+                               Impl::ViewTileLeftFast> assign ;
+
+  typename traits::value_type * m_ptr_on_device ;
+  typename traits::shape_type   m_shape ;
+  unsigned                      m_tile_N0 ;  // Number of tiles spanning dimension 0
+  Impl::ViewTracking< traits >  m_tracking ;
+
+  typedef typename traits::array_layout layout ;
+
+  enum { SHIFT_0 = Impl::power_of_two<layout::N0>::value };
+  enum { SHIFT_1 = Impl::power_of_two<layout::N1>::value };
+  enum { MASK_0  = layout::N0 - 1 };
+  enum { MASK_1  = layout::N1 - 1 };
+
+public:
+
+  typedef Impl::ViewTileLeftFast specialize ;
+
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits > const_type ;
+
+  typedef View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type::host_mirror_device_type ,
+                void > HostMirror ;
+
+  enum { Rank = 2 };
+
+  KOKKOS_INLINE_FUNCTION typename traits::shape_type shape() const { return m_shape ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_shape.N0 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_shape.N1 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return 1 ; }
+
+  // Every constructor zero-initializes 'm_tile_N0' so that capacity() and
+  // tiles_in_dimension_0() never read an indeterminate value, even for a
+  // view that has not yet been allocated or assigned.
+
+  KOKKOS_INLINE_FUNCTION
+  View() : m_ptr_on_device(0) , m_tile_N0(0) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~View() { m_tracking.decrement( m_ptr_on_device ); }
+
+  KOKKOS_INLINE_FUNCTION
+  View( const View & rhs ) : m_ptr_on_device(0) , m_tile_N0(0) { (void)assign( *this , rhs ); }
+
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View & rhs ) { (void)assign( *this , rhs ); return *this ; }
+
+  //------------------------------------
+  // Array allocator and member access operator:
+
+  View( const std::string & label , const size_t n0 , const size_t n1 )
+    : m_ptr_on_device(0) , m_tile_N0(0) { (void)alloc( *this , label , n0 , n1 ); }
+
+  /** \brief Access the member at global index (i0,i1). */
+  template< typename iType0 , typename iType1 >
+  KOKKOS_INLINE_FUNCTION
+  typename traits::value_type & operator()( const iType0 & i0 , const iType1 & i1 ) const
+    {
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
+      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_shape, i0,i1 );
+
+      // Use care to insert necessary parentheses as the
+      // shift operators have lower precedence than the arithmetic operators.
+
+      return m_ptr_on_device[
+        // ( ( Tile offset                               ) *  ( Tile size       ) )
+         + ( ( (i0>>SHIFT_0) + m_tile_N0 * (i1>>SHIFT_1) ) << (SHIFT_0 + SHIFT_1) )
+        // ( Offset within tile                       )
+         + ( (i0 & MASK_0) + ((i1 & MASK_1)<<SHIFT_0) ) ] ;
+    }
+
+  //------------------------------------
+  // Accept but ignore extra indices, they should be zero.
+
+  template< typename iType0 , typename iType1 >
+  KOKKOS_INLINE_FUNCTION
+  typename traits::value_type &
+    at( const iType0 & i0 , const iType1 & i1 , const int , const int ,
+        const int , const int , const int , const int ) const
+    {
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
+      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_shape, i0,i1 );
+
+      // Use care to insert necessary parentheses as the
+      // shift operators have lower precedence than the arithmetic operators.
+
+      return m_ptr_on_device[
+        // ( ( Tile offset                               ) *  ( Tile size       ) )
+         + ( ( (i0>>SHIFT_0) + m_tile_N0 * (i1>>SHIFT_1) ) << (SHIFT_0 + SHIFT_1) )
+        // ( Offset within tile                       )
+         + ( (i0 & MASK_0) + ((i1 & MASK_1)<<SHIFT_0) ) ] ;
+    }
+
+  //------------------------------------
+  // Tile specialization specific declarations and functions:
+
+  // Unmanaged view type used to reference a single tile of this view.
+  typedef View< typename traits::value_type [ layout::N0 ][ layout::N1 ] ,
+                LayoutLeft ,
+                typename traits::device_type ,
+                MemoryUnmanaged >
+    tile_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  typename traits::value_type * ptr_on_device() const { return m_ptr_on_device ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t tiles_in_dimension_0() const { return m_tile_N0 ; }
+
+  // Number of tiles spanning dimension 1, rounded up.
+  KOKKOS_INLINE_FUNCTION
+  size_t tiles_in_dimension_1() const { return ( m_shape.N1 + MASK_1 ) >> SHIFT_1 ; }
+
+
+  // Global index -> index of the tile containing it.
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  size_t global_to_tile_index_0( const iType & global_i0 ) const
+    { return global_i0 >> SHIFT_0 ; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  size_t global_to_tile_index_1( const iType & global_i1 ) const
+    { return global_i1 >> SHIFT_1 ; }
+
+
+  // Global index -> offset within the tile containing it.
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  size_t global_to_local_tile_index_0( const iType & global_i0 ) const
+    { return global_i0 & MASK_0 ; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  size_t global_to_local_tile_index_1( const iType & global_i1 ) const
+    { return global_i1 & MASK_1 ; }
+
+
+  //------------------------------------
+
+  /** \brief Number of members spanned by all tiles, including the padding
+   *         of partially filled tiles.
+   */
+  KOKKOS_INLINE_FUNCTION
+  typename traits::size_type capacity() const
+  {
+    return ( m_tile_N0 * ( ( m_shape.N1 + MASK_1 ) >> SHIFT_1 ) ) << ( SHIFT_0 + SHIFT_1 );
+  }
+};
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_VIEWTILELEFT_HPP */
+
--- a/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp
@ -0,0 +1,242 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_VOLATILE_LOAD )
+#define KOKKOS_VOLATILE_LOAD
+
+#if defined( __GNUC__ ) /* GNU C   */ || \
+    defined( __GNUG__ ) /* GNU C++ */ || \
+    defined( __clang__ )
+
+#define KOKKOS_MAY_ALIAS __attribute__((__may_alias__))
+
+#else
+
+#define KOKKOS_MAY_ALIAS
+
+#endif
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+/** \brief Copy an object out of volatile storage.
+ *
+ *  The object is read in the widest available chunks: as many 64-bit words
+ *  as fit, followed by at most one 32-bit, one 16-bit and one 8-bit
+ *  remainder.
+ *
+ *  \param src_ptr  address of the volatile object to read
+ *  \return a non-volatile copy of the object
+ */
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+T volatile_load(T const volatile * const src_ptr)
+{
+  typedef uint64_t KOKKOS_MAY_ALIAS word64_type;
+  typedef uint32_t KOKKOS_MAY_ALIAS word32_type;
+  typedef uint16_t KOKKOS_MAY_ALIAS word16_type;
+  typedef uint8_t  KOKKOS_MAY_ALIAS word8_type;
+
+  // COUNT_n : number of whole n-bit words in T.
+  // TAIL_n  : index, in n-bit words, just past the part already copied
+  //           with wider words.
+  enum {
+    COUNT_8  = sizeof(T),
+    COUNT_16 = COUNT_8 / 2,
+    COUNT_32 = COUNT_8 / 4,
+    COUNT_64 = COUNT_8 / 8,
+    TAIL_32  = COUNT_64 * 2,
+    TAIL_16  = COUNT_32 * 2,
+    TAIL_8   = COUNT_16 * 2
+  };
+
+  // The same source address viewed at each word width.
+  union {
+    T           const volatile * const obj;
+    word64_type const volatile * const w64;
+    word32_type const volatile * const w32;
+    word16_type const volatile * const w16;
+    word8_type  const volatile * const w8;
+  } from = {src_ptr};
+
+  T result;
+
+  // The same destination address viewed at each word width.
+  union {
+    T           * const obj;
+    word64_type * const w64;
+    word32_type * const w32;
+    word16_type * const w16;
+    word8_type  * const w8;
+  } to = {&result};
+
+  for (int k = 0; k < COUNT_64; ++k) {
+    to.w64[k] = from.w64[k];
+  }
+
+  if ( TAIL_32 < COUNT_32 ) {
+    to.w32[TAIL_32] = from.w32[TAIL_32];
+  }
+
+  if ( TAIL_16 < COUNT_16 ) {
+    to.w16[TAIL_16] = from.w16[TAIL_16];
+  }
+
+  if ( TAIL_8 < COUNT_8 ) {
+    to.w8[TAIL_8] = from.w8[TAIL_8];
+  }
+
+  return result;
+}
+
+/** \brief Copy an object from volatile storage into volatile storage.
+ *
+ *  The object is copied in the widest available chunks: as many 64-bit
+ *  words as fit, followed by at most one 32-bit, one 16-bit and one 8-bit
+ *  remainder.
+ *
+ *  \param dst_ptr  address of the volatile object to overwrite
+ *  \param src_ptr  address of the volatile object to read
+ */
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * const dst_ptr, T const volatile * const src_ptr)
+{
+  typedef uint64_t KOKKOS_MAY_ALIAS word64_type;
+  typedef uint32_t KOKKOS_MAY_ALIAS word32_type;
+  typedef uint16_t KOKKOS_MAY_ALIAS word16_type;
+  typedef uint8_t  KOKKOS_MAY_ALIAS word8_type;
+
+  // COUNT_n : number of whole n-bit words in T.
+  // TAIL_n  : index, in n-bit words, just past the part already copied
+  //           with wider words.
+  enum {
+    COUNT_8  = sizeof(T),
+    COUNT_16 = COUNT_8 / 2,
+    COUNT_32 = COUNT_8 / 4,
+    COUNT_64 = COUNT_8 / 8,
+    TAIL_32  = COUNT_64 * 2,
+    TAIL_16  = COUNT_32 * 2,
+    TAIL_8   = COUNT_16 * 2
+  };
+
+  // The same source address viewed at each word width.
+  union {
+    T           const volatile * const obj;
+    word64_type const volatile * const w64;
+    word32_type const volatile * const w32;
+    word16_type const volatile * const w16;
+    word8_type  const volatile * const w8;
+  } from = {src_ptr};
+
+  // The same destination address viewed at each word width.
+  union {
+    T           volatile * const obj;
+    word64_type volatile * const w64;
+    word32_type volatile * const w32;
+    word16_type volatile * const w16;
+    word8_type  volatile * const w8;
+  } to = {dst_ptr};
+
+  for (int k = 0; k < COUNT_64; ++k) {
+    to.w64[k] = from.w64[k];
+  }
+
+  if ( TAIL_32 < COUNT_32 ) {
+    to.w32[TAIL_32] = from.w32[TAIL_32];
+  }
+
+  if ( TAIL_16 < COUNT_16 ) {
+    to.w16[TAIL_16] = from.w16[TAIL_16];
+  }
+
+  if ( TAIL_8 < COUNT_8 ) {
+    to.w8[TAIL_8] = from.w8[TAIL_8];
+  }
+}
+
+/** \brief Copy an object from ordinary storage into volatile storage.
+ *
+ *  The object is copied in the widest available chunks: as many 64-bit
+ *  words as fit, followed by at most one 32-bit, one 16-bit and one 8-bit
+ *  remainder.
+ *
+ *  \param dst_ptr  address of the volatile object to overwrite
+ *  \param src_ptr  address of the (non-volatile) object to read
+ */
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * const dst_ptr, T const * const src_ptr)
+{
+  typedef uint64_t KOKKOS_MAY_ALIAS word64_type;
+  typedef uint32_t KOKKOS_MAY_ALIAS word32_type;
+  typedef uint16_t KOKKOS_MAY_ALIAS word16_type;
+  typedef uint8_t  KOKKOS_MAY_ALIAS word8_type;
+
+  // COUNT_n : number of whole n-bit words in T.
+  // TAIL_n  : index, in n-bit words, just past the part already copied
+  //           with wider words.
+  enum {
+    COUNT_8  = sizeof(T),
+    COUNT_16 = COUNT_8 / 2,
+    COUNT_32 = COUNT_8 / 4,
+    COUNT_64 = COUNT_8 / 8,
+    TAIL_32  = COUNT_64 * 2,
+    TAIL_16  = COUNT_32 * 2,
+    TAIL_8   = COUNT_16 * 2
+  };
+
+  // The same source address viewed at each word width.
+  union {
+    T           const * const obj;
+    word64_type const * const w64;
+    word32_type const * const w32;
+    word16_type const * const w16;
+    word8_type  const * const w8;
+  } from = {src_ptr};
+
+  // The same destination address viewed at each word width.
+  union {
+    T           volatile * const obj;
+    word64_type volatile * const w64;
+    word32_type volatile * const w32;
+    word16_type volatile * const w16;
+    word8_type  volatile * const w8;
+  } to = {dst_ptr};
+
+  for (int k = 0; k < COUNT_64; ++k) {
+    to.w64[k] = from.w64[k];
+  }
+
+  if ( TAIL_32 < COUNT_32 ) {
+    to.w32[TAIL_32] = from.w32[TAIL_32];
+  }
+
+  if ( TAIL_16 < COUNT_16 ) {
+    to.w16[TAIL_16] = from.w16[TAIL_16];
+  }
+
+  if ( TAIL_8 < COUNT_8 ) {
+    to.w8[TAIL_8] = from.w8[TAIL_8];
+  }
+}
+
+/** \brief Store a volatile object, given by reference, into volatile storage.
+ *
+ *  Forwards to the pointer-based overload using the address of 'src'.
+ */
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * dst_ptr, T const volatile & src)
+{
+  T const volatile * const src_ptr = &src ;
+  volatile_store(dst_ptr, src_ptr);
+}
+
+/** \brief Store an ordinary object, given by reference, into volatile storage.
+ *
+ *  Forwards to the pointer-based overload using the address of 'src'.
+ */
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * dst_ptr, T const & src)
+{
+  T const * const src_ptr = &src ;
+  volatile_store(dst_ptr, src_ptr);
+}
+
+/** \brief Load the object that 'ptr' points at.
+ *
+ *  When __MIC__ is defined the load goes through volatile_load;
+ *  otherwise it is an ordinary dereference.
+ */
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+T safe_load(T const * const ptr)
+{
+#if defined( __MIC__ )
+  return volatile_load(ptr);
+#else
+  return *ptr;
+#endif
+}
+
+} // namespace Kokkos
+
+#undef KOKKOS_MAY_ALIAS
+
+#endif
+
+
+
--- a/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp
@ -0,0 +1,700 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#define DEBUG_PRINT 0
+
+#include <iostream>
+#include <sstream>
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_hwloc.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace hwloc {
+
+/** \brief Plan the (numa,core) coordinate of every thread in a thread pool.
+ *
+ *  Inputs that are zero on entry are replaced with defaults derived from
+ *  the available hardware; the final values are returned through the
+ *  reference arguments.
+ *
+ *  \param label               prefix for the error message
+ *  \param allow_async         whether the pool may run asynchronously,
+ *                             i.e., without the process' own core
+ *  \param thread_count        [in,out] number of threads, 0 for default
+ *  \param use_numa_count      [in,out] number of NUMA regions, 0 for default
+ *  \param use_cores_per_numa  [in,out] cores per NUMA region, 0 for default
+ *  \param threads_coord       [out] coordinate of each thread;
+ *                             must hold at least 'thread_count' entries
+ *
+ *  \return 0 if asynchronous, 1 if synchronous and the process is included
+ *          as threads_coord[0].
+ *
+ *  Calls Kokkos::Impl::throw_runtime_exception if the request is
+ *  inconsistent with the available hardware.
+ */
+unsigned thread_mapping( const char * const label ,
+                         const bool allow_async ,
+                         unsigned & thread_count ,
+                         unsigned & use_numa_count ,
+                         unsigned & use_cores_per_numa ,
+                         std::pair<unsigned,unsigned> threads_coord[] )
+{
+  const bool     hwloc_avail            = Kokkos::hwloc::available();
+  const unsigned avail_numa_count       = hwloc_avail ? hwloc::get_available_numa_count() : 1 ;
+  const unsigned avail_cores_per_numa   = hwloc_avail ? hwloc::get_available_cores_per_numa() : thread_count ;
+  const unsigned avail_threads_per_core = hwloc_avail ? hwloc::get_available_threads_per_core() : 1 ;
+
+  // (numa,core) coordinate of the process:
+  const std::pair<unsigned,unsigned> proc_coord = Kokkos::hwloc::get_this_thread_coordinate();
+
+  //------------------------------------------------------------------------
+  // Defaults for unspecified inputs:
+
+  if ( ! use_numa_count ) {
+    // Default to use all NUMA regions
+    use_numa_count = ! thread_count ? avail_numa_count : (
+                       thread_count < avail_numa_count ? thread_count : avail_numa_count );
+  }
+
+  if ( ! use_cores_per_numa ) {
+    // Default to use all but one core if asynchronous, all cores if synchronous.
+
+    // 'use_numa_count' is still zero here if no NUMA region is available;
+    // do not divide by it, the input verification below reports the error.
+    const unsigned threads_per_numa = use_numa_count ? thread_count / use_numa_count : 0 ;
+
+    // Only set a core aside for the process if at least one core remains
+    // for the pool; this also prevents unsigned wrap-around when no
+    // cores are reported.
+    const unsigned reserved_cores = ( allow_async && 1 < avail_cores_per_numa ) ? 1 : 0 ;
+
+    use_cores_per_numa = ! threads_per_numa ? avail_cores_per_numa - reserved_cores : (
+                           threads_per_numa < avail_cores_per_numa ? threads_per_numa : avail_cores_per_numa );
+  }
+
+  if ( ! thread_count ) {
+    thread_count = use_numa_count * use_cores_per_numa * avail_threads_per_core ;
+  }
+
+  //------------------------------------------------------------------------
+  // Input verification:
+
+  // Total number of cores in the pool; zero if either factor is zero.
+  const unsigned use_core_count = use_numa_count * use_cores_per_numa ;
+
+  const bool valid_numa      = use_numa_count &&
+                               use_numa_count <= avail_numa_count ;
+  const bool valid_cores     = use_cores_per_numa &&
+                               use_cores_per_numa <= avail_cores_per_numa ;
+  const bool valid_threads   = thread_count &&
+                               thread_count <= use_core_count * avail_threads_per_core ;
+
+  // The remainder operations must not be evaluated with a zero divisor.
+  const bool balanced_numa   = use_numa_count && ! ( thread_count % use_numa_count );
+  const bool balanced_cores  = use_core_count && ! ( thread_count % use_core_count );
+
+  const bool valid_input = valid_numa && valid_cores && valid_threads && balanced_numa && balanced_cores ;
+
+  if ( ! valid_input ) {
+
+    std::ostringstream msg ;
+
+    msg << label << " HWLOC ERROR(s)" ;
+
+    if ( ! valid_threads ) {
+      msg << " : thread_count(" << thread_count
+          << ") exceeds capacity("
+          << use_core_count * avail_threads_per_core
+          << ")" ;
+    }
+    if ( ! valid_numa ) {
+      msg << " : use_numa_count(" << use_numa_count
+          << ") exceeds capacity(" << avail_numa_count << ")" ;
+    }
+    if ( ! valid_cores ) {
+      msg << " : use_cores_per_numa(" << use_cores_per_numa
+          << ") exceeds capacity(" << avail_cores_per_numa << ")" ;
+    }
+    if ( ! balanced_numa ) {
+      msg << " : thread_count(" << thread_count
+          << ") imbalanced among numa(" << use_numa_count << ")" ;
+    }
+    if ( ! balanced_cores ) {
+      msg << " : thread_count(" << thread_count
+          << ") imbalanced among cores(" << use_core_count << ")" ;
+    }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  const unsigned thread_spawn_synchronous =
+    ( allow_async &&
+      1 < thread_count &&
+      ( use_numa_count     < avail_numa_count ||
+        use_cores_per_numa < avail_cores_per_numa ) )
+     ? 0 /* asynchronous */
+     : 1 /* synchronous, threads_coord[0] is process core */ ;
+
+  // Determine binding coordinates for to-be-spawned threads so that
+  // threads may be bound to cores as they are spawned.
+
+  const unsigned threads_per_core = thread_count / use_core_count ;
+
+  if ( thread_spawn_synchronous ) {
+    // Working synchronously and include process core as threads_coord[0].
+    // Swap the NUMA coordinate of the process core with 0
+    // Swap the CORE coordinate of the process core with 0
+    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
+      const unsigned numa_coord = 0 == inuma ? proc_coord.first : ( proc_coord.first == inuma ? 0 : inuma );
+      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
+        const unsigned core_coord = 0 == icore ? proc_coord.second : ( proc_coord.second == icore ? 0 : icore );
+        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
+          threads_coord[i].first  = numa_coord ;
+          threads_coord[i].second = core_coord ;
+        }
+      }
+    }
+  }
+  else if ( use_numa_count < avail_numa_count ) {
+    // Working asynchronously and omit the process' NUMA region from the pool.
+    // Swap the NUMA coordinate of the process core with ( ( avail_numa_count - use_numa_count ) - 1 )
+    const unsigned numa_coord_swap = ( avail_numa_count - use_numa_count ) - 1 ;
+    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
+      const unsigned numa_coord = proc_coord.first == inuma ? numa_coord_swap : inuma ;
+      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
+        const unsigned core_coord = icore ;
+        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
+          threads_coord[i].first  = numa_coord ;
+          threads_coord[i].second = core_coord ;
+        }
+      }
+    }
+  }
+  else if ( use_cores_per_numa < avail_cores_per_numa ) {
+    // Working asynchronously and omit the process' core from the pool.
+    // Swap the CORE coordinate of the process core with ( ( avail_cores_per_numa - use_cores_per_numa ) - 1 )
+    const unsigned core_coord_swap = ( avail_cores_per_numa - use_cores_per_numa ) - 1 ;
+    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
+      const unsigned numa_coord = inuma ;
+      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
+        const unsigned core_coord = proc_coord.second == icore ? core_coord_swap : icore ;
+        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
+          threads_coord[i].first  = numa_coord ;
+          threads_coord[i].second = core_coord ;
+        }
+      }
+    }
+  }
+
+  return thread_spawn_synchronous ;
+}
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_HAVE_HWLOC )
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+/*--------------------------------------------------------------------------*/
+/* Third Party Libraries */
+
+/* Hardware locality library: http://www.open-mpi.org/projects/hwloc/ */
+#include <hwloc.h>
+
+#define  REQUIRED_HWLOC_API_VERSION  0x000010300
+
+#if HWLOC_API_VERSION < REQUIRED_HWLOC_API_VERSION
+#error "Requires  http://www.open-mpi.org/projects/hwloc/  Version 1.3 or greater"
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace hwloc {
+namespace {
+
+// Write the indices of the set bits of 'bitmap' to 's'
+// as a brace-enclosed, space-separated list.
+inline
+void print_bitmap( std::ostream & s , const hwloc_const_bitmap_t bitmap )
+{
+  s << "{" ;
+
+  int bit = hwloc_bitmap_first( bitmap );
+
+  // hwloc reports -1 when there is no further set bit.
+  while ( -1 != bit ) {
+    s << " " << bit ;
+    bit = hwloc_bitmap_next( bitmap , bit );
+  }
+
+  s << " }" ;
+}
+
+// Number of entries in the 's_core' table.
+enum { MAX_CORE = 1024 };
+
+// File-local state, populated by the Sentinel constructor
+// and reset by the Sentinel destructor.
+
+// ( number of accessible root (NUMA) objects , cores per root )
+std::pair<unsigned,unsigned> s_core_topology(0,0);
+// Processing units per core.
+unsigned                     s_core_capacity(0);
+// hwloc topology handle; zero when the topology is not (or no longer) loaded.
+hwloc_topology_t             s_hwloc_topology(0);
+// Bitmap allocated once by the Sentinel constructor.
+hwloc_bitmap_t               s_hwloc_location(0);
+// CPU binding of the process, as queried from hwloc.
+hwloc_bitmap_t               s_process_binding(0);
+// Table of core cpusets; the entries are the 'allowed_cpuset' of
+// topology objects and are not allocated by this file.
+hwloc_bitmap_t               s_core[ MAX_CORE ];
+
+// Owner of the file-local hwloc state: the constructor loads the topology
+// and fills the tables above, the destructor releases them.
+// The single instance is the function-local static inside sentinel().
+struct Sentinel {
+  ~Sentinel();
+  Sentinel();
+};
+
+// Guarantee that the hwloc state has been constructed and
+// report whether the topology handle is currently non-zero.
+bool sentinel()
+{
+  // Constructed on the first call; the destructor zeroes the handle.
+  static Sentinel self ;
+
+  const bool topology_loaded = 0 != s_hwloc_topology ;
+
+  if ( ! topology_loaded ) {
+    std::cerr << "Kokkos::hwloc ERROR : Called after return from main()" << std::endl ;
+    std::cerr.flush();
+  }
+
+  return topology_loaded ;
+}
+
+// Release the hwloc resources and return the file-local state to 'empty'.
+Sentinel::~Sentinel()
+{
+  // The entries of 's_core' are cpusets that belong to topology objects;
+  // they must not be used once the topology has been destroyed.
+  for ( unsigned i = 0 ; i < MAX_CORE ; ++i ) s_core[i] = 0 ;
+
+  // Only destroy a topology that was actually created.
+  if ( 0 != s_hwloc_topology ) {
+    hwloc_topology_destroy( s_hwloc_topology );
+  }
+
+  hwloc_bitmap_free( s_process_binding );
+  hwloc_bitmap_free( s_hwloc_location );
+
+  s_core_topology.first  = 0 ;
+  s_core_topology.second = 0 ;
+  s_core_capacity   = 0 ;
+  s_hwloc_topology  = 0 ;
+  s_hwloc_location  = 0 ;
+  s_process_binding = 0 ;
+}
+
+// Load the hwloc topology and derive the symmetric
+// ( root , core , processing unit ) topology that is accessible
+// to this process.
+//
+// Results are stored in the file-local state:
+//   s_core_topology = ( accessible roots , cores per root )
+//   s_core_capacity = processing units per core
+//   s_core[ core + cores_per_root * root ] = cpuset of that core
+// where 'root' counts only the accessible roots, starting with the
+// root on which the calling thread last ran.
+Sentinel::Sentinel()
+{
+#if defined(__MIC__)
+  static const bool remove_core_0 = true ;
+#else
+  static const bool remove_core_0 = false ;
+#endif
+
+  s_core_topology   = std::pair<unsigned,unsigned>(0,0);
+  s_core_capacity   = 0 ;
+  s_hwloc_topology  = 0 ;
+  s_hwloc_location  = 0 ;
+  s_process_binding = 0 ;
+
+  for ( unsigned i = 0 ; i < MAX_CORE ; ++i ) s_core[i] = 0 ;
+
+  hwloc_topology_init( & s_hwloc_topology );
+  hwloc_topology_load( s_hwloc_topology );
+
+  s_hwloc_location  = hwloc_bitmap_alloc();
+  s_process_binding = hwloc_bitmap_alloc();
+
+  hwloc_get_cpubind( s_hwloc_topology , s_process_binding ,  HWLOC_CPUBIND_PROCESS );
+
+  if ( remove_core_0 ) {
+
+    const hwloc_obj_t core = hwloc_get_obj_by_type( s_hwloc_topology , HWLOC_OBJ_CORE , 0 );
+
+    // The lookup yields a null object if there is no such core;
+    // there is then nothing to move away from.
+    if ( 0 != core &&
+         hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
+
+      hwloc_bitmap_t s_process_no_core_zero = hwloc_bitmap_alloc();
+
+      hwloc_bitmap_andnot( s_process_no_core_zero , s_process_binding , core->allowed_cpuset );
+
+      bool ok = 0 == hwloc_set_cpubind( s_hwloc_topology ,
+                                        s_process_no_core_zero ,
+                                        HWLOC_CPUBIND_PROCESS | HWLOC_CPUBIND_STRICT );
+
+      if ( ok ) {
+        // Verify that the new binding actually took effect.
+        hwloc_get_cpubind( s_hwloc_topology , s_process_binding ,  HWLOC_CPUBIND_PROCESS );
+
+        ok = 0 != hwloc_bitmap_isequal( s_process_binding , s_process_no_core_zero );
+      }
+
+      hwloc_bitmap_free( s_process_no_core_zero );
+
+      if ( ! ok ) {
+        std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ;
+      }
+    }
+  }
+
+  // Choose a hwloc object type for the NUMA level, which may not exist.
+
+  hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX ;
+
+  {
+    // Object types to search, in order.
+    static const hwloc_obj_type_t candidate_root_type[] =
+      { HWLOC_OBJ_NODE     /* NUMA region     */
+      , HWLOC_OBJ_SOCKET   /* hardware socket */
+      , HWLOC_OBJ_MACHINE  /* local machine   */
+      };
+
+    enum { CANDIDATE_ROOT_TYPE_COUNT =
+             sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t) };
+
+    for ( int k = 0 ; k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type ; ++k ) {
+      if ( 0 < hwloc_get_nbobjs_by_type( s_hwloc_topology , candidate_root_type[k] ) ) {
+        root_type = candidate_root_type[k] ;
+      }
+    }
+  }
+
+  // Determine which of these 'root' types are available to this process.
+  // The process may have been bound (e.g., by MPI) to a subset of these root types.
+  // Determine current location of the master (calling) process.
+
+  hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc();
+
+  hwloc_get_last_cpu_location( s_hwloc_topology , proc_cpuset_location , HWLOC_CPUBIND_THREAD );
+
+  const unsigned max_root = hwloc_get_nbobjs_by_type( s_hwloc_topology , root_type );
+
+  unsigned root_base     = max_root ;
+  unsigned root_count    = 0 ;
+  unsigned core_per_root = 0 ;
+  unsigned pu_per_core   = 0 ;
+  bool     symmetric     = true ;
+
+  for ( unsigned i = 0 ; i < max_root ; ++i ) {
+
+    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , i );
+
+    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {
+
+      ++root_count ;
+
+      // Remember which root (NUMA) object the master thread is running on.
+      // This will be logical NUMA rank #0 for this process.
+
+      if ( hwloc_bitmap_intersects( proc_cpuset_location, root->allowed_cpuset ) ) {
+        root_base = i ;
+      }
+
+      // Count available cores:
+
+      const unsigned max_core =
+        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
+                                                root->allowed_cpuset ,
+                                                HWLOC_OBJ_CORE );
+
+      unsigned core_count = 0 ;
+
+      for ( unsigned j = 0 ; j < max_core ; ++j ) {
+
+        const hwloc_obj_t core =
+          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
+                                               root->allowed_cpuset ,
+                                               HWLOC_OBJ_CORE , j );
+
+        // If process' cpuset intersects core's cpuset then process can access this core.
+        // Must use intersection instead of inclusion because the Intel-Phi
+        // MPI may bind the process to only one of the core's hyperthreads.
+        //
+        // Assumption: if the process can access any hyperthread of the core
+        // then it has ownership of the entire core.
+        // This assumes that it would be performance-detrimental
+        // to spawn more than one MPI process per core and use nested threading.
+
+        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
+
+          ++core_count ;
+
+          const unsigned pu_count =
+            hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
+                                                    core->allowed_cpuset ,
+                                                    HWLOC_OBJ_PU );
+
+          if ( pu_per_core == 0 ) pu_per_core = pu_count ;
+
+          // Enforce symmetry by taking the minimum:
+
+          pu_per_core = std::min( pu_per_core , pu_count );
+
+          if ( pu_count != pu_per_core ) symmetric = false ;
+        }
+      }
+
+      if ( 0 == core_per_root ) core_per_root = core_count ;
+
+      // Enforce symmetry by taking the minimum:
+
+      core_per_root = std::min( core_per_root , core_count );
+
+      if ( core_count != core_per_root ) symmetric = false ;
+    }
+  }
+
+  // The 's_core' table holds MAX_CORE entries.  Shrink the advertised
+  // topology, if necessary, so that every advertised (root,core)
+  // coordinate maps to an entry inside of the table.
+
+  if ( 0 < core_per_root && unsigned(MAX_CORE) < root_count * core_per_root ) {
+    std::cerr << "WARNING: Kokkos::hwloc detected more cores than it can map, using a subset" << std::endl ;
+
+    if ( unsigned(MAX_CORE) < core_per_root ) core_per_root = MAX_CORE ;
+
+    root_count = MAX_CORE / core_per_root ;
+  }
+
+  s_core_topology.first  = root_count ;
+  s_core_topology.second = core_per_root ;
+  s_core_capacity        = pu_per_core ;
+
+  // Fill the 's_core' array for fast mapping from a core coordinate to the
+  // hwloc cpuset object required for thread location querying and binding.
+  //
+  // The row of the table is the count of accessible roots visited so far,
+  // not the loop counter: roots that the process cannot access must not
+  // leave unfilled rows inside of the advertised topology.
+
+  unsigned root_index = 0 ;
+
+  for ( unsigned i = 0 ; i < max_root && root_index < root_count ; ++i ) {
+
+    // Visit the roots beginning with the master thread's root.
+    const unsigned root_rank = ( i + root_base ) % max_root ;
+
+    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , root_rank );
+
+    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {
+
+      const unsigned max_core =
+        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
+                                                root->allowed_cpuset ,
+                                                HWLOC_OBJ_CORE );
+
+      unsigned core_count = 0 ;
+
+      for ( unsigned j = 0 ; j < max_core && core_count < core_per_root ; ++j ) {
+
+        const hwloc_obj_t core =
+          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
+                                               root->allowed_cpuset ,
+                                               HWLOC_OBJ_CORE , j );
+
+        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
+
+          s_core[ core_count + core_per_root * root_index ] = core->allowed_cpuset ;
+
+          ++core_count ;
+        }
+      }
+
+      ++root_index ;
+    }
+  }
+
+  hwloc_bitmap_free( proc_cpuset_location );
+
+  if ( ! symmetric ) {
+    std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
+              << std::endl ;
+  }
+}
+
+
+} // namespace
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** \brief  Query whether hwloc support was compiled in.
+ *
+ *  This definition always answers 'true'.
+ */
+bool available()
+{
+  return true ;
+}
+
+/** \brief  Number of root objects (typically NUMA regions) recorded in
+ *          's_core_topology.first'.
+ *
+ *  NOTE(review): sentinel() is called first, presumably to make sure the
+ *  topology has been detected before the cached value is read -- confirm.
+ */
+unsigned get_available_numa_count()
+{
+  sentinel();
+
+  return s_core_topology.first ;
+}
+
+/** \brief  Number of cores per root object, as recorded in
+ *          's_core_topology.second'.
+ *
+ *  NOTE(review): sentinel() is called first, presumably to make sure the
+ *  topology has been detected before the cached value is read -- confirm.
+ */
+unsigned get_available_cores_per_numa()
+{
+  sentinel();
+
+  return s_core_topology.second ;
+}
+
+/** \brief  Number of hardware threads per core, as recorded in
+ *          's_core_capacity'.
+ *
+ *  NOTE(review): sentinel() is called first, presumably to make sure the
+ *  topology has been detected before the cached value is read -- confirm.
+ */
+unsigned get_available_threads_per_core()
+{
+  sentinel();
+
+  return s_core_capacity ;
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** \brief  Bind the calling thread to one entry of a list of requested
+ *          core coordinates and mark that entry as claimed.
+ *
+ *  Selection order:
+ *    1. an entry equal to the thread's current coordinate,
+ *    2. otherwise the first entry whose '.first' equals the current '.first',
+ *    3. otherwise the first entry whose '.first' is not ~0u (unclaimed).
+ *
+ *  \param coordinate_count  Number of entries in 'coordinate'.
+ *  \param coordinate        Requested coordinates; on success the chosen
+ *                           entry is overwritten with (~0u,~0u).
+ *  \return Index of the entry that was bound, or ~0u if no entry could be
+ *          selected, the bind failed, or an exception was thrown.
+ */
+unsigned bind_this_thread(
+  const unsigned               coordinate_count ,
+  std::pair<unsigned,unsigned> coordinate[] )
+{
+  const unsigned failed = ~0u ;
+
+  unsigned chosen = 0 ;
+
+  try {
+    const std::pair<unsigned,unsigned> here = get_this_thread_coordinate();
+
+    // Stage 1: an exact match of the current coordinate.
+    chosen = 0 ;
+    while ( chosen < coordinate_count && here != coordinate[chosen] ) {
+      ++chosen ;
+    }
+
+    // Stage 2: same first rank (typically NUMA) as the current coordinate.
+    if ( chosen == coordinate_count ) {
+      chosen = 0 ;
+      while ( chosen < coordinate_count &&
+              here.first != coordinate[chosen].first ) {
+        ++chosen ;
+      }
+    }
+
+    // Stage 3: any request that has not been claimed yet.
+    if ( chosen == coordinate_count ) {
+      chosen = 0 ;
+      while ( chosen < coordinate_count &&
+              coordinate[chosen].first == ~0u ) {
+        ++chosen ;
+      }
+    }
+
+    // The bind is attempted only when a candidate was found.
+    const bool bound =
+      chosen != coordinate_count && bind_this_thread( coordinate[chosen] );
+
+    if ( bound ) {
+
+#if DEBUG_PRINT
+      if ( here != coordinate[chosen] ) {
+        std::cout << "  bind_this_thread: rebinding from ("
+                  << here.first << ","
+                  << here.second
+                  << ") to ("
+                  << coordinate[chosen].first << ","
+                  << coordinate[chosen].second
+                  << ")" << std::endl ;
+      }
+#endif
+
+      // Mark the request as claimed so later callers skip it.
+      coordinate[chosen].first  = ~0u ;
+      coordinate[chosen].second = ~0u ;
+    }
+    else {
+      // Failed to bind:
+      chosen = failed ;
+    }
+  }
+  catch( ... ) {
+    chosen = failed ;
+  }
+
+  return chosen ;
+}
+
+
+/** \brief  Bind the calling thread to the core at the given coordinate.
+ *
+ *  \param coord  Pair used to index the cached 's_core' array as
+ *                coord.second + coord.first * s_core_topology.second.
+ *  \return true only if sentinel() succeeds, 'coord' lies inside
+ *          's_core_topology', and hwloc_set_cpubind returns 0.
+ */
+bool bind_this_thread( const std::pair<unsigned,unsigned> coord )
+{
+  if ( ! sentinel() ) return false ;
+
+  // Validate the coordinate before it is used as an index anywhere,
+  // including the optional debug output below, so an out-of-range
+  // request can never read past the end of 's_core'.
+  if ( ! ( coord.first  < s_core_topology.first &&
+           coord.second < s_core_topology.second ) ) {
+    return false ;
+  }
+
+  // Fast-lookup by caching the coordinate -> hwloc cpuset mapping in 's_core'.
+  const unsigned core_index = coord.second + coord.first * s_core_topology.second ;
+
+#if DEBUG_PRINT
+
+  std::cout << "Kokkos::bind_this_thread() at " ;
+
+  hwloc_get_last_cpu_location( s_hwloc_topology ,
+                               s_hwloc_location , HWLOC_CPUBIND_THREAD );
+
+  print_bitmap( std::cout , s_hwloc_location );
+
+  std::cout << " to " ;
+
+  print_bitmap( std::cout , s_core[ core_index ] );
+
+  std::cout << std::endl ;
+
+#endif
+
+  // As safe and fast as possible.
+  return 0 == hwloc_set_cpubind( s_hwloc_topology ,
+                                 s_core[ core_index ] ,
+                                 HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT );
+}
+
+/** \brief  Re-bind the calling thread to 's_process_binding'.
+ *
+ *  \return true if sentinel() succeeds, 's_hwloc_topology' is non-null and
+ *          hwloc_set_cpubind returns 0; false otherwise.
+ */
+bool unbind_this_thread()
+{
+  if ( ! sentinel() ) {
+    return false ;
+  }
+
+// Local switch for the diagnostic output in this function only.
+#define HWLOC_DEBUG_PRINT 0
+
+#if HWLOC_DEBUG_PRINT
+
+  std::cout << "Kokkos::unbind_this_thread() from " ;
+
+  hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD );
+
+  print_bitmap( std::cout , s_hwloc_location );
+
+#endif
+
+  bool restored = false ;
+
+  // Only attempt the bind when a topology handle exists.
+  if ( s_hwloc_topology ) {
+    const int status =
+      hwloc_set_cpubind( s_hwloc_topology ,
+                         s_process_binding ,
+                         HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT );
+
+    restored = ( 0 == status );
+  }
+
+#if HWLOC_DEBUG_PRINT
+
+  std::cout << " to " ;
+
+  hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD );
+
+  print_bitmap( std::cout , s_hwloc_location );
+
+  std::cout << std::endl ;
+
+#endif
+
+  return restored ;
+
+#undef HWLOC_DEBUG_PRINT
+
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Determine the core coordinate on which the calling thread
+ *          last ran.
+ *
+ *  The location reported by hwloc is compared against each cached cpuset
+ *  in 's_core'; the first intersecting entry gives the coordinate.
+ *
+ *  \return (index / s_core_topology.second , index % s_core_topology.second)
+ *          for the first matching entry.  Returns (0,0) if sentinel()
+ *          fails, if hwloc cannot report the location, or if no cached
+ *          core intersects the reported location.
+ */
+std::pair<unsigned,unsigned> get_this_thread_coordinate()
+{
+  std::pair<unsigned,unsigned> coord(0u,0u);
+
+  if ( ! sentinel() ) return coord ;
+
+  const unsigned n = s_core_topology.first * s_core_topology.second ;
+
+  // Using the pre-allocated 's_hwloc_location' to avoid memory
+  // allocation by this thread.  This call is NOT thread-safe.
+  //
+  // If the query fails the bitmap does not describe this thread's
+  // location, so do not search with it; fall back to the default.
+  if ( 0 != hwloc_get_last_cpu_location( s_hwloc_topology ,
+                                         s_hwloc_location ,
+                                         HWLOC_CPUBIND_THREAD ) ) {
+    return coord ;
+  }
+
+  unsigned i = 0 ;
+
+  while ( i < n && ! hwloc_bitmap_intersects( s_hwloc_location , s_core[ i ] ) ) ++i ;
+
+  if ( i < n ) {
+    // 's_core' is laid out as core + cores_per_root * root.
+    coord.first  = i / s_core_topology.second ;
+    coord.second = i % s_core_topology.second ;
+  }
+
+  return coord ;
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#else /* ! defined( KOKKOS_HAVE_HWLOC ) */
+
+namespace Kokkos {
+namespace hwloc {
+
+// Fallback implementations used when KOKKOS_HAVE_HWLOC is not defined:
+// a single NUMA region with one core and one thread is reported, and
+// thread binding requests are rejected.
+
+/** \brief  hwloc support is not compiled in. */
+bool available()
+{
+  return false ;
+}
+
+/** \brief  Always one NUMA region. */
+unsigned get_available_numa_count()
+{
+  return 1 ;
+}
+
+/** \brief  Always one core per NUMA region. */
+unsigned get_available_cores_per_numa()
+{
+  return 1 ;
+}
+
+/** \brief  Always one thread per core. */
+unsigned get_available_threads_per_core()
+{
+  return 1 ;
+}
+
+/** \brief  Binding from a request list always fails: returns ~0u. */
+unsigned bind_this_thread( const unsigned /* coordinate_count */ ,
+                           std::pair<unsigned,unsigned>[] /* coordinate */ )
+{
+  return ~0u ;
+}
+
+/** \brief  Binding to a single coordinate always fails. */
+bool bind_this_thread( const std::pair<unsigned,unsigned> /* coord */ )
+{
+  return false ;
+}
+
+/** \brief  Nothing was bound, so unbinding trivially succeeds. */
+bool unbind_this_thread()
+{
+  return true ;
+}
+
+/** \brief  The only coordinate reported is (0,0). */
+std::pair<unsigned,unsigned> get_this_thread_coordinate()
+{
+  const std::pair<unsigned,unsigned> origin( 0 , 0 );
+
+  return origin ;
+}
+
+} // namespace hwloc
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
+
+
--- a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
@ -0,0 +1,80 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_spinwait.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+/* Select the 'YIELD' statement executed on every pass of the spin loop:
+ *   - inline assembly when KOKKOS_ENABLE_ASM is set,
+ *   - Sleep(0) for the Windows threads back-end,
+ *   - sched_yield() otherwise.
+ */
+#if ( KOKKOS_ENABLE_ASM )
+  #if defined( __arm__ ) || defined( __aarch64__ )
+    /* No-operation instruction to idle the thread.
+     * The 'pause' mnemonic used below is x86-only and does not
+     * assemble on 32-bit or 64-bit ARM targets.
+     */
+    #define YIELD   asm volatile("nop")
+  #else
+    /* Pause instruction to prevent excess processor bus usage */
+    #define YIELD   asm volatile("pause\n":::"memory")
+  #endif
+#elif defined( KOKKOS_HAVE_WINTHREAD )
+  #include <process.h>
+  #define YIELD  Sleep(0)
+#else
+  #include <sched.h>
+  #define YIELD  sched_yield()
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+/** \brief  Spin until 'flag' no longer equals 'value'.
+ *
+ *  \param flag   Location that is polled; it is volatile, so it is
+ *                re-read on every pass of the loop.
+ *  \param value  The value to wait on: the call returns as soon as
+ *                'flag' is observed to differ from it.
+ *
+ *  YIELD is executed once per pass in which 'flag' still equals 'value'.
+ */
+void spinwait( volatile int & flag , const int value )
+{
+  for ( ; ; ) {
+    if ( flag != value ) { return ; }
+
+    YIELD ;
+  }
+}
+
+#endif
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
--- a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
@ -0,0 +1,64 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#ifndef KOKKOS_SPINWAIT_HPP
+#define KOKKOS_SPINWAIT_HPP
+
+#include <Kokkos_Macros.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+/** \brief  No-op when the active execution memory space is not the host:
+ *          both arguments are ignored and the call returns immediately.
+ */
+KOKKOS_INLINE_FUNCTION
+void spinwait( volatile int & , const int ) {}
+
+#else
+
+/** \brief  Host version: wait while 'flag' equals 'value'.
+ *          Only declared here; the definition is out of line.
+ */
+void spinwait( volatile int & flag , const int value );
+
+#endif
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_SPINWAIT_HPP */
+