git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@13583 f3b2605a-c512-4ea7-a41b-209d697bcdaa

Author: sjplimp
Date: 2015-07-14 19:57:25 +00:00
Parent: 47e13d72a8
Commit: 61ec2f78ac

195 changed files with 68433 additions and 0 deletions


@@ -0,0 +1,66 @@
KOKKOS_PATH = ../..
GTEST_PATH = ../../TPL/gtest

vpath %.cpp ${KOKKOS_PATH}/core/perf_test

default: build_all
	echo "End Build"

include $(KOKKOS_PATH)/Makefile.kokkos

ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = nvcc_wrapper
CXXFLAGS ?= -O3
LINK = $(CXX)
LDFLAGS ?= -lpthread
else
CXX ?= g++
CXXFLAGS ?= -O3
LINK ?= $(CXX)
LDFLAGS ?= -lpthread
endif

KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/perf_test

TEST_TARGETS =
TARGETS =

OBJ_PERF = PerfTestHost.o PerfTestCuda.o PerfTestMain.o gtest-all.o
TARGETS += KokkosCore_PerformanceTest
TEST_TARGETS += test-performance

OBJ_ATOMICS = test_atomic.o
TARGETS += KokkosCore_PerformanceTest_Atomics
TEST_TARGETS += test-atomic

KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest

KokkosCore_PerformanceTest_Atomics: $(OBJ_ATOMICS) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ATOMICS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_Atomics

test-performance: KokkosCore_PerformanceTest
	./KokkosCore_PerformanceTest

test-atomic: KokkosCore_PerformanceTest_Atomics
	./KokkosCore_PerformanceTest_Atomics

build_all: $(TARGETS)

test: $(TEST_TARGETS)

clean: kokkos-clean
	rm -f *.o $(TARGETS)

# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
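
# Usage sketch (illustrative; assumes the Kokkos Makefile system has been
# configured, e.g. KOKKOS_DEVICES set when invoking make):
#   make build_all    # builds KokkosCore_PerformanceTest and the _Atomics binary
#   make test         # runs the test-performance and test-atomic targets
#   make clean        # removes objects and executables (plus kokkos-clean)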


@@ -0,0 +1,309 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_BLAS_KERNELS_HPP
#define KOKKOS_BLAS_KERNELS_HPP
namespace Kokkos {
template< class ConstVectorType ,
class Device = typename ConstVectorType::execution_space >
struct Dot ;
template< class ConstVectorType ,
class Device = typename ConstVectorType::execution_space >
struct DotSingle ;
template< class ConstScalarType ,
class VectorType ,
class Device = typename VectorType::execution_space >
struct Scale ;
template< class ConstScalarType ,
class ConstVectorType ,
class VectorType ,
class Device = typename VectorType::execution_space >
struct AXPBY ;
/** \brief Y = alpha * X + beta * Y */
template< class ConstScalarType ,
class ConstVectorType ,
class VectorType >
void axpby( const ConstScalarType & alpha ,
const ConstVectorType & X ,
const ConstScalarType & beta ,
const VectorType & Y )
{
typedef AXPBY< ConstScalarType , ConstVectorType , VectorType > functor ;
parallel_for( Y.dimension_0() , functor( alpha , X , beta , Y ) );
}
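// Illustrative usage (a sketch, not part of this header; the length n is
// hypothetical). The scalar arguments are rank-0 Views, matching the
// alpha() / beta() accesses in the AXPBY functor below:
//
//   Kokkos::View<double*> X("X",n), Y("Y",n);
//   Kokkos::View<double>  a("alpha"), b("beta");
//   Kokkos::axpby( a, X, b, Y );   // Y = a() * X + b() * Y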
/** \brief Y *= alpha */
template< class ConstScalarType ,
class VectorType >
void scale( const ConstScalarType & alpha , const VectorType & Y )
{
typedef Scale< ConstScalarType , VectorType > functor ;
parallel_for( Y.dimension_0() , functor( alpha , Y ) );
}
template< class ConstVectorType ,
class Finalize >
void dot( const ConstVectorType & X ,
const ConstVectorType & Y ,
const Finalize & finalize )
{
typedef Dot< ConstVectorType > functor ;
parallel_reduce( X.dimension_0() , functor( X , Y ) , finalize );
}
template< class ConstVectorType ,
class Finalize >
void dot( const ConstVectorType & X ,
const Finalize & finalize )
{
typedef DotSingle< ConstVectorType > functor ;
parallel_reduce( X.dimension_0() , functor( X ) , finalize );
}
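// Note: both dot() overloads deliver the reduction result through the
// caller-supplied `finalize` argument of parallel_reduce rather than
// returning it by value.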
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< class Type , class Device >
struct Dot
{
typedef typename Device::execution_space execution_space ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< Type::Rank > >::type ok_rank ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename Type::execution_space >::type ok_device ;*/
typedef double value_type ;
#if 1
typename Type::const_type X ;
typename Type::const_type Y ;
#else
Type X ;
Type Y ;
#endif
Dot( const Type & arg_x , const Type & arg_y )
: X(arg_x) , Y(arg_y) { }
KOKKOS_INLINE_FUNCTION
void operator()( int i , value_type & update ) const
{ update += X[i] * Y[i]; }
KOKKOS_INLINE_FUNCTION
static void join( volatile value_type & update ,
const volatile value_type & source )
{ update += source; }
KOKKOS_INLINE_FUNCTION
static void init( value_type & update )
{ update = 0 ; }
};
template< class Type , class Device >
struct DotSingle
{
typedef typename Device::execution_space execution_space ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< Type::Rank > >::type ok_rank ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename Type::execution_space >::type ok_device ;*/
typedef double value_type ;
#if 1
typename Type::const_type X ;
#else
Type X ;
#endif
DotSingle( const Type & arg_x ) : X(arg_x) {}
KOKKOS_INLINE_FUNCTION
void operator()( int i , value_type & update ) const
{
const typename Type::value_type & x = X[i]; update += x * x ;
}
KOKKOS_INLINE_FUNCTION
static void join( volatile value_type & update ,
const volatile value_type & source )
{ update += source; }
KOKKOS_INLINE_FUNCTION
static void init( value_type & update )
{ update = 0 ; }
};
template< class ScalarType , class VectorType , class Device>
struct Scale
{
typedef typename Device::execution_space execution_space ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename ScalarType::execution_space >::type
ok_scalar_device ;
typedef typename
Impl::StaticAssertSame< execution_space ,
typename VectorType::execution_space >::type
ok_vector_device ;*/
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
Impl::unsigned_< ScalarType::Rank > >::type
ok_scalar_rank ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< VectorType::Rank > >::type
ok_vector_rank ;
#if 1
typename ScalarType::const_type alpha ;
#else
ScalarType alpha ;
#endif
VectorType Y ;
Scale( const ScalarType & arg_alpha , const VectorType & arg_Y )
: alpha( arg_alpha ), Y( arg_Y ) {}
KOKKOS_INLINE_FUNCTION
void operator()( int i ) const
{
Y[i] *= alpha() ;
}
};
template< class ScalarType ,
class ConstVectorType ,
class VectorType,
class Device>
struct AXPBY
{
typedef typename Device::execution_space execution_space ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename ScalarType::execution_space >::type
ok_scalar_device ;
typedef typename
Impl::StaticAssertSame< execution_space ,
typename ConstVectorType::execution_space >::type
ok_const_vector_device ;
typedef typename
Impl::StaticAssertSame< execution_space ,
typename VectorType::execution_space >::type
ok_vector_device ;*/
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
Impl::unsigned_< ScalarType::Rank > >::type
ok_scalar_rank ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< ConstVectorType::Rank > >::type
ok_const_vector_rank ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< VectorType::Rank > >::type
ok_vector_rank ;
#if 1
typename ScalarType::const_type alpha , beta ;
typename ConstVectorType::const_type X ;
#else
ScalarType alpha , beta ;
ConstVectorType X ;
#endif
VectorType Y ;
AXPBY( const ScalarType & arg_alpha ,
const ConstVectorType & arg_X ,
const ScalarType & arg_beta ,
const VectorType & arg_Y )
: alpha( arg_alpha ), beta( arg_beta ), X( arg_X ), Y( arg_Y ) {}
KOKKOS_INLINE_FUNCTION
void operator()( int i ) const
{
Y[i] = alpha() * X[i] + beta() * Y[i] ;
}
};
} /* namespace Kokkos */
#endif /* #ifndef KOKKOS_BLAS_KERNELS_HPP */


@@ -0,0 +1,189 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <iostream>
#include <iomanip>
#include <algorithm>
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_CUDA )
#include <impl/Kokkos_Timer.hpp>
#include <PerfTestHexGrad.hpp>
#include <PerfTestBlasKernels.hpp>
#include <PerfTestGramSchmidt.hpp>
#include <PerfTestDriver.hpp>
namespace Test {
class cuda : public ::testing::Test {
protected:
static void SetUpTestCase() {
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
}
static void TearDownTestCase() {
Kokkos::Cuda::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
};
TEST_F( cuda, hexgrad )
{
EXPECT_NO_THROW( run_test_hexgrad< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) );
}
TEST_F( cuda, gramschmidt )
{
EXPECT_NO_THROW( run_test_gramschmidt< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) );
}
namespace {
template <typename T>
struct TextureFetch
{
typedef Kokkos::View< T *, Kokkos::CudaSpace> array_type;
typedef Kokkos::View< const T *, Kokkos::CudaSpace, Kokkos::MemoryRandomAccess> const_array_type;
typedef Kokkos::View< int *, Kokkos::CudaSpace> index_array_type;
typedef Kokkos::View< const int *, Kokkos::CudaSpace> const_index_array_type;
struct FillArray
{
array_type m_array;
FillArray( const array_type & array )
: m_array(array)
{}
void apply() const
{
Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::Cuda,int>(0,m_array.size()), *this);
}
KOKKOS_INLINE_FUNCTION
void operator()(int i) const { m_array(i) = i; }
};
struct RandomIndexes
{
index_array_type m_indexes;
typename index_array_type::HostMirror m_host_indexes;
RandomIndexes( const index_array_type & indexes)
: m_indexes(indexes)
, m_host_indexes(Kokkos::create_mirror(m_indexes))
{}
void apply() const
{
Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::HostSpace::execution_space,int>(0,m_host_indexes.size()), *this);
//random shuffle
Kokkos::HostSpace::execution_space::fence();
std::random_shuffle(m_host_indexes.ptr_on_device(), m_host_indexes.ptr_on_device() + m_host_indexes.size());
Kokkos::deep_copy(m_indexes,m_host_indexes);
}
KOKKOS_INLINE_FUNCTION
void operator()(int i) const { m_host_indexes(i) = i; }
};
struct RandomReduce
{
const_array_type m_array;
const_index_array_type m_indexes;
RandomReduce( const const_array_type & array, const const_index_array_type & indexes)
: m_array(array)
, m_indexes(indexes)
{}
void apply(T & reduce) const
{
Kokkos::parallel_reduce( Kokkos::RangePolicy<Kokkos::Cuda,int>(0,m_array.size()), *this, reduce);
}
KOKKOS_INLINE_FUNCTION
void operator()(int i, T & reduce) const
{ reduce += m_array(m_indexes(i)); }
};
static void run(int size, double & reduce_time, T &reduce)
{
array_type array("array",size);
index_array_type indexes("indexes",size);
{ FillArray f(array); f.apply(); }
{ RandomIndexes f(indexes); f.apply(); }
Kokkos::Cuda::fence();
Kokkos::Impl::Timer timer;
for (int j=0; j<10; ++j) {
RandomReduce f(array,indexes);
f.apply(reduce);
}
Kokkos::Cuda::fence();
reduce_time = timer.seconds();
}
};
} // unnamed namespace
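// The texture-fetch benchmark below sweeps sizes 2^1 .. 2^27; at the top
// end, 2^27 doubles is 8 * 2^27 bytes = 1 GiB of device data.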
TEST_F( cuda, texture_double )
{
printf("Random reduce of double through texture fetch\n");
for (int i=1; i<=27; ++i) {
int size = 1<<i;
double time = 0;
double reduce = 0;
TextureFetch<double>::run(size,time,reduce);
printf(" time = %1.3e size = 2^%d\n", time, i);
}
}
} // namespace Test
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */


@@ -0,0 +1,152 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <iostream>
#include <string>
// mfh 06 Jun 2013: This macro doesn't work like one might think it
// should. It doesn't take the template parameter DeviceType and
// print its actual type name; it just literally prints out
// "DeviceType". I've worked around this below without using the
// macro, so I'm commenting out the macro to avoid compiler complaints
// about an unused macro.
// #define KOKKOS_MACRO_IMPL_TO_STRING( X ) #X
// #define KOKKOS_MACRO_TO_STRING( X ) KOKKOS_MACRO_IMPL_TO_STRING( X )
//------------------------------------------------------------------------
namespace Test {
enum { NUMBER_OF_TRIALS = 5 };
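// Each driver below runs NUMBER_OF_TRIALS timed trials per power-of-two
// work length and prints one comma-separated record:
//   "<label>" , work_length , min_seconds , min_seconds / work_length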
template< class DeviceType >
void run_test_hexgrad( int exp_beg , int exp_end, const char deviceTypeName[] )
{
std::string label_hexgrad ;
label_hexgrad.append( "\"HexGrad< double , " );
// mfh 06 Jun 2013: This only appends "DeviceType" (literally) to
// the string, not the actual name of the device type. Thus, I've
// modified the function to take the name of the device type.
//
//label_hexgrad.append( KOKKOS_MACRO_TO_STRING( DeviceType ) );
label_hexgrad.append( deviceTypeName );
label_hexgrad.append( " >\"" );
for (int i = exp_beg ; i < exp_end ; ++i) {
double min_seconds = 0.0 ;
double max_seconds = 0.0 ;
double avg_seconds = 0.0 ;
const int parallel_work_length = 1<<i;
for ( int j = 0 ; j < NUMBER_OF_TRIALS ; ++j ) {
const double seconds = HexGrad< DeviceType >::test(parallel_work_length) ;
if ( 0 == j ) {
min_seconds = seconds ;
max_seconds = seconds ;
}
else {
if ( seconds < min_seconds ) min_seconds = seconds ;
if ( seconds > max_seconds ) max_seconds = seconds ;
}
avg_seconds += seconds ;
}
avg_seconds /= NUMBER_OF_TRIALS ;
std::cout << label_hexgrad
<< " , " << parallel_work_length
<< " , " << min_seconds
<< " , " << ( min_seconds / parallel_work_length )
<< std::endl ;
}
}
template< class DeviceType >
void run_test_gramschmidt( int exp_beg , int exp_end, const char deviceTypeName[] )
{
std::string label_gramschmidt ;
label_gramschmidt.append( "\"GramSchmidt< double , " );
// mfh 06 Jun 2013: This only appends "DeviceType" (literally) to
// the string, not the actual name of the device type. Thus, I've
// modified the function to take the name of the device type.
//
//label_gramschmidt.append( KOKKOS_MACRO_TO_STRING( DeviceType ) );
label_gramschmidt.append( deviceTypeName );
label_gramschmidt.append( " >\"" );
for (int i = exp_beg ; i < exp_end ; ++i) {
double min_seconds = 0.0 ;
double max_seconds = 0.0 ;
double avg_seconds = 0.0 ;
const int parallel_work_length = 1<<i;
for ( int j = 0 ; j < NUMBER_OF_TRIALS ; ++j ) {
const double seconds = ModifiedGramSchmidt< double , DeviceType >::test(parallel_work_length, 32 ) ;
if ( 0 == j ) {
min_seconds = seconds ;
max_seconds = seconds ;
}
else {
if ( seconds < min_seconds ) min_seconds = seconds ;
if ( seconds > max_seconds ) max_seconds = seconds ;
}
avg_seconds += seconds ;
}
avg_seconds /= NUMBER_OF_TRIALS ;
std::cout << label_gramschmidt
<< " , " << parallel_work_length
<< " , " << min_seconds
<< " , " << ( min_seconds / parallel_work_length )
<< std::endl ;
}
}
}


@@ -0,0 +1,231 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <cmath>
#include <PerfTestBlasKernels.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Test {
// Reduction : result = dot( Q(:,j) , Q(:,j) );
// PostProcess : R(j,j) = result ; inv = 1 / result ;
template< class VectorView , class ValueView >
struct InvNorm2 : public Kokkos::DotSingle< VectorView > {
typedef typename Kokkos::DotSingle< VectorView >::value_type value_type ;
ValueView Rjj ;
ValueView inv ;
InvNorm2( const VectorView & argX ,
const ValueView & argR ,
const ValueView & argInv )
: Kokkos::DotSingle< VectorView >( argX )
, Rjj( argR )
, inv( argInv )
{}
KOKKOS_INLINE_FUNCTION
void final( value_type & result ) const
{
result = sqrt( result );
Rjj() = result ;
inv() = ( 0 < result ) ? 1.0 / result : 0 ;
}
};
template< class VectorView , class ValueView >
inline
void invnorm2( const VectorView & x ,
const ValueView & r ,
const ValueView & r_inv )
{
Kokkos::parallel_reduce( x.dimension_0() , InvNorm2< VectorView , ValueView >( x , r , r_inv ) );
}
// PostProcess : tmp = - ( R(j,k) = result );
template< class VectorView , class ValueView >
struct DotM : public Kokkos::Dot< VectorView > {
typedef typename Kokkos::Dot< VectorView >::value_type value_type ;
ValueView Rjk ;
ValueView tmp ;
DotM( const VectorView & argX ,
const VectorView & argY ,
const ValueView & argR ,
const ValueView & argTmp )
: Kokkos::Dot< VectorView >( argX , argY )
, Rjk( argR )
, tmp( argTmp )
{}
KOKKOS_INLINE_FUNCTION
void final( value_type & result ) const
{
Rjk() = result ;
tmp() = - result ;
}
};
template< class VectorView , class ValueView >
inline
void dot_neg( const VectorView & x ,
const VectorView & y ,
const ValueView & r ,
const ValueView & r_neg )
{
Kokkos::parallel_reduce( x.dimension_0() , DotM< VectorView , ValueView >( x , y , r , r_neg ) );
}
template< typename Scalar , class DeviceType >
struct ModifiedGramSchmidt
{
typedef DeviceType execution_space ;
typedef typename execution_space::size_type size_type ;
typedef Kokkos::View< Scalar** ,
Kokkos::LayoutLeft ,
execution_space > multivector_type ;
typedef Kokkos::View< Scalar* ,
Kokkos::LayoutLeft ,
execution_space > vector_type ;
typedef Kokkos::View< Scalar ,
Kokkos::LayoutLeft ,
execution_space > value_view ;
multivector_type Q ;
multivector_type R ;
static double factorization( const multivector_type Q_ ,
const multivector_type R_ )
{
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
using Kokkos::Experimental::ALL ;
#else
const Kokkos::ALL ALL ;
#endif
const size_type count = Q_.dimension_1();
value_view tmp("tmp");
value_view one("one");
Kokkos::deep_copy( one , (Scalar) 1 );
Kokkos::Impl::Timer timer ;
for ( size_type j = 0 ; j < count ; ++j ) {
// Reduction : tmp = dot( Q(:,j) , Q(:,j) );
// PostProcess : tmp = sqrt( tmp ); R(j,j) = tmp ; tmp = 1 / tmp ;
const vector_type Qj = Kokkos::subview( Q_ , ALL , j );
const value_view Rjj = Kokkos::subview( R_ , j , j );
invnorm2( Qj , Rjj , tmp );
// Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ;
Kokkos::scale( tmp , Qj );
for ( size_t k = j + 1 ; k < count ; ++k ) {
const vector_type Qk = Kokkos::subview( Q_ , ALL , k );
const value_view Rjk = Kokkos::subview( R_ , j , k );
// Reduction : R(j,k) = dot( Q(:,j) , Q(:,k) );
// PostProcess : tmp = - R(j,k);
dot_neg( Qj , Qk , Rjk , tmp );
// Q(:,k) -= R(j,k) * Q(:,j); => Q(:,k) += tmp * Q(:,j)
Kokkos::axpby( tmp , Qj , one , Qk );
}
}
execution_space::fence();
return timer.seconds();
}
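// Cost note: the j/k loops above launch one invnorm2 reduction per column
// plus count*(count-1)/2 dot_neg reductions and matching axpby updates,
// i.e. O(count^2) kernels over length-sized vectors.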
//--------------------------------------------------------------------------
static double test( const size_t length ,
const size_t count ,
const size_t iter = 1 )
{
multivector_type Q_( "Q" , length , count );
multivector_type R_( "R" , count , count );
typename multivector_type::HostMirror A =
Kokkos::create_mirror( Q_ );
// Create and fill A on the host
for ( size_type j = 0 ; j < count ; ++j ) {
for ( size_type i = 0 ; i < length ; ++i ) {
A(i,j) = ( i + 1 ) * ( j + 1 );
}
}
double dt_min = 0 ;
for ( size_t i = 0 ; i < iter ; ++i ) {
Kokkos::deep_copy( Q_ , A );
// A = Q * R
const double dt = factorization( Q_ , R_ );
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
}
return dt_min ;
}
};
}


@@ -0,0 +1,268 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
namespace Test {
template< class DeviceType ,
typename CoordScalarType = double ,
typename GradScalarType = float >
struct HexGrad
{
typedef DeviceType execution_space ;
typedef typename execution_space::size_type size_type ;
typedef HexGrad<DeviceType,CoordScalarType,GradScalarType> self_type;
// 3D array : ( ParallelWork , Space , Node )
enum { NSpace = 3 , NNode = 8 };
typedef Kokkos::View< CoordScalarType*[NSpace][NNode] , execution_space >
elem_coord_type ;
typedef Kokkos::View< GradScalarType*[NSpace][NNode] , execution_space >
elem_grad_type ;
elem_coord_type coords ;
elem_grad_type grad_op ;
enum { FLOPS = 318 }; // = 3 * ( 18 + 8 * 11 )
enum { READS = 18 };
enum { WRITES = 18 };
HexGrad( const elem_coord_type & arg_coords ,
const elem_grad_type & arg_grad_op )
: coords( arg_coords )
, grad_op( arg_grad_op )
{}
KOKKOS_INLINE_FUNCTION static
void grad( const CoordScalarType x[] ,
const CoordScalarType z[] ,
GradScalarType grad_y[] )
{
const GradScalarType R42=(x[3] - x[1]);
const GradScalarType R52=(x[4] - x[1]);
const GradScalarType R54=(x[4] - x[3]);
const GradScalarType R63=(x[5] - x[2]);
const GradScalarType R83=(x[7] - x[2]);
const GradScalarType R86=(x[7] - x[5]);
const GradScalarType R31=(x[2] - x[0]);
const GradScalarType R61=(x[5] - x[0]);
const GradScalarType R74=(x[6] - x[3]);
const GradScalarType R72=(x[6] - x[1]);
const GradScalarType R75=(x[6] - x[4]);
const GradScalarType R81=(x[7] - x[0]);
const GradScalarType t1=(R63 + R54);
const GradScalarType t2=(R61 + R74);
const GradScalarType t3=(R72 + R81);
const GradScalarType t4 =(R86 + R42);
const GradScalarType t5 =(R83 + R52);
const GradScalarType t6 =(R75 + R31);
// Calculate Y gradient from X and Z data
grad_y[0] = (z[1] * t1) - (z[2] * R42) - (z[3] * t5) + (z[4] * t4) + (z[5] * R52) - (z[7] * R54);
grad_y[1] = (z[2] * t2) + (z[3] * R31) - (z[0] * t1) - (z[5] * t6) + (z[6] * R63) - (z[4] * R61);
grad_y[2] = (z[3] * t3) + (z[0] * R42) - (z[1] * t2) - (z[6] * t4) + (z[7] * R74) - (z[5] * R72);
grad_y[3] = (z[0] * t5) - (z[1] * R31) - (z[2] * t3) + (z[7] * t6) + (z[4] * R81) - (z[6] * R83);
grad_y[4] = (z[5] * t3) + (z[6] * R86) - (z[7] * t2) - (z[0] * t4) - (z[3] * R81) + (z[1] * R61);
grad_y[5] = (z[6] * t5) - (z[4] * t3) - (z[7] * R75) + (z[1] * t6) - (z[0] * R52) + (z[2] * R72);
grad_y[6] = (z[7] * t1) - (z[5] * t5) - (z[4] * R86) + (z[2] * t4) - (z[1] * R63) + (z[3] * R83);
grad_y[7] = (z[4] * t2) - (z[6] * t1) + (z[5] * R75) - (z[3] * t6) - (z[2] * R74) + (z[0] * R54);
}
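// operator() below reuses this single Y-gradient kernel for all three
// components by cyclically permuting the coordinate arguments:
// grad(z,y) yields the x component, grad(x,z) the y component, and
// grad(y,x) the z component.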
KOKKOS_INLINE_FUNCTION
void operator()( size_type ielem ) const
{
GradScalarType g[NNode] ;
const CoordScalarType x[NNode] = {
coords(ielem,0,0),
coords(ielem,0,1),
coords(ielem,0,2),
coords(ielem,0,3),
coords(ielem,0,4),
coords(ielem,0,5),
coords(ielem,0,6),
coords(ielem,0,7)
};
const CoordScalarType y[NNode] = {
coords(ielem,1,0),
coords(ielem,1,1),
coords(ielem,1,2),
coords(ielem,1,3),
coords(ielem,1,4),
coords(ielem,1,5),
coords(ielem,1,6),
coords(ielem,1,7)
};
const CoordScalarType z[NNode] = {
coords(ielem,2,0),
coords(ielem,2,1),
coords(ielem,2,2),
coords(ielem,2,3),
coords(ielem,2,4),
coords(ielem,2,5),
coords(ielem,2,6),
coords(ielem,2,7)
};
grad( z , y , g );
grad_op(ielem,0,0) = g[0];
grad_op(ielem,0,1) = g[1];
grad_op(ielem,0,2) = g[2];
grad_op(ielem,0,3) = g[3];
grad_op(ielem,0,4) = g[4];
grad_op(ielem,0,5) = g[5];
grad_op(ielem,0,6) = g[6];
grad_op(ielem,0,7) = g[7];
grad( x , z , g );
grad_op(ielem,1,0) = g[0];
grad_op(ielem,1,1) = g[1];
grad_op(ielem,1,2) = g[2];
grad_op(ielem,1,3) = g[3];
grad_op(ielem,1,4) = g[4];
grad_op(ielem,1,5) = g[5];
grad_op(ielem,1,6) = g[6];
grad_op(ielem,1,7) = g[7];
grad( y , x , g );
grad_op(ielem,2,0) = g[0];
grad_op(ielem,2,1) = g[1];
grad_op(ielem,2,2) = g[2];
grad_op(ielem,2,3) = g[3];
grad_op(ielem,2,4) = g[4];
grad_op(ielem,2,5) = g[5];
grad_op(ielem,2,6) = g[6];
grad_op(ielem,2,7) = g[7];
}
//--------------------------------------------------------------------------
struct Init {
typedef typename self_type::execution_space execution_space ;
elem_coord_type coords ;
Init( const elem_coord_type & arg_coords )
: coords( arg_coords ) {}
KOKKOS_INLINE_FUNCTION
void operator()( size_type ielem ) const
{
coords(ielem,0,0) = 0.;
coords(ielem,1,0) = 0.;
coords(ielem,2,0) = 0.;
coords(ielem,0,1) = 1.;
coords(ielem,1,1) = 0.;
coords(ielem,2,1) = 0.;
coords(ielem,0,2) = 1.;
coords(ielem,1,2) = 1.;
coords(ielem,2,2) = 0.;
coords(ielem,0,3) = 0.;
coords(ielem,1,3) = 1.;
coords(ielem,2,3) = 0.;
coords(ielem,0,4) = 0.;
coords(ielem,1,4) = 0.;
coords(ielem,2,4) = 1.;
coords(ielem,0,5) = 1.;
coords(ielem,1,5) = 0.;
coords(ielem,2,5) = 1.;
coords(ielem,0,6) = 1.;
coords(ielem,1,6) = 1.;
coords(ielem,2,6) = 1.;
coords(ielem,0,7) = 0.;
coords(ielem,1,7) = 1.;
coords(ielem,2,7) = 1.;
}
};
//--------------------------------------------------------------------------
static double test( const int count , const int iter = 1 )
{
elem_coord_type coord( "coord" , count );
elem_grad_type grad ( "grad" , count );
// Execute the parallel kernels on the arrays:
double dt_min = 0 ;
Kokkos::parallel_for( count , Init( coord ) );
execution_space::fence();
for ( int i = 0 ; i < iter ; ++i ) {
Kokkos::Impl::Timer timer ;
Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
execution_space::fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
}
return dt_min ;
}
};
}


@@ -0,0 +1,104 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_OPENMP )
typedef Kokkos::OpenMP TestHostDevice ;
const char TestHostDeviceName[] = "Kokkos::OpenMP" ;
#elif defined( KOKKOS_HAVE_PTHREAD )
typedef Kokkos::Threads TestHostDevice ;
const char TestHostDeviceName[] = "Kokkos::Threads" ;
#elif defined( KOKKOS_HAVE_SERIAL )
typedef Kokkos::Serial TestHostDevice ;
const char TestHostDeviceName[] = "Kokkos::Serial" ;
#else
# error "You must enable at least one of the following execution spaces in order to build this test: Kokkos::Threads, Kokkos::OpenMP, or Kokkos::Serial."
#endif
#include <impl/Kokkos_Timer.hpp>
#include <PerfTestHexGrad.hpp>
#include <PerfTestBlasKernels.hpp>
#include <PerfTestGramSchmidt.hpp>
#include <PerfTestDriver.hpp>
//------------------------------------------------------------------------
namespace Test {
class host : public ::testing::Test {
protected:
static void SetUpTestCase()
{
const unsigned team_count = Kokkos::hwloc::get_available_numa_count();
const unsigned threads_per_team = 4 ;
TestHostDevice::initialize( team_count * threads_per_team );
}
static void TearDownTestCase()
{
TestHostDevice::finalize();
}
};
TEST_F( host, hexgrad ) {
EXPECT_NO_THROW(run_test_hexgrad< TestHostDevice>( 10, 20, TestHostDeviceName ));
}
TEST_F( host, gramschmidt ) {
EXPECT_NO_THROW(run_test_gramschmidt< TestHostDevice>( 10, 20, TestHostDeviceName ));
}
} // namespace Test


@@ -0,0 +1,49 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <gtest/gtest.h>
int main(int argc, char *argv[]) {
::testing::InitGoogleTest(&argc,argv);
return RUN_ALL_TESTS();
}


@@ -0,0 +1,504 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Timer.hpp>
typedef Kokkos::DefaultExecutionSpace exec_space;
#define RESET 0
#define BRIGHT 1
#define DIM 2
#define UNDERLINE 3
#define BLINK 4
#define REVERSE 7
#define HIDDEN 8
#define BLACK 0
#define RED 1
#define GREEN 2
#define YELLOW 3
#define BLUE 4
#define MAGENTA 5
#define CYAN 6
#define GREY 7
#define WHITE 8
void textcolor(int attr, int fg, int bg)
{ char command[13];
/* Command is the control command to the terminal */
sprintf(command, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);
printf("%s", command);
}
void textcolor_standard() {textcolor(RESET, BLACK, WHITE);}
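/* Example: textcolor(BRIGHT, RED, BLACK) emits "\033[1;31;40m"
   (attr 1, foreground 1+30, background 0+40). */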
template<class T,class DEVICE_TYPE>
struct ZeroFunctor{
typedef DEVICE_TYPE execution_space;
typedef typename Kokkos::View<T,execution_space> type;
typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
data() = 0;
}
};
//---------------------------------------------------
//--------------atomic_fetch_add---------------------
//---------------------------------------------------
template<class T,class DEVICE_TYPE>
struct AddFunctor{
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T,execution_space> type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
Kokkos::atomic_fetch_add(&data(),(T)1);
}
};
template<class T>
T AddLoop(int loop) {
struct ZeroFunctor<T,exec_space> f_zero;
typename ZeroFunctor<T,exec_space>::type data("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
struct AddFunctor<T,exec_space> f_add;
f_add.data = data;
Kokkos::parallel_for(loop,f_add);
exec_space::fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
return val;
}
template<class T,class DEVICE_TYPE>
struct AddNonAtomicFunctor{
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T,execution_space> type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
data()+=(T)1;
}
};
template<class T>
T AddLoopNonAtomic(int loop) {
struct ZeroFunctor<T,exec_space> f_zero;
typename ZeroFunctor<T,exec_space>::type data("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
struct AddNonAtomicFunctor<T,exec_space> f_add;
f_add.data = data;
Kokkos::parallel_for(loop,f_add);
exec_space::fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
return val;
}
template<class T>
T AddLoopSerial(int loop) {
T* data = new T[1];
data[0] = 0;
for(int i=0;i<loop;i++)
*data+=(T)1;
T val = *data;
delete [] data;
return val;
}
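//---------------------------------------------------
//----------atomic_compare_exchange (CAS)------------
//---------------------------------------------------
// Classic CAS-loop increment: retry until the value read before the
// update ("assumed") matches what atomic_compare_exchange observed,
// i.e. no other thread modified data() in between.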
template<class T,class DEVICE_TYPE>
struct CASFunctor{
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T,execution_space> type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
T old = data();
T newval, assumed;
do {
assumed = old;
newval = assumed + (T)1;
old = Kokkos::atomic_compare_exchange(&data(), assumed, newval);
}
while( old != assumed );
}
};
template<class T>
T CASLoop(int loop) {
struct ZeroFunctor<T,exec_space> f_zero;
typename ZeroFunctor<T,exec_space>::type data("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
struct CASFunctor<T,exec_space> f_cas;
f_cas.data = data;
Kokkos::parallel_for(loop,f_cas);
exec_space::fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
return val;
}
template<class T,class DEVICE_TYPE>
struct CASNonAtomicFunctor{
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T,execution_space> type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
volatile T assumed;
volatile T newval;
bool fail=1;
do {
assumed = data();
newval = assumed + (T)1;
if(data()==assumed) {
data() = newval;
fail = 0;
}
}
while(fail);
}
};
template<class T>
T CASLoopNonAtomic(int loop) {
struct ZeroFunctor<T,exec_space> f_zero;
typename ZeroFunctor<T,exec_space>::type data("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
struct CASNonAtomicFunctor<T,exec_space> f_cas;
f_cas.data = data;
Kokkos::parallel_for(loop,f_cas);
exec_space::fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
return val;
}
template<class T>
T CASLoopSerial(int loop) {
T* data = new T[1];
data[0] = 0;
for(int i=0;i<loop;i++) {
T assumed;
T newval;
T old;
do {
assumed = *data;
newval = assumed + (T)1;
old = *data;
*data = newval;
}
while(!(assumed==old));
}
T val = *data;
delete [] data;
return val;
}
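//---------------------------------------------------
//--------------atomic_exchange----------------------
//---------------------------------------------------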
template<class T,class DEVICE_TYPE>
struct ExchFunctor{
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T,execution_space> type;
type data, data2;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
T old = Kokkos::atomic_exchange(&data(),(T)i);
Kokkos::atomic_fetch_add(&data2(),old);
}
};
template<class T>
T ExchLoop(int loop) {
struct ZeroFunctor<T,exec_space> f_zero;
typename ZeroFunctor<T,exec_space>::type data("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
typename ZeroFunctor<T,exec_space>::type data2("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
f_zero.data = data2;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
struct ExchFunctor<T,exec_space> f_exch;
f_exch.data = data;
f_exch.data2 = data2;
Kokkos::parallel_for(loop,f_exch);
exec_space::fence();
Kokkos::deep_copy(h_data,data);
Kokkos::deep_copy(h_data2,data2);
T val = h_data() + h_data2();
return val;
}
template<class T,class DEVICE_TYPE>
struct ExchNonAtomicFunctor{
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T,execution_space> type;
type data, data2;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
T old = data();
data()=(T) i;
data2()+=old;
}
};
template<class T>
T ExchLoopNonAtomic(int loop) {
struct ZeroFunctor<T,exec_space> f_zero;
typename ZeroFunctor<T,exec_space>::type data("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
typename ZeroFunctor<T,exec_space>::type data2("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
f_zero.data = data2;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
struct ExchNonAtomicFunctor<T,exec_space> f_exch;
f_exch.data = data;
f_exch.data2 = data2;
Kokkos::parallel_for(loop,f_exch);
exec_space::fence();
Kokkos::deep_copy(h_data,data);
Kokkos::deep_copy(h_data2,data2);
T val = h_data() + h_data2();
return val;
}
template<class T>
T ExchLoopSerial(int loop) {
T* data = new T[1];
T* data2 = new T[1];
data[0] = 0;
data2[0] = 0;
for(int i=0;i<loop;i++) {
T old = *data;
*data=(T) i;
*data2+=old;
}
T val = *data2 + *data;
delete [] data;
delete [] data2;
return val;
}
template<class T>
T LoopVariant(int loop, int test) {
switch (test) {
case 1: return AddLoop<T>(loop);
case 2: return CASLoop<T>(loop);
case 3: return ExchLoop<T>(loop);
}
return 0;
}
template<class T>
T LoopVariantSerial(int loop, int test) {
switch (test) {
case 1: return AddLoopSerial<T>(loop);
case 2: return CASLoopSerial<T>(loop);
case 3: return ExchLoopSerial<T>(loop);
}
return 0;
}
template<class T>
T LoopVariantNonAtomic(int loop, int test) {
switch (test) {
case 1: return AddLoopNonAtomic<T>(loop);
case 2: return CASLoopNonAtomic<T>(loop);
case 3: return ExchLoopNonAtomic<T>(loop);
}
return 0;
}
template<class T>
void Loop(int loop, int test, const char* type_name) {
LoopVariant<T>(loop,test);
Kokkos::Impl::Timer timer;
T res = LoopVariant<T>(loop,test);
double time1 = timer.seconds();
timer.reset();
T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
double time2 = timer.seconds();
timer.reset();
T resSerial = LoopVariantSerial<T>(loop,test);
double time3 = timer.seconds();
time1*=1e6/loop;
time2*=1e6/loop;
time3*=1e6/loop;
//textcolor_standard();
bool passed = true;
if(resSerial!=res) passed = false;
//if(!passed) textcolor(RESET,BLACK,YELLOW);
printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T));
//if(!passed) textcolor_standard();
printf("\n");
}
template<class T>
void Test(int loop, int test, const char* type_name) {
if(test==-1) {
Loop<T>(loop,1,type_name);
Loop<T>(loop,2,type_name);
Loop<T>(loop,3,type_name);
}
else
Loop<T>(loop,test,type_name);
}
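// Command-line options (parsed below): --type N selects the scalar type
// (1-6: integer variants, 11: double; float (10) is disabled), --test
// selects the loop variant (1=add, 2=CAS, 3=exchange), and -l/--loop N
// sets the iteration count (default 1000000). With no options, all types
// and all tests are run.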
int main(int argc, char* argv[])
{
int type = -1;
int loop = 1000000;
int test = -1;
for(int i=0;i<argc;i++)
{
if((strcmp(argv[i],"--test")==0)) {test=atoi(argv[++i]); continue;}
if((strcmp(argv[i],"--type")==0)) {type=atoi(argv[++i]); continue;}
if((strcmp(argv[i],"-l")==0)||(strcmp(argv[i],"--loop")==0)) {loop=atoi(argv[++i]); continue;}
}
Kokkos::initialize(argc,argv);
printf("Using %s\n",Kokkos::atomic_query_version());
bool all_tests = false;
if(type==-1) all_tests = true;
while(type<100) {
if(type==1) {
Test<int>(loop,test,"int ");
}
if(type==2) {
Test<long int>(loop,test,"long int ");
}
if(type==3) {
Test<long long int>(loop,test,"long long int ");
}
if(type==4) {
Test<unsigned int>(loop,test,"unsigned int ");
}
if(type==5) {
Test<unsigned long int>(loop,test,"unsigned long int ");
}
if(type==6) {
Test<unsigned long long int>(loop,test,"unsigned long long int ");
}
if(type==10) {
//Test<float>(loop,test,"float ");
}
if(type==11) {
Test<double>(loop,test,"double ");
}
if(!all_tests) type=100;
else type++;
}
Kokkos::finalize();
}


@@ -0,0 +1,283 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
/* only compile this file if CUDA is enabled for Kokkos */
#if defined( KOKKOS_HAVE_CUDA )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
//----------------------------------------------------------------------------
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
// Any other scalar type falls back to either normal reads out of global memory,
// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
template< typename ValueType , typename AliasType >
struct CudaTextureFetch {
::cudaTextureObject_t m_obj ;
const ValueType * m_ptr ;
int m_offset ;
// Dereference operator pulls through the texture object and returns by value
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
return *(reinterpret_cast<ValueType*> (&v));
#else
return m_ptr[ i ];
#endif
}
// Pointer to referenced memory
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_ptr( rhs.m_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( CudaTextureFetch && rhs )
: m_obj( rhs.m_obj )
, m_ptr( rhs.m_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_ptr = rhs.m_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( CudaTextureFetch && rhs )
{
m_obj = rhs.m_obj ;
m_ptr = rhs.m_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
// Texture object spans the entire allocation.
// This handle may view a subset of the allocation, so an offset is required.
template< class CudaMemorySpace >
inline explicit
CudaTextureFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
)
// 'attach_texture_object' returns 0 when __CUDA_ARCH__ < 300
: m_obj( record.template attach_texture_object< AliasType >() )
, m_ptr( arg_ptr )
, m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
{}
};
#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
template< typename ValueType , typename AliasType >
struct CudaLDGFetch {
const ValueType * m_ptr ;
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
KOKKOS_INLINE_FUNCTION
CudaLDGFetch() : m_ptr() {}
KOKKOS_INLINE_FUNCTION
~CudaLDGFetch() {}
KOKKOS_INLINE_FUNCTION
CudaLDGFetch( const CudaLDGFetch & rhs )
: m_ptr( rhs.m_ptr )
{}
KOKKOS_INLINE_FUNCTION
CudaLDGFetch( CudaLDGFetch && rhs )
: m_ptr( rhs.m_ptr )
{}
KOKKOS_INLINE_FUNCTION
CudaLDGFetch & operator = ( const CudaLDGFetch & rhs )
{
m_ptr = rhs.m_ptr ;
return *this ;
}
KOKKOS_INLINE_FUNCTION
CudaLDGFetch & operator = ( CudaLDGFetch && rhs )
{
m_ptr = rhs.m_ptr ;
return *this ;
}
template< class CudaMemorySpace >
inline explicit
CudaLDGFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
)
: m_ptr( arg_ptr )
{}
};
#endif
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
* if 'const' value type, CudaSpace and random access.
*/
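// For example, a view declared as
//   Kokkos::View< const float * , Kokkos::CudaSpace
//               , Kokkos::MemoryTraits< Kokkos::RandomAccess > >
// satisfies every condition below (trivial 4-byte const value type in a
// Cuda memory space with the RandomAccess trait) and therefore receives
// the texture-fetch (or LDG) data handle instead of a raw pointer.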
template< class Traits >
class ViewDataHandle< Traits ,
typename std::enable_if<(
// Is Cuda memory space
( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value ||
std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value )
&&
// Is a trivial const value of 4, 8, or 16 bytes
std::is_trivial<typename Traits::const_value_type>::value
&&
std::is_same<typename Traits::const_value_type,typename Traits::value_type>::value
&&
( sizeof(typename Traits::const_value_type) == 4 ||
sizeof(typename Traits::const_value_type) == 8 ||
sizeof(typename Traits::const_value_type) == 16 )
&&
// Random access trait
( Traits::memory_traits::RandomAccess != 0 )
)>::type >
{
public:
using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ;
using value_type = typename Traits::const_value_type ;
using return_type = typename Traits::const_value_type ; // NOT a reference
using alias_type = typename std::conditional< ( sizeof(value_type) == 4 ) , int ,
typename std::conditional< ( sizeof(value_type) == 8 ) , ::int2 ,
typename std::conditional< ( sizeof(value_type) == 16 ) , ::int4 , void
>::type
>::type
>::type ;
#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
using handle_type = Kokkos::Experimental::Impl::CudaLDGFetch< value_type , alias_type > ;
#else
using handle_type = Kokkos::Experimental::Impl::CudaTextureFetch< value_type , alias_type > ;
#endif
KOKKOS_INLINE_FUNCTION
static handle_type const & assign( handle_type const & arg_handle , track_type const & /* arg_tracker */ )
{
return arg_handle ;
}
KOKKOS_INLINE_FUNCTION
static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker )
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Assignment of texture = non-texture requires creation of a texture object
// which can only occur on the host. In addition, 'get_record' is only valid
// if called in a host execution space
return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
#else
Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
return handle_type();
#endif
}
};
}
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */

View File

@@ -0,0 +1,277 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDAEXEC_HPP
#define KOKKOS_CUDAEXEC_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <string>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
struct CudaTraits {
enum { WarpSize = 32 /* 0x0020 */ };
enum { WarpIndexMask = 0x001f /* Mask for warp index */ };
enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpIndexShift */ };
enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ };
enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ };
enum { UpperBoundGridCount = 65535 /* Hard upper bound */ };
enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ };
enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ };
enum { ConstantMemoryCache = 0x002000 /* 8k bytes */ };
typedef unsigned long
ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];
enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_count( CudaSpace::size_type i )
{ return ( i + WarpIndexMask ) >> WarpIndexShift ; }
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_align( CudaSpace::size_type i )
{
enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) };
return ( i + WarpIndexMask ) & Mask ;
}
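// Illustrative arithmetic: warp_count(33) == 2 and warp_align(33) == 64 ,
// since 33 threads occupy two 32-thread warps.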
};
//----------------------------------------------------------------------------
CudaSpace::size_type cuda_internal_maximum_warp_count();
CudaSpace::size_type cuda_internal_maximum_grid_count();
CudaSpace::size_type cuda_internal_maximum_shared_words();
CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDACC__ )
/** \brief Access to constant memory on the device */
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
extern
#endif
__device__ __constant__
Kokkos::Impl::CudaTraits::ConstantGlobalBufferType
kokkos_impl_cuda_constant_memory_buffer ;
__device__ __constant__
int* kokkos_impl_cuda_atomic_lock_array ;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
namespace Kokkos {
namespace Impl {
__device__ inline
bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
//offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
return (0 == atomicCAS(&kokkos_impl_cuda_atomic_lock_array[offset],0,1));
}
__device__ inline
void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
//offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
atomicExch( &kokkos_impl_cuda_atomic_lock_array[ offset ], 0);
}
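// Illustrative usage (a sketch, not an exact Kokkos call site): a thread
// performs its non-atomic update only after a successful acquire, e.g.
//   bool done = false ;
//   while ( ! done ) {
//     if ( lock_address_cuda_space( ptr ) ) {
//       /* non-atomic update of *ptr */
//       unlock_address_cuda_space( ptr );
//       done = true ;
//     }
//   }
// Distinct addresses may hash to the same lock entry, which merely
// serializes those updates; it does not affect correctness.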
}
}
template< typename T >
inline
__device__
T * kokkos_impl_cuda_shared_memory()
{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; }
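// Example: obtain a typed pointer to the kernel's dynamic shared memory,
//   double * scratch = kokkos_impl_cuda_shared_memory<double>();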
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template< class DriverType >
__global__
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType >
__global__
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template < class DriverType ,
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;
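// Example of the 'Large' selection: a 600-byte functor exceeds
// ConstantMemoryUseThreshold (512 bytes), so Large == true and the functor
// is launched through the constant-memory specialization below, while a
// 64-byte functor is passed by value as a kernel argument.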
template < class DriverType >
struct CudaParallelLaunch< DriverType , true > {
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
else if ( shmem ) {
cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared );
} else {
cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 );
}
// Copy functor to constant memory on the device
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
Kokkos::Cuda::fence();
CUDA_SAFE_CALL( cudaGetLastError() );
#endif
}
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType , false > {
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
else if ( shmem ) {
cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared );
} else {
cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 );
}
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
Kokkos::Cuda::fence();
CUDA_SAFE_CALL( cudaGetLastError() );
#endif
}
}
};
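// Illustrative call site (a sketch; grid, block, and shmem are computed
// by the parallel dispatch that instantiates the driver):
//   CudaParallelLaunch< DriverType >( driver , grid , block , shmem );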
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* defined( __CUDACC__ ) */
#endif /* defined( KOKKOS_HAVE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */

View File

@@ -0,0 +1,670 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <Kokkos_Cuda.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
DeepCopy<CudaSpace,CudaSpace>::DeepCopy( void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
DeepCopy<CudaSpace,CudaSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
DeepCopy<HostSpace,CudaSpace>::DeepCopy( void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
DeepCopy<HostSpace,CudaSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
DeepCopy<CudaSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
DeepCopy<CudaSpace,HostSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace {
void texture_object_attach_impl( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
enum { TEXTURE_BOUND_1D = 2u << 27 };
if ( tracker.attribute() == NULL ) {
// check for correct allocator
const bool ok_alloc = tracker.allocator()->support_texture_binding();
const bool ok_count = (tracker.alloc_size() / type_size) < TEXTURE_BOUND_1D;
if (ok_alloc && ok_count) {
Impl::TextureAttribute * attr = new Impl::TextureAttribute( tracker.alloc_ptr(), tracker.alloc_size(), desc );
tracker.set_attribute( attr );
}
else {
std::ostringstream oss;
oss << "Error: Cannot attach texture object";
if (!ok_alloc) {
oss << ", incompatabile allocator " << tracker.allocator()->name();
}
if (!ok_count) {
oss << ", array " << tracker.label() << " too large";
}
oss << ".";
Kokkos::Impl::throw_runtime_exception( oss.str() );
}
}
if ( NULL == dynamic_cast<Impl::TextureAttribute *>(tracker.attribute()) ) {
std::ostringstream oss;
oss << "Error: Allocation " << tracker.label() << " already has an attribute attached.";
Kokkos::Impl::throw_runtime_exception( oss.str() );
}
}
} // unnamed namespace
/*--------------------------------------------------------------------------*/
Impl::AllocationTracker CudaSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
void CudaSpace::texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
texture_object_attach_impl( tracker, type_size, desc );
}
void CudaSpace::access_error()
{
const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
Kokkos::Impl::throw_runtime_exception( msg );
}
void CudaSpace::access_error( const void * const )
{
const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
Kokkos::Impl::throw_runtime_exception( msg );
}
/*--------------------------------------------------------------------------*/
Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
void CudaUVMSpace::texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
texture_object_attach_impl( tracker, type_size, desc );
}
bool CudaUVMSpace::available()
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
enum { UVM_available = true };
#else
enum { UVM_available = false };
#endif
return UVM_available;
}
/*--------------------------------------------------------------------------*/
Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
CudaSpace::CudaSpace()
: m_device( Kokkos::Cuda().cuda_device() )
{
}
CudaUVMSpace::CudaUVMSpace()
: m_device( Kokkos::Cuda().cuda_device() )
{
}
CudaHostPinnedSpace::CudaHostPinnedSpace()
{
}
void * CudaSpace::allocate( const size_t arg_alloc_size ) const
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMalloc( &ptr, arg_alloc_size ) );
return ptr ;
}
void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
return ptr ;
}
void * CudaHostPinnedSpace::allocate( const size_t arg_alloc_size ) const
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaHostAlloc( &ptr, arg_alloc_size , cudaHostAllocDefault ) );
return ptr ;
}
void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
try {
CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
} catch(...) {}
}
void CudaUVMSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
try {
CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
} catch(...) {}
}
void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
try {
CUDA_SAFE_CALL( cudaFreeHost( arg_alloc_ptr ) );
} catch(...) {}
}
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::CudaSpace , void >::s_root_record ;
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::s_root_record ;
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::s_root_record ;
::cudaTextureObject_t
SharedAllocationRecord< Kokkos::CudaSpace , void >::
attach_texture_object( const unsigned sizeof_alias
, void * const alloc_ptr
, size_t const alloc_size )
{
// Only valid for 300 <= __CUDA_ARCH__ ;
// otherwise this returns zero.
::cudaTextureObject_t tex_obj ;
struct cudaResourceDesc resDesc ;
struct cudaTextureDesc texDesc ;
memset( & resDesc , 0 , sizeof(resDesc) );
memset( & texDesc , 0 , sizeof(texDesc) );
resDesc.resType = cudaResourceTypeLinear ;
resDesc.res.linear.desc = ( sizeof_alias == 4 ? cudaCreateChannelDesc< int >() :
( sizeof_alias == 8 ? cudaCreateChannelDesc< ::int2 >() :
/* sizeof_alias == 16 */ cudaCreateChannelDesc< ::int4 >() ) );
resDesc.res.linear.sizeInBytes = alloc_size ;
resDesc.res.linear.devPtr = alloc_ptr ;
CUDA_SAFE_CALL( cudaCreateTextureObject( & tex_obj , & resDesc, & texDesc, NULL ) );
return tex_obj ;
}
std::string
SharedAllocationRecord< Kokkos::CudaSpace , void >::get_label() const
{
SharedAllocationHeader header ;
Kokkos::Impl::DeepCopy< Kokkos::HostSpace , Kokkos::CudaSpace >( & header , RecordBase::head() , sizeof(SharedAllocationHeader) );
return std::string( header.m_label );
}
std::string
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_label() const
{
return std::string( RecordBase::head()->m_label );
}
std::string
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_label() const
{
return std::string( RecordBase::head()->m_label );
}
SharedAllocationRecord< Kokkos::CudaSpace , void > *
SharedAllocationRecord< Kokkos::CudaSpace , void >::
allocate( const Kokkos::CudaSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
}
SharedAllocationRecord< Kokkos::CudaUVMSpace , void > *
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
allocate( const Kokkos::CudaUVMSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
}
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > *
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
allocate( const Kokkos::CudaHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
}
void
SharedAllocationRecord< Kokkos::CudaSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
delete static_cast<SharedAllocationRecord*>(arg_rec);
}
void
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
delete static_cast<SharedAllocationRecord*>(arg_rec);
}
void
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
delete static_cast<SharedAllocationRecord*>(arg_rec);
}
SharedAllocationRecord< Kokkos::CudaSpace , void >::
~SharedAllocationRecord()
{
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
}
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
~SharedAllocationRecord()
{
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
}
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
~SharedAllocationRecord()
{
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
}
SharedAllocationRecord< Kokkos::CudaSpace , void >::
SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
)
// Pass through allocated [ SharedAllocationHeader , user_memory ]
// Pass through deallocation function
: SharedAllocationRecord< void , void >
( & SharedAllocationRecord< Kokkos::CudaSpace , void >::s_root_record
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
, sizeof(SharedAllocationHeader) + arg_alloc_size
, arg_dealloc
)
, m_tex_obj( 0 )
, m_space( arg_space )
{
SharedAllocationHeader header ;
// Fill in the Header information
header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
strncpy( header.m_label
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
// Copy to device memory
Kokkos::Impl::DeepCopy<CudaSpace,HostSpace>::DeepCopy( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) );
}
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
)
// Pass through allocated [ SharedAllocationHeader , user_memory ]
// Pass through deallocation function
: SharedAllocationRecord< void , void >
( & SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::s_root_record
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
, sizeof(SharedAllocationHeader) + arg_alloc_size
, arg_dealloc
)
, m_tex_obj( 0 )
, m_space( arg_space )
{
// Fill in the Header information, directly accessible via UVM
RecordBase::m_alloc_ptr->m_record = this ;
strncpy( RecordBase::m_alloc_ptr->m_label
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
}
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
)
// Pass through allocated [ SharedAllocationHeader , user_memory ]
// Pass through deallocation function
: SharedAllocationRecord< void , void >
( & SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::s_root_record
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
, sizeof(SharedAllocationHeader) + arg_alloc_size
, arg_dealloc
)
, m_space( arg_space )
{
// Fill in the Header information, directly accessible via UVM
RecordBase::m_alloc_ptr->m_record = this ;
strncpy( RecordBase::m_alloc_ptr->m_label
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
}
SharedAllocationRecord< Kokkos::CudaSpace , void > *
SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordBase = SharedAllocationRecord< void , void > ;
using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ;
#if 0
// Copy the header from the allocation
SharedAllocationHeader head ;
SharedAllocationHeader const * const head_cuda = Header::get_header( alloc_ptr );
Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , head_cuda , sizeof(SharedAllocationHeader) );
RecordCuda * const record = static_cast< RecordCuda * >( head.m_record );
if ( record->m_alloc_ptr != head_cuda ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
}
#else
// Iterate the list to search for the record among all allocations
// requires obtaining the root of the list and then locking the list.
RecordCuda * const record = static_cast< RecordCuda * >( RecordBase::find( & s_root_record , alloc_ptr ) );
if ( record == 0 ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
}
#endif
return record ;
}
SharedAllocationRecord< Kokkos::CudaUVMSpace , void > *
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordCuda = SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ;
Header * const h = reinterpret_cast< Header * >( alloc_ptr ) - 1 ;
if ( h->m_record->m_alloc_ptr != h ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) );
}
return static_cast< RecordCuda * >( h->m_record );
}
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > *
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordCuda = SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > ;
Header * const h = reinterpret_cast< Header * >( alloc_ptr ) - 1 ;
if ( h->m_record->m_alloc_ptr != h ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) );
}
return static_cast< RecordCuda * >( h->m_record );
}
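// Illustrative layout assumed by the pointer arithmetic above:
//   [ SharedAllocationHeader | user allocation ... ]
// The header sits immediately before the pointer handed to the user, so
// 'reinterpret_cast<Header*>(alloc_ptr) - 1' recovers the header and,
// through it, the record.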
// Iterate records to print orphaned memory ...
void
SharedAllocationRecord< Kokkos::CudaSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail )
{
SharedAllocationRecord< void , void > * r = & s_root_record ;
char buffer[256] ;
SharedAllocationHeader head ;
if ( detail ) {
do {
if ( r->m_alloc_ptr ) {
Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
}
else {
head.m_label[0] = 0 ;
}
snprintf( buffer , 256 , "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"
, reinterpret_cast<unsigned long>( r )
, reinterpret_cast<unsigned long>( r->m_prev )
, reinterpret_cast<unsigned long>( r->m_next )
, reinterpret_cast<unsigned long>( r->m_alloc_ptr )
, r->m_alloc_size
, r->m_count
, reinterpret_cast<unsigned long>( r->m_dealloc )
, head.m_label
);
s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
else {
do {
if ( r->m_alloc_ptr ) {
Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
snprintf( buffer , 256 , "Cuda [ 0x%.12lx + %ld ] %s\n"
, reinterpret_cast< unsigned long >( r->data() )
, r->size()
, head.m_label
);
}
else {
snprintf( buffer , 256 , "Cuda [ 0 + 0 ]\n" );
}
s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
}
void
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaUVMSpace & space , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaUVM" , & s_root_record , detail );
}
void
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
}
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace {
__global__ void init_lock_array_kernel() {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<CUDA_SPACE_ATOMIC_MASK+1)
kokkos_impl_cuda_atomic_lock_array[i] = 0;
}
}
namespace Impl {
int* lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1));
return ptr;
}
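// With CUDA_SPACE_ATOMIC_MASK == 0x1FFFF the function above lazily
// allocates 0x20000 == 131072 int lock entries ( 512 KiB ) on first use.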
void init_lock_array_cuda_space() {
static int is_initialized = 0;
if(! is_initialized) {
is_initialized = 1;
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
init_lock_array_kernel<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
}
}
}
}
#endif // KOKKOS_HAVE_CUDA

View File

@@ -0,0 +1,183 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
#define KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
namespace Kokkos {
namespace Impl {
template< class DestructFunctor >
SharedAllocationRecord *
shared_allocation_record( Kokkos::CudaSpace const & arg_space
, void * const arg_alloc_ptr
, DestructFunctor const & arg_destruct )
{
SharedAllocationRecord * const record = SharedAllocationRecord::get_record( arg_alloc_ptr );
// assert: record != 0
// assert: sizeof(DestructFunctor) <= record->m_destruct_size
// assert: record->m_destruct_function == 0
DestructFunctor * const functor =
reinterpret_cast< DestructFunctor * >(
reinterpret_cast< unsigned long >( record ) + sizeof(SharedAllocationRecord) );
new( functor ) DestructFunctor( arg_destruct );
record->m_destruct_functor = & shared_allocation_destroy< DestructFunctor > ;
return record ;
}
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedAllocator
{
static const char * name()
{
return "Cuda Unmanaged Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedUVMAllocator
{
static const char * name()
{
return "Cuda Unmanaged UVM Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedHostAllocator
/// does nothing when deallocate(ptr,size) is called
class CudaUnmanagedHostAllocator
{
public:
static const char * name()
{
return "Cuda Unmanaged Host Allocator";
}
// Unmanaged deallocate does nothing
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
};
/// class CudaMallocAllocator
class CudaMallocAllocator
{
public:
static const char * name()
{
return "Cuda Malloc Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaUVMAllocator
class CudaUVMAllocator
{
public:
static const char * name()
{
return "Cuda UVM Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaHostAllocator
class CudaHostAllocator
{
public:
static const char * name()
{
return "Cuda Host Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif // #ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP

View File

@@ -0,0 +1,192 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <sstream>
namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
TextureAttribute::TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
)
: m_tex_obj(0)
{
cuda_device_synchronize();
struct cudaResourceDesc resDesc ;
struct cudaTextureDesc texDesc ;
memset( & resDesc , 0 , sizeof(resDesc) );
memset( & texDesc , 0 , sizeof(texDesc) );
resDesc.resType = cudaResourceTypeLinear ;
resDesc.res.linear.desc = desc ;
resDesc.res.linear.sizeInBytes = alloc_size ;
resDesc.res.linear.devPtr = alloc_ptr ;
CUDA_SAFE_CALL( cudaCreateTextureObject( & m_tex_obj , & resDesc, & texDesc, NULL) );
cuda_device_synchronize();
}
TextureAttribute::~TextureAttribute()
{
if (m_tex_obj) {
cudaDestroyTextureObject( m_tex_obj );
}
}
/*--------------------------------------------------------------------------*/
void * CudaMallocAllocator::allocate( size_t size )
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMalloc( &ptr, size ) );
return ptr;
}
void CudaMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFree( ptr ) );
} catch(...) {}
}
void * CudaMallocAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
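// Illustrative semantics of reallocate above: shrinking from 1024 to 256
// bytes copies the first 256 bytes into a fresh allocation and frees the
// old one; growing copies all of the old bytes.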
/*--------------------------------------------------------------------------*/
void * CudaUVMAllocator::allocate( size_t size )
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION )
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, size, cudaMemAttachGlobal ) );
return ptr;
#else
throw_runtime_exception( "CUDA VERSION does not support UVM" );
return NULL;
#endif
}
void CudaUVMAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFree( ptr ) );
} catch(...) {}
}
void * CudaUVMAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
void * CudaHostAllocator::allocate( size_t size )
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaHostAlloc( &ptr , size , cudaHostAllocDefault ) );
return ptr;
}
void CudaHostAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFreeHost( ptr ) );
} catch(...) {}
}
void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyHostToHost ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA

View File

@@ -0,0 +1,187 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
#define KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
namespace Kokkos { namespace Impl {
// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t'
// to be an 'unsigned long long'. This could change with
// future versions of Cuda and this typedef would have to
// change accordingly.
#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
typedef enable_if<
sizeof(::cudaTextureObject_t) == sizeof(const void *) ,
::cudaTextureObject_t >::type cuda_texture_object_type ;
#else
typedef const void * cuda_texture_object_type ;
#endif
struct TextureAttribute : public AllocatorAttributeBase
{
cuda_texture_object_type m_tex_obj ;
TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
);
~TextureAttribute();
};
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedAllocator
{
static const char * name()
{
return "Cuda Unmanaged Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedUVMAllocator
{
static const char * name()
{
return "Cuda Unmanaged UVM Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedHostAllocator
/// does nothing when deallocate(ptr,size) is called
class CudaUnmanagedHostAllocator
{
public:
static const char * name()
{
return "Cuda Unmanaged Host Allocator";
}
// Unmanaged deallocate does nothing
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
};
/// class CudaMallocAllocator
class CudaMallocAllocator
{
public:
static const char * name()
{
return "Cuda Malloc Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaUVMAllocator
class CudaUVMAllocator
{
public:
static const char * name()
{
return "Cuda UVM Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaHostAllocator
class CudaHostAllocator
{
public:
static const char * name()
{
return "Cuda Host Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP

View File

@@ -0,0 +1,69 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_ERROR_HPP
#define KOKKOS_CUDA_ERROR_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
namespace Kokkos { namespace Impl {
void cuda_device_synchronize();
void cuda_internal_error_throw( cudaError e , const char * name, const char * file = NULL, const int line = 0 );
inline void cuda_internal_safe_call( cudaError e , const char * name, const char * file = NULL, const int line = 0)
{
if ( cudaSuccess != e ) { cuda_internal_error_throw( e , name, file, line ); }
}
#define CUDA_SAFE_CALL( call ) \
Kokkos::Impl::cuda_internal_safe_call( call , #call, __FILE__, __LINE__ )
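// Example usage; a failing call throws with the call text, file, and line:
//   CUDA_SAFE_CALL( cudaMalloc( & ptr , nbytes ) );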
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif //KOKKOS_CUDA_ERROR_HPP

View File

@@ -0,0 +1,678 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/*--------------------------------------------------------------------------*/
/* Kokkos interfaces */
#include <Kokkos_Core.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Error.hpp>
/*--------------------------------------------------------------------------*/
/* Standard 'C' libraries */
#include <stdlib.h>
/* Standard 'C++' libraries */
#include <vector>
#include <iostream>
#include <sstream>
#include <string>
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
__device__ __constant__
Kokkos::Impl::CudaTraits::ConstantGlobalBufferType
kokkos_impl_cuda_constant_memory_buffer ;
#endif
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace {
__global__
void query_cuda_kernel_arch( int * d_arch )
{
#if defined( __CUDA_ARCH__ )
*d_arch = __CUDA_ARCH__ ;
#else
*d_arch = 0 ;
#endif
}
/** Query the compute capability that is actually launched on the device: */
int cuda_kernel_arch()
{
int * d_arch = 0 ;
cudaMalloc( (void **) & d_arch , sizeof(int) );
query_cuda_kernel_arch<<<1,1>>>( d_arch );
int arch = 0 ;
cudaMemcpy( & arch , d_arch , sizeof(int) , cudaMemcpyDefault );
cudaFree( d_arch );
return arch ;
}
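// Example: a kernel compiled for and launched on a compute capability 3.5
// device reports __CUDA_ARCH__ == 350, so cuda_kernel_arch() returns 350.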
bool cuda_launch_blocking()
{
const char * env = getenv("CUDA_LAUNCH_BLOCKING");
if (env == 0) return false;
return atoi(env);
}
}
void cuda_device_synchronize()
{
// static const bool launch_blocking = cuda_launch_blocking();
// if (!launch_blocking) {
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
// }
}
void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
{
std::ostringstream out ;
out << name << " error( " << cudaGetErrorName(e) << "): " << cudaGetErrorString(e);
if (file) {
out << " " << file << ":" << line;
}
throw_runtime_exception( out.str() );
}
//----------------------------------------------------------------------------
// Some significant cuda device properties:
//
// cudaDeviceProp::name : Text label for device
// cudaDeviceProp::major : Device major number
// cudaDeviceProp::minor : Device minor number
// cudaDeviceProp::warpSize : number of threads per warp
// cudaDeviceProp::multiProcessorCount : number of multiprocessors
// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block
// cudaDeviceProp::totalConstMem : capacity of constant memory
// cudaDeviceProp::totalGlobalMem : capacity of global memory
// cudaDeviceProp::maxGridSize[3] : maximum grid size
//
// Section 4.4.2.4 of the CUDA Toolkit Reference Manual
//
// struct cudaDeviceProp {
// char name[256];
// size_t totalGlobalMem;
// size_t sharedMemPerBlock;
// int regsPerBlock;
// int warpSize;
// size_t memPitch;
// int maxThreadsPerBlock;
// int maxThreadsDim[3];
// int maxGridSize[3];
// size_t totalConstMem;
// int major;
// int minor;
// int clockRate;
// size_t textureAlignment;
// int deviceOverlap;
// int multiProcessorCount;
// int kernelExecTimeoutEnabled;
// int integrated;
// int canMapHostMemory;
// int computeMode;
// int concurrentKernels;
// int ECCEnabled;
// int pciBusID;
// int pciDeviceID;
// int tccDriver;
// int asyncEngineCount;
// int unifiedAddressing;
// int memoryClockRate;
// int memoryBusWidth;
// int l2CacheSize;
// int maxThreadsPerMultiProcessor;
// };
namespace {
class CudaInternalDevices {
public:
enum { MAXIMUM_DEVICE_COUNT = 8 };
struct cudaDeviceProp m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ;
int m_cudaDevCount ;
CudaInternalDevices();
static const CudaInternalDevices & singleton();
};
CudaInternalDevices::CudaInternalDevices()
{
// See 'cudaSetDeviceFlags' for host-device thread interaction
// Section 4.4.2.6 of the CUDA Toolkit Reference Manual
CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
}
}
const CudaInternalDevices & CudaInternalDevices::singleton()
{
static CudaInternalDevices self ; return self ;
}
}
//----------------------------------------------------------------------------
class CudaInternal {
private:
CudaInternal( const CudaInternal & );
CudaInternal & operator = ( const CudaInternal & );
AllocationTracker m_scratchFlagsTracker;
AllocationTracker m_scratchSpaceTracker;
AllocationTracker m_scratchUnifiedTracker;
public:
typedef Cuda::size_type size_type ;
int m_cudaDev ;
int m_cudaArch ;
unsigned m_maxWarpCount ;
unsigned m_maxBlock ;
unsigned m_maxSharedWords ;
size_type m_scratchSpaceCount ;
size_type m_scratchFlagsCount ;
size_type m_scratchUnifiedCount ;
size_type m_scratchUnifiedSupported ;
size_type m_streamCount ;
size_type * m_scratchSpace ;
size_type * m_scratchFlags ;
size_type * m_scratchUnified ;
cudaStream_t * m_stream ;
static CudaInternal & singleton();
int verify_is_initialized( const char * const label ) const ;
int is_initialized() const
{ return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
void initialize( int cuda_device_id , int stream_count );
void finalize();
void print_configuration( std::ostream & ) const ;
~CudaInternal();
CudaInternal()
: m_cudaDev( -1 )
, m_cudaArch( -1 )
, m_maxWarpCount( 0 )
, m_maxBlock( 0 )
, m_maxSharedWords( 0 )
, m_scratchSpaceCount( 0 )
, m_scratchFlagsCount( 0 )
, m_scratchUnifiedCount( 0 )
, m_scratchUnifiedSupported( 0 )
, m_streamCount( 0 )
, m_scratchSpace( 0 )
, m_scratchFlags( 0 )
, m_scratchUnified( 0 )
, m_stream( 0 )
{}
size_type * scratch_space( const size_type size );
size_type * scratch_flags( const size_type size );
size_type * scratch_unified( const size_type size );
};
//----------------------------------------------------------------------------
void CudaInternal::print_configuration( std::ostream & s ) const
{
const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
#if defined( KOKKOS_HAVE_CUDA )
s << "macro KOKKOS_HAVE_CUDA : defined" << std::endl ;
#endif
#if defined( CUDA_VERSION )
s << "macro CUDA_VERSION = " << CUDA_VERSION
<< " = version " << CUDA_VERSION / 1000
<< "." << ( CUDA_VERSION % 1000 ) / 10
<< std::endl ;
#endif
for ( int i = 0 ; i < dev_info.m_cudaDevCount ; ++i ) {
s << "Kokkos::Cuda[ " << i << " ] "
<< dev_info.m_cudaProp[i].name
<< " capability " << dev_info.m_cudaProp[i].major << "." << dev_info.m_cudaProp[i].minor
<< ", Total Global Memory: " << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem)
<< ", Shared Memory per Block: " << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock);
if ( m_cudaDev == i ) s << " : Selected" ;
s << std::endl ;
}
}
//----------------------------------------------------------------------------
CudaInternal::~CudaInternal()
{
if ( m_stream ||
m_scratchSpace ||
m_scratchFlags ||
m_scratchUnified ) {
std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
<< std::endl ;
std::cerr.flush();
}
m_cudaDev = -1 ;
m_cudaArch = -1 ;
m_maxWarpCount = 0 ;
m_maxBlock = 0 ;
m_maxSharedWords = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchUnifiedCount = 0 ;
m_scratchUnifiedSupported = 0 ;
m_streamCount = 0 ;
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
m_scratchUnified = 0 ;
m_stream = 0 ;
}
int CudaInternal::verify_is_initialized( const char * const label ) const
{
if ( m_cudaDev < 0 ) {
std::cerr << "Kokkos::Cuda::" << label << " : ERROR device not initialized" << std::endl ;
}
return 0 <= m_cudaDev ;
}
CudaInternal & CudaInternal::singleton()
{
static CudaInternal self ;
return self ;
}
void CudaInternal::initialize( int cuda_device_id , int stream_count )
{
enum { WordSize = sizeof(size_type) };
if ( ! HostSpace::execution_space::is_initialized() ) {
const std::string msg("Cuda::initialize ERROR : HostSpace::execution_space is not initialized");
throw_runtime_exception( msg );
}
const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ;
const bool ok_id = 0 <= cuda_device_id &&
cuda_device_id < dev_info.m_cudaDevCount ;
// Need device capability 2.0 or better
const bool ok_dev = ok_id &&
( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major &&
0 <= dev_info.m_cudaProp[ cuda_device_id ].minor );
if ( ok_init && ok_dev ) {
const struct cudaDeviceProp & cudaProp =
dev_info.m_cudaProp[ cuda_device_id ];
m_cudaDev = cuda_device_id ;
CUDA_SAFE_CALL( cudaSetDevice( m_cudaDev ) );
CUDA_SAFE_CALL( cudaDeviceReset() );
Kokkos::Impl::cuda_device_synchronize();
// Query the compute capability for which the loaded kernels were compiled:
m_cudaArch = cuda_kernel_arch();
if ( m_cudaArch != cudaProp.major * 100 + cudaProp.minor * 10 ) {
std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
<< ( m_cudaArch / 100 ) << "." << ( ( m_cudaArch % 100 ) / 10 )
<< " on device with compute capability "
<< cudaProp.major << "." << cudaProp.minor
<< " , this will likely reduce potential performance."
<< std::endl ;
}
//----------------------------------
// Maximum number of warps:
// the final reduction step uses one thread per contributing warp,
// so at most WarpSize warps can participate.
// HCE 2012-February :
// Found a bug in CUDA 4.1 where a kernel launch would sometimes fail
// if the thread count == 1024 and a functor was passed to the kernel.
// Copying the kernel to constant memory and then launching with
// thread count == 1024 worked fine.
//
// HCE 2012-October :
// All compute capabilities support at least 16 warps (512 threads).
// However, we have found that 8 warps typically give better performance.
m_maxWarpCount = 8 ;
// m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize ;
if ( Impl::CudaTraits::WarpSize < m_maxWarpCount ) {
m_maxWarpCount = Impl::CudaTraits::WarpSize ;
}
m_maxSharedWords = cudaProp.sharedMemPerBlock / WordSize ;
//----------------------------------
// Maximum number of blocks:
m_maxBlock = m_cudaArch < 300 ? 65535 : cudaProp.maxGridSize[0] ;
//----------------------------------
m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
if ( ! m_scratchUnifiedSupported ) {
std::cout << "Kokkos::Cuda device "
<< cudaProp.name << " capability "
<< cudaProp.major << "." << cudaProp.minor
<< " does not support unified virtual address space"
<< std::endl ;
}
//----------------------------------
// Multiblock reduction uses scratch flags for counters
// and scratch space for partial reduction values.
// Allocate some initial space. This will grow as needed.
{
const unsigned reduce_block_count = m_maxWarpCount * Impl::CudaTraits::WarpSize ;
(void) scratch_unified( 16 * sizeof(size_type) );
(void) scratch_flags( reduce_block_count * 2 * sizeof(size_type) );
(void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
}
//----------------------------------
if ( stream_count ) {
m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
m_streamCount = stream_count ;
for ( size_type i = 0 ; i < m_streamCount ; ++i ) m_stream[i] = 0 ;
}
}
else {
std::ostringstream msg ;
msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED" ;
if ( ! ok_init ) {
msg << " : Already initialized" ;
}
if ( ! ok_id ) {
msg << " : Device identifier out of range "
<< "[0.." << dev_info.m_cudaDevCount << "]" ;
}
else if ( ! ok_dev ) {
msg << " : Device " ;
msg << dev_info.m_cudaProp[ cuda_device_id ].major ;
msg << "." ;
msg << dev_info.m_cudaProp[ cuda_device_id ].minor ;
msg << " has insufficient capability, required 2.0 or better" ;
}
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
// Initialize the lock array used for arbitrarily sized atomics
Impl::init_lock_array_cuda_space();
}
//----------------------------------------------------------------------------
typedef Cuda::size_type ScratchGrain[ Impl::CudaTraits::WarpSize ] ;
enum { sizeScratchGrain = sizeof(ScratchGrain) };
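// Example: scratch allocations below are rounded up to whole grains.
// With WarpSize == 32 and a 4-byte size_type a grain is 128 bytes, so
// a request of 100 bytes occupies ( 100 + 127 ) / 128 == 1 grain.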
Cuda::size_type *
CudaInternal::scratch_flags( const Cuda::size_type size )
{
if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount );
m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr());
CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
}
return m_scratchFlags ;
}
Cuda::size_type *
CudaInternal::scratch_space( const Cuda::size_type size )
{
if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount );
m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr());
}
return m_scratchSpace ;
}
Cuda::size_type *
CudaInternal::scratch_unified( const Cuda::size_type size )
{
if ( verify_is_initialized("scratch_unified") &&
m_scratchUnifiedSupported && m_scratchUnifiedCount * sizeScratchGrain < size ) {
m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount );
m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() );
}
return m_scratchUnified ;
}
//----------------------------------------------------------------------------
void CudaInternal::finalize()
{
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
lock_array_cuda_space_ptr(true);
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
cudaStreamDestroy( m_stream[i] );
m_stream[i] = 0 ;
}
::free( m_stream );
}
m_scratchSpaceTracker.clear();
m_scratchFlagsTracker.clear();
m_scratchUnifiedTracker.clear();
m_cudaDev = -1 ;
m_maxWarpCount = 0 ;
m_maxBlock = 0 ;
m_maxSharedWords = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchUnifiedCount = 0 ;
m_streamCount = 0 ;
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
m_scratchUnified = 0 ;
m_stream = 0 ;
}
}
//----------------------------------------------------------------------------
Cuda::size_type cuda_internal_maximum_warp_count()
{ return CudaInternal::singleton().m_maxWarpCount ; }
Cuda::size_type cuda_internal_maximum_grid_count()
{ return CudaInternal::singleton().m_maxBlock ; }
Cuda::size_type cuda_internal_maximum_shared_words()
{ return CudaInternal::singleton().m_maxSharedWords ; }
Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_space( size ); }
Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_flags( size ); }
Cuda::size_type * cuda_internal_scratch_unified( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_unified( size ); }
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
Cuda::size_type Cuda::detect_device_count()
{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
int Cuda::is_initialized()
{ return Impl::CudaInternal::singleton().is_initialized(); }
void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
{ Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances ); }
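// Example (sketch): selecting device 0 with a single stream instance:
//   Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) , 1 );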
std::vector<unsigned>
Cuda::detect_device_arch()
{
const Impl::CudaInternalDevices & s = Impl::CudaInternalDevices::singleton();
std::vector<unsigned> output( s.m_cudaDevCount );
for ( int i = 0 ; i < s.m_cudaDevCount ; ++i ) {
output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor ;
}
return output ;
}
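// Example (sketch): decoding an entry of detect_device_arch(); each
// entry packs major*100 + minor, so 305 denotes capability 3.5:
//   const unsigned arch = archs[i] ;
//   const unsigned major = arch / 100 , minor = arch % 100 ;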
Cuda::size_type Cuda::device_arch()
{
const int dev_id = Impl::CudaInternal::singleton().m_cudaDev ;
int dev_arch = 0 ;
if ( 0 <= dev_id ) {
const struct cudaDeviceProp & cudaProp =
Impl::CudaInternalDevices::singleton().m_cudaProp[ dev_id ] ;
dev_arch = cudaProp.major * 100 + cudaProp.minor ;
}
return dev_arch ;
}
void Cuda::finalize()
{ Impl::CudaInternal::singleton().finalize(); }
Cuda::Cuda()
: m_device( Impl::CudaInternal::singleton().m_cudaDev )
, m_stream( 0 )
{
Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" );
}
Cuda::Cuda( const int instance_id )
: m_device( Impl::CudaInternal::singleton().m_cudaDev )
, m_stream(
Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" )
? Impl::CudaInternal::singleton().m_stream[ instance_id % Impl::CudaInternal::singleton().m_streamCount ]
: 0 )
{}
void Cuda::print_configuration( std::ostream & s , const bool )
{ Impl::CudaInternal::singleton().print_configuration( s ); }
bool Cuda::sleep() { return false ; }
bool Cuda::wake() { return true ; }
void Cuda::fence()
{
Kokkos::Impl::cuda_device_synchronize();
}
} // namespace Kokkos
#endif // KOKKOS_HAVE_CUDA
//----------------------------------------------------------------------------


@ -0,0 +1,165 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_INTERNAL_HPP
#define KOKKOS_CUDA_INTERNAL_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <Cuda/Kokkos_Cuda_Error.hpp>
namespace Kokkos { namespace Impl {
template<class DriverType>
int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
#if ( CUDA_VERSION < 6050 )
return 256;
#else
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );
int numBlocks;
if(Large) {
int blockSize=32;
int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
}
if(numBlocks>0) return blockSize;
else return blockSize/2;
} else {
int blockSize=32;
int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
blockSize,
sharedmem);
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
blockSize,
sharedmem);
}
if(numBlocks>0) return blockSize;
else return blockSize/2;
}
#endif
}
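// The search above doubles the candidate block size while the occupancy
// calculator still reports at least one resident block per
// multiprocessor. A sketch of the resulting contract, with a
// hypothetical driver type 'MyDriver':
//   const int bs = cuda_get_max_block_size<MyDriver>( functor );
//   // bs is the largest tested power-of-two block size <= 1024 for
//   // which at least one block remains resident.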
template<class DriverType>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
#if ( CUDA_VERSION < 6050 )
return 256;
#else
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );
int blockSize=16;
int numBlocks;
int sharedmem;
int maxOccupancy=0;
int bestBlockSize=0;
if(Large) {
while(blockSize<1024) {
blockSize*=2;
//calculate the occupancy with that block size and check whether it is larger than the largest one found so far
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
if(maxOccupancy < numBlocks*blockSize) {
maxOccupancy = numBlocks*blockSize;
bestBlockSize = blockSize;
}
}
} else {
while(blockSize<1024) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
blockSize,
sharedmem);
if(maxOccupancy < numBlocks*blockSize) {
maxOccupancy = numBlocks*blockSize;
bestBlockSize = blockSize;
}
}
}
return bestBlockSize;
#endif
}
}} // namespace Kokkos::Impl
#endif // KOKKOS_HAVE_CUDA
#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */

File diff suppressed because it is too large


@ -0,0 +1,424 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_REDUCESCAN_HPP
#define KOKKOS_CUDA_REDUCESCAN_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
#include <utility>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//Shfl based reductions
/*
* Algorithmic constraints:
* (a) threads with same threadIdx.y have same value
* (b) blockDim.x == power of two
* (c) blockDim.z == 1
*/
template< class ValueType , class JoinOp>
__device__
inline void cuda_intra_warp_reduction( ValueType& result,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
unsigned int shift = 1;
//Reduce over values from threads with different threadIdx.y
while(blockDim.x * shift < 32 ) {
const ValueType tmp = shfl_down(result, blockDim.x*shift,32u);
//Only join if upper thread is active (this allows non power of two for blockDim.y)
if(threadIdx.y + shift < max_active_thread)
join(result , tmp);
shift*=2;
}
result = shfl(result,0,32);
}
template< class ValueType , class JoinOp>
__device__
inline void cuda_inter_warp_reduction( ValueType& value,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
#define STEP_WIDTH 4
__shared__ char sh_result[sizeof(ValueType)*STEP_WIDTH];
ValueType* result = (ValueType*) & sh_result;
const unsigned step = 32 / blockDim.x;
unsigned shift = STEP_WIDTH;
const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
if(id < STEP_WIDTH ) {
result[id] = value;
}
__syncthreads();
while (shift<=max_active_thread/step) {
if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) {
join(result[id%STEP_WIDTH],value);
}
__syncthreads();
shift+=STEP_WIDTH;
}
value = result[0];
for(int i = 1; (i*step<=max_active_thread) && i<STEP_WIDTH; i++)
join(value,result[i]);
}
template< class ValueType , class JoinOp>
__device__
inline void cuda_intra_block_reduction( ValueType& value,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
cuda_intra_warp_reduction(value,join,max_active_thread);
cuda_inter_warp_reduction(value,join,max_active_thread);
}
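// Example (sketch): a minimal JoinOp usable with the reductions above.
// 'SumJoin' is a hypothetical name, not part of the original file.
template< class ValueType >
struct SumJoin {
  __device__ inline
  void operator()( ValueType & dst , const ValueType & src ) const
  { dst += src ; }
};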
template< class FunctorType , class JoinOp>
__device__
bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void >::reference_type value,
const JoinOp& join,
Cuda::size_type * const m_scratch_space,
typename FunctorValueTraits< FunctorType , void >::pointer_type const result,
Cuda::size_type * const m_scratch_flags,
const int max_active_thread = blockDim.y) {
typedef typename FunctorValueTraits< FunctorType , void >::pointer_type pointer_type;
typedef typename FunctorValueTraits< FunctorType , void >::value_type value_type;
//Do the intra-block reduction with shfl operations and static shared memory
cuda_intra_block_reduction(value,join,max_active_thread);
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
//One thread in the block writes block result to global scratch_memory
if(id == 0 ) {
pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
*global = value;
}
//One warp of the last block performs the inter-block reduction by loading the block values from global scratch memory
bool last_block = false;
__syncthreads();
if ( id < 32 ) {
Cuda::size_type count;
//Figure out whether this is the last block
if(id == 0)
count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
count = Kokkos::shfl(count,0,32);
//Last block does the inter block reduction
if( count == gridDim.x - 1) {
//set flag back to zero
if(id == 0)
*m_scratch_flags = 0;
last_block = true;
value = 0;
pointer_type const volatile global = (pointer_type) m_scratch_space ;
//Reduce all global values with splitting work over threads in one warp
const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
for(int i=id; i<gridDim.x; i+=step_size) {
value_type tmp = global[i];
join(value, tmp);
}
//Perform shfl reductions within the warp; only join if the contribution is valid (allows gridDim.x to be non power of two and < 32)
if (blockDim.x*blockDim.y > 1) {
value_type tmp = Kokkos::shfl_down(value, 1,32);
if( id + 1 < gridDim.x )
join(value, tmp);
}
if (blockDim.x*blockDim.y > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < gridDim.x )
join(value, tmp);
}
if (blockDim.x*blockDim.y > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < gridDim.x )
join(value, tmp);
}
if (blockDim.x*blockDim.y > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < gridDim.x )
join(value, tmp);
}
if (blockDim.x*blockDim.y > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < gridDim.x )
join(value, tmp);
}
}
}
//The last block has in its thread=0 the global reduction value through "value"
return last_block;
}
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize shared memory and minimize L1 cache:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
// For 2.0 capability: 48 KB shared and 16 KB L1
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.y is a power of two
* (b) blockDim.y <= 512
* (c) blockDim.x == blockDim.z == 1
*/
template< bool DoScan , class FunctorType , class ArgTag >
__device__
void cuda_intra_block_reduce_scan( const FunctorType & functor ,
const typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type base_data )
{
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
const unsigned value_count = ValueTraits::value_count( functor );
const unsigned BlockSizeMask = blockDim.y - 1 ;
// Must have power of two thread count
if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_intra_block_reduce_scan requires power-of-two blockDim.y"); }
#define BLOCK_REDUCE_STEP( R , TD , S ) \
if ( ! ( R & ((1<<(S+1))-1) ) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S)) ); }
#define BLOCK_SCAN_STEP( TD , N , S ) \
if ( N == (1<<S) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S))); }
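// Example expansion of BLOCK_REDUCE_STEP with R = rtid_intra and S = 0:
// a thread whose reversed rank has bit 0 clear joins the value one
// slot below its own:
//   if ( ! ( rtid_intra & 1 ) )
//   { ValueJoin::join( functor , tdata_intra , tdata_intra - value_count ); }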
const unsigned rtid_intra = threadIdx.y ^ BlockSizeMask ;
const pointer_type tdata_intra = base_data + value_count * threadIdx.y ;
{ // Intra-warp reduction:
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
}
__syncthreads(); // Wait for all warps to reduce
{ // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
const unsigned rtid_inter = ( threadIdx.y ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ;
if ( rtid_inter < blockDim.y ) {
const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
if ( (1<<5) < BlockSizeMask ) { BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
if ( DoScan ) {
int n = ( rtid_inter & 32 ) ? 32 : (
( rtid_inter & 64 ) ? 64 : (
( rtid_inter & 128 ) ? 128 : (
( rtid_inter & 256 ) ? 256 : 0 )));
if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ;
BLOCK_SCAN_STEP(tdata_inter,n,8)
BLOCK_SCAN_STEP(tdata_inter,n,7)
BLOCK_SCAN_STEP(tdata_inter,n,6)
BLOCK_SCAN_STEP(tdata_inter,n,5)
}
}
}
__syncthreads(); // Wait for inter-warp reduce-scan to complete
if ( DoScan ) {
int n = ( rtid_intra & 1 ) ? 1 : (
( rtid_intra & 2 ) ? 2 : (
( rtid_intra & 4 ) ? 4 : (
( rtid_intra & 8 ) ? 8 : (
( rtid_intra & 16 ) ? 16 : 0 ))));
if ( ! ( rtid_intra + n < blockDim.y ) ) n = 0 ;
BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,0)
}
#undef BLOCK_SCAN_STEP
#undef BLOCK_REDUCE_STEP
}
//----------------------------------------------------------------------------
/**\brief Input value-per-thread starting at 'shared_data'.
* Reduction value at last thread's location.
*
* If 'DoScan' then write blocks' scan values and block-groups' scan values.
*
* Global reduce result is in the last threads' 'shared_data' location.
*/
template< bool DoScan , class FunctorType , class ArgTag >
__device__
bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags )
{
typedef Cuda::size_type size_type ;
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ;
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const unsigned BlockSizeMask = blockDim.y - 1 ;
const unsigned BlockSizeShift = power_of_two_if_valid( blockDim.y );
// Must have power of two thread count
if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_single_inter_block_reduce_scan requires power-of-two blockDim"); }
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( functor ) / sizeof(size_type) );
// Reduce the accumulation for the entire block.
cuda_intra_block_reduce_scan<false,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
{
// Write accumulation total to global scratch space.
// Accumulation total is the last thread's data.
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
size_type * const global = global_data + word_count.value * block_id ;
for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
}
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
// If this block is not the last block to contribute to this group then the block is done.
const bool is_last_block =
! __syncthreads_or( threadIdx.y ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) );
if ( is_last_block ) {
const size_type b = ( long(block_count) * long(threadIdx.y) ) >> BlockSizeShift ;
const size_type e = ( long(block_count) * long( threadIdx.y + 1 ) ) >> BlockSizeShift ;
{
void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
reference_type shared_value = ValueInit::init( functor , shared_ptr );
for ( size_type i = b ; i < e ; ++i ) {
ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
}
}
cuda_intra_block_reduce_scan<DoScan,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
if ( DoScan ) {
size_type * const shared_value = shared_data + word_count.value * ( threadIdx.y ? threadIdx.y - 1 : blockDim.y );
if ( ! threadIdx.y ) { ValueInit::init( functor , shared_value ); }
// Join previous inclusive scan value to each member
for ( size_type i = b ; i < e ; ++i ) {
size_type * const global_value = global_data + word_count.value * i ;
ValueJoin::join( functor , shared_value , global_value );
ValueOps ::copy( functor , global_value , shared_value );
}
}
}
return is_last_block ;
}
// Size in bytes required for inter block reduce or scan
template< bool DoScan , class FunctorType , class ArgTag >
inline
unsigned cuda_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize )
{
return ( BlockSize + 2 ) * Impl::FunctorValueTraits< FunctorType , ArgTag >::value_size( functor );
}
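// Example: for a functor with a single 8-byte reduction value and a
// block size of 256 the shared memory requirement is
// ( 256 + 2 ) * 8 == 2064 bytes.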
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA ) */
#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */


@ -0,0 +1,298 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_VECTORIZATION_HPP
#define KOKKOS_CUDA_VECTORIZATION_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <Kokkos_Cuda.hpp>
namespace Kokkos {
// Shuffle only makes sense on >= Kepler GPUs; it doesn't work on CPUs
// or other GPUs. We provide a generic definition (which is trivial
// and doesn't do what it claims to do) because we don't actually use
// this function unless we are on a suitable GPU, with a suitable
// Scalar type. (For example, in the mat-vec, the "ThreadsPerRow"
// internal parameter depends both on the ExecutionSpace and the Scalar type,
// and it controls whether shfl_down() gets called.)
namespace Impl {
template< typename Scalar >
struct shfl_union {
enum {n = sizeof(Scalar)/4};
float fval[n];
KOKKOS_INLINE_FUNCTION
Scalar value() {
return *(Scalar*) fval;
}
KOKKOS_INLINE_FUNCTION
void operator= (Scalar& value_) {
float* const val_ptr = (float*) &value_;
for(int i=0; i<n ; i++) {
fval[i] = val_ptr[i];
}
}
KOKKOS_INLINE_FUNCTION
void operator= (const Scalar& value_) {
float* const val_ptr = (float*) &value_;
for(int i=0; i<n ; i++) {
fval[i] = val_ptr[i];
}
}
};
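// Example (sketch, hypothetical 12-byte type): the union views the
// value as n == sizeof(Scalar)/4 == 3 floats, each shuffled as a
// separate 4-byte quantity and reassembled by value():
//   struct Float3 { float x , y , z ; };
//   Impl::shfl_union<Float3> u ;
//   u = my_val ;                  // pack into u.fval[0..2]
//   /* shuffle each u.fval[i] */  // move lane to lane
//   Float3 r = u.value() ;        // unpack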
}
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ >= 300)
KOKKOS_INLINE_FUNCTION
int shfl(const int &val, const int& srcLane, const int& width ) {
return __shfl(val,srcLane,width);
}
KOKKOS_INLINE_FUNCTION
float shfl(const float &val, const int& srcLane, const int& width ) {
return __shfl(val,srcLane,width);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width
) {
Scalar tmp1 = val;
float tmp = *reinterpret_cast<float*>(&tmp1);
tmp = __shfl(tmp,srcLane,width);
return *reinterpret_cast<Scalar*>(&tmp);
}
KOKKOS_INLINE_FUNCTION
double shfl(const double &val, const int& srcLane, const int& width) {
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl(lo,srcLane,width);
hi = __shfl(hi,srcLane,width);
return __hiloint2double(hi,lo);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) {
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl(lo,srcLane,width);
hi = __shfl(hi,srcLane,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const Scalar*>(&tmp));
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) {
Impl::shfl_union<Scalar> s_val;
Impl::shfl_union<Scalar> r_val;
s_val = val;
for(int i = 0; i<s_val.n; i++)
r_val.fval[i] = __shfl(s_val.fval[i],srcLane,width);
return r_val.value();
}
KOKKOS_INLINE_FUNCTION
int shfl_down(const int &val, const int& delta, const int& width) {
return __shfl_down(val,delta,width);
}
KOKKOS_INLINE_FUNCTION
float shfl_down(const float &val, const int& delta, const int& width) {
return __shfl_down(val,delta,width);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
Scalar tmp1 = val;
float tmp = *reinterpret_cast<float*>(&tmp1);
tmp = __shfl_down(tmp,delta,width);
return *reinterpret_cast<Scalar*>(&tmp);
}
KOKKOS_INLINE_FUNCTION
double shfl_down(const double &val, const int& delta, const int& width) {
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
return __hiloint2double(hi,lo);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const Scalar*>(&tmp));
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
Impl::shfl_union<Scalar> s_val;
Impl::shfl_union<Scalar> r_val;
s_val = val;
for(int i = 0; i<s_val.n; i++)
r_val.fval[i] = __shfl_down(s_val.fval[i],delta,width);
return r_val.value();
}
KOKKOS_INLINE_FUNCTION
int shfl_up(const int &val, const int& delta, const int& width ) {
return __shfl_up(val,delta,width);
}
KOKKOS_INLINE_FUNCTION
float shfl_up(const float &val, const int& delta, const int& width ) {
return __shfl_up(val,delta,width);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
Scalar tmp1 = val;
float tmp = *reinterpret_cast<float*>(&tmp1);
tmp = __shfl_up(tmp,delta,width);
return *reinterpret_cast<Scalar*>(&tmp);
}
KOKKOS_INLINE_FUNCTION
double shfl_up(const double &val, const int& delta, const int& width ) {
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_up(lo,delta,width);
hi = __shfl_up(hi,delta,width);
return __hiloint2double(hi,lo);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl_up(lo,delta,width);
hi = __shfl_up(hi,delta,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const Scalar*>(&tmp));
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
Impl::shfl_union<Scalar> s_val;
Impl::shfl_union<Scalar> r_val;
s_val = val;
for(int i = 0; i<s_val.n; i++)
r_val.fval[i] = __shfl_up(s_val.fval[i],delta,width);
return r_val.value();
}
#else
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<3.0.");
return val;
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
return val;
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
return val;
}
#endif
#else
template<typename Scalar>
inline
Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl from host code.");
return val;
}
template<typename Scalar>
inline
Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl_down from host code.");
return val;
}
template<typename Scalar>
inline
Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl_up from host code.");
return val;
}
#endif
}
#endif // KOKKOS_HAVE_CUDA
#endif


@ -0,0 +1,312 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_VIEW_HPP
#define KOKKOS_CUDA_VIEW_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <cstring>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_View.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
struct AssertShapeBoundsAbort< CudaSpace >
{
KOKKOS_INLINE_FUNCTION
static void apply( const size_t /* rank */ ,
const size_t /* n0 */ , const size_t /* n1 */ ,
const size_t /* n2 */ , const size_t /* n3 */ ,
const size_t /* n4 */ , const size_t /* n5 */ ,
const size_t /* n6 */ , const size_t /* n7 */ ,
const size_t /* arg_rank */ ,
const size_t /* i0 */ , const size_t /* i1 */ ,
const size_t /* i2 */ , const size_t /* i3 */ ,
const size_t /* i4 */ , const size_t /* i5 */ ,
const size_t /* i6 */ , const size_t /* i7 */ )
{
Kokkos::abort("Kokkos::View array bounds violation");
}
};
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// Cuda texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4).
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
// Any other scalar type falls back to either normal reads out of global memory,
// or to the __ldg intrinsic on Kepler GPUs or newer (compute capability >= 3.0).
template< typename ValueType
, class MemorySpace
, class AliasType =
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 4 ) , int ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 8 ) , ::int2 ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 16 ) , ::int4 , void
>::type
>::type
>::type
>
class CudaTextureFetch {
private:
cuda_texture_object_type m_obj ;
const ValueType * m_alloc_ptr ;
int m_offset ;
void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
{
typedef char const * const byte;
m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
const size_t count = tracker.alloc_size() / sizeof(ValueType);
const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
if (ok_aligned && ok_contains) {
if (tracker.attribute() == NULL ) {
MemorySpace::texture_object_attach(
tracker
, sizeof(ValueType)
, cudaCreateChannelDesc< AliasType >()
);
}
m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
m_offset = arg_ptr - m_alloc_ptr;
}
else if( !ok_contains ) {
throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
}
else {
throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
}
}
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_alloc_ptr( rhs.m_alloc_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_alloc_ptr = rhs.m_alloc_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION explicit
CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
: m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
{
#if defined( KOKKOS_USE_LDG_INTRINSIC )
m_alloc_ptr = arg_ptr ;
#elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
if ( arg_ptr != NULL ) {
if ( tracker.is_valid() ) {
attach( arg_ptr, tracker );
}
else {
AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
if ( found_tracker.is_valid() ) {
attach( arg_ptr, found_tracker );
} else {
throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
}
}
}
#endif
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
return *(reinterpret_cast<ValueType*> (&v));
#else
return m_alloc_ptr[ i + m_offset ];
#endif
}
};
template< typename ValueType, class MemorySpace >
class CudaTextureFetch< const ValueType, MemorySpace, void >
{
private:
const ValueType * m_ptr ;
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_ptr(0) {};
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const ValueType * ptr, const AllocationTracker & ) : m_ptr(ptr) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs ) : m_ptr(rhs.m_ptr) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) {
m_ptr = rhs.m_ptr;
return *this ;
}
explicit KOKKOS_INLINE_FUNCTION
CudaTextureFetch( ValueType * const base_view_ptr, AllocationTracker const & /*tracker*/ ) {
m_ptr = base_view_ptr;
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = (const ValueType* base_view_ptr) {
m_ptr = base_view_ptr;
return *this;
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
return m_ptr[ i ];
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
* if 'const' value type, CudaSpace and random access.
*/
template< class ViewTraits >
class ViewDataHandle< ViewTraits ,
typename enable_if< ( is_same< typename ViewTraits::memory_space,CudaSpace>::value ||
is_same< typename ViewTraits::memory_space,CudaUVMSpace>::value )
&&
is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value
&&
ViewTraits::memory_traits::RandomAccess
>::type >
{
public:
enum { ReturnTypeIsReference = false };
typedef Impl::CudaTextureFetch< typename ViewTraits::value_type
, typename ViewTraits::memory_space> handle_type;
KOKKOS_INLINE_FUNCTION
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & arg_tracker )
{
return handle_type(arg_data_ptr, arg_tracker);
}
typedef typename ViewTraits::value_type return_type;
};
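// Example (sketch): a view type that selects this specialization and
// therefore reads through texture fetches:
//   Kokkos::View< const double * , Kokkos::CudaSpace ,
//                 Kokkos::MemoryTraits< Kokkos::RandomAccess > >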
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif // KOKKOS_HAVE_CUDA
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */


@ -0,0 +1,119 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_ABORT_HPP
#define KOKKOS_CUDA_ABORT_HPP
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#include "Kokkos_Macros.hpp"
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
#include <cuda.h>
#if ! defined( CUDA_VERSION ) || ( CUDA_VERSION < 4010 )
#error "Cuda version 4.1 or greater required"
#endif
#if ( __CUDA_ARCH__ < 200 )
#error "Cuda device capability 2.0 or greater required"
#endif
extern "C" {
/* Cuda runtime function, declared in <crt/device_runtime.h>
* Requires capability 2.x or better.
*/
extern __device__ void __assertfail(
const void *message,
const void *file,
unsigned int line,
const void *function,
size_t charsize);
}
namespace Kokkos {
namespace Impl {
__device__ inline
void cuda_abort( const char * const message )
{
#ifndef __APPLE__
const char empty[] = "" ;
__assertfail( (const void *) message ,
(const void *) empty ,
(unsigned int) 0 ,
(const void *) empty ,
sizeof(char) );
#endif
}
} // namespace Impl
} // namespace Kokkos
#else
namespace Kokkos {
namespace Impl {
KOKKOS_INLINE_FUNCTION
void cuda_abort( const char * const ) {}
}
}
#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
namespace Kokkos {
__device__ inline
void abort( const char * const message ) { Kokkos::Impl::cuda_abort(message); }
}
#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */

File diff suppressed because it is too large


@ -0,0 +1,285 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Atomic.hpp
/// \brief Atomic functions
///
/// This header file defines prototypes for the following atomic functions:
/// - exchange
/// - compare and exchange
/// - add
///
/// Supported types include:
/// - signed and unsigned 4 and 8 byte integers
/// - float
/// - double
///
/// They are implemented through GCC compatible intrinsics, OpenMP
/// directives and native CUDA intrinsics.
///
/// Including this header file requires one of the following
/// compilers:
/// - NVCC (for CUDA device code only)
/// - GCC (for host code only)
/// - Intel (for host code only)
/// - A compiler that supports OpenMP 3.1 (for host code only)
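///
/// A minimal usage sketch (hedged: the View names, extent, and the
/// KOKKOS_LAMBDA-based dispatch below are illustrative assumptions, not
/// part of this header):
///
/// \code
/// // Count even entries; many threads may update the same counter,
/// // so the update must be atomic.
/// Kokkos::View<int*> values( "values" , n );
/// Kokkos::View<int>  count( "count" );
/// Kokkos::parallel_for( n , KOKKOS_LAMBDA( const int i ) {
///   if ( 0 == values(i) % 2 ) Kokkos::atomic_add( & count() , 1 );
/// });
/// \endcode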
#ifndef KOKKOS_ATOMIC_HPP
#define KOKKOS_ATOMIC_HPP
#include <Kokkos_Macros.hpp>
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_Traits.hpp>
//----------------------------------------------------------------------------
#if defined(_WIN32)
#define KOKKOS_ATOMICS_USE_WINDOWS
#else
#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
// Compiling NVIDIA device code, must use Cuda atomics:
#define KOKKOS_ATOMICS_USE_CUDA
#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \
! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
! defined( KOKKOS_ATOMICS_USE_OMP31 )
// An atomic implementation for non-Cuda compilation has not been
// pre-selected. Choose the best implementation for the detected compiler.
// Preference: GCC, INTEL, OMP31
#if defined( KOKKOS_COMPILER_GNU ) || \
defined( KOKKOS_COMPILER_CLANG ) || \
( defined ( KOKKOS_COMPILER_NVCC ) && defined ( __GNUC__ ) )
#define KOKKOS_ATOMICS_USE_GCC
#elif defined( KOKKOS_COMPILER_INTEL ) || \
defined( KOKKOS_COMPILER_CRAYC )
#define KOKKOS_ATOMICS_USE_INTEL
#elif defined( _OPENMP ) && ( 201107 <= _OPENMP )
#define KOKKOS_ATOMICS_USE_OMP31
#else
#error "KOKKOS_ATOMICS_USE : Unsupported compiler"
#endif
#endif /* Not pre-selected atomic implementation */
#endif
//----------------------------------------------------------------------------
// Forward declaration of functions supporting arbitrary-sized atomics.
// This is necessary since Kokkos_Atomic.hpp is internally included very early
// through Kokkos_HostSpace.hpp as well as the allocation tracker.
#ifdef KOKKOS_HAVE_CUDA
namespace Kokkos {
namespace Impl {
/// \brief Acquire a lock for the address
///
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
__device__ inline
bool lock_address_cuda_space(void* ptr);
/// \brief Release lock for the address
///
/// This function releases the lock for the hash value derived
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address.
__device__ inline
void unlock_address_cuda_space(void* ptr);
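/// A hedged sketch of the lock protocol for an arbitrary-type atomic
/// update (device code only). The try-until-done shape below mirrors how
/// such atomics are commonly built and avoids warp-divergence deadlock,
/// but the function itself is an illustrative assumption:
///
/// \code
/// template< typename T > __device__
/// void atomic_apply( volatile T * const dest , const T val )
/// {
///   bool done = false ;
///   while ( ! done ) {
///     if ( Kokkos::Impl::lock_address_cuda_space( (void*) dest ) ) {
///       *dest = *dest + val ;                            // critical section
///       Kokkos::Impl::unlock_address_cuda_space( (void*) dest );
///       done = true ;
///     }
///   }
/// }
/// \endcode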
}
}
#endif
namespace Kokkos {
template <typename T>
KOKKOS_INLINE_FUNCTION
void atomic_add(volatile T * const dest, const T src);
// Atomic increment
template<typename T>
KOKKOS_INLINE_FUNCTION
void atomic_increment(volatile T* a);
template<typename T>
KOKKOS_INLINE_FUNCTION
void atomic_decrement(volatile T* a);
}
#if ! defined(_WIN32)
#include<impl/Kokkos_Atomic_Assembly_X86.hpp>
#endif
namespace Kokkos {
inline
const char * atomic_query_version()
{
#if defined( KOKKOS_ATOMICS_USE_CUDA )
return "KOKKOS_ATOMICS_USE_CUDA" ;
#elif defined( KOKKOS_ATOMICS_USE_GCC )
return "KOKKOS_ATOMICS_USE_GCC" ;
#elif defined( KOKKOS_ATOMICS_USE_INTEL )
return "KOKKOS_ATOMICS_USE_INTEL" ;
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
return "KOKKOS_ATOMICS_USE_OMP31" ;
#elif defined( KOKKOS_ATOMICS_USE_WINDOWS )
return "KOKKOS_ATOMICS_USE_WINDOWS";
#endif
}
} // namespace Kokkos
#ifdef _WIN32
#include "impl/Kokkos_Atomic_Windows.hpp"
#else
//#include "impl/Kokkos_Atomic_Assembly_X86.hpp"
//----------------------------------------------------------------------------
// Atomic exchange
//
// template< typename T >
// T atomic_exchange( volatile T* const dest , const T val )
// { T tmp = *dest ; *dest = val ; return tmp ; }
#include "impl/Kokkos_Atomic_Exchange.hpp"
//----------------------------------------------------------------------------
// Atomic compare-and-exchange
//
// template<class T>
// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
// { bool equal = compare == *dest ; if ( equal ) { *dest = val ; } return equal ; }
#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp"
//----------------------------------------------------------------------------
// Atomic fetch and add
//
// template<class T>
// T atomic_fetch_add(volatile T* const dest, const T val)
// { T tmp = *dest ; *dest += val ; return tmp ; }
#include "impl/Kokkos_Atomic_Fetch_Add.hpp"
//----------------------------------------------------------------------------
// Atomic fetch and sub
//
// template<class T>
// T atomic_fetch_sub(volatile T* const dest, const T val)
// { T tmp = *dest ; *dest -= val ; return tmp ; }
#include "impl/Kokkos_Atomic_Fetch_Sub.hpp"
//----------------------------------------------------------------------------
// Atomic fetch and or
//
// template<class T>
// T atomic_fetch_or(volatile T* const dest, const T val)
// { T tmp = *dest ; *dest = tmp | val ; return tmp ; }
#include "impl/Kokkos_Atomic_Fetch_Or.hpp"
//----------------------------------------------------------------------------
// Atomic fetch and and
//
// template<class T>
// T atomic_fetch_and(volatile T* const dest, const T val)
// { T tmp = *dest ; *dest = tmp & val ; return tmp ; }
#include "impl/Kokkos_Atomic_Fetch_And.hpp"
#endif /*Not _WIN32*/
//----------------------------------------------------------------------------
// Memory fence
//
// All loads and stores from this thread will be globally consistent before continuing
//
// void memory_fence() {...};
#include "impl/Kokkos_Memory_Fence.hpp"
//----------------------------------------------------------------------------
// Provide volatile_load and safe_load
//
// T volatile_load(T const volatile * const ptr);
//
// T const& safe_load(T const * const ptr);
// XEON PHI:
// T safe_load(T const * const ptr);
#include "impl/Kokkos_Volatile_Load.hpp"
#ifndef _WIN32
#include "impl/Kokkos_Atomic_Generic.hpp"
#endif
//----------------------------------------------------------------------------
// This atomic-style macro should be an inlined function, not a macro
#if defined( KOKKOS_COMPILER_GNU ) && !defined(__PGIC__)
#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr,0,0)
#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr,1,0)
#else
#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0)
#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0)
#endif
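//
// A hedged usage sketch for the prefetch macros above: when streaming
// once through a large array, prefetching ahead with a non-temporal hint
// keeps the walk from stalling without polluting the cache. The function
// and the look-ahead distance are illustrative assumptions.
//
//   double sum( const double * const x , const size_t n )
//   {
//     double s = 0 ;
//     for ( size_t i = 0 ; i < n ; ++i ) {
//       KOKKOS_NONTEMPORAL_PREFETCH_LOAD( x + i + 64 ); // ~64 elements ahead
//       s += x[i] ;
//     }
//     return s ;
//   }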
//----------------------------------------------------------------------------
#endif /* KOKKOS_ATOMIC_HPP */


@ -0,0 +1,228 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CORE_HPP
#define KOKKOS_CORE_HPP
//----------------------------------------------------------------------------
// Include the execution space header files for the enabled execution spaces.
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_CUDA )
#include <Kokkos_Cuda.hpp>
#endif
#if defined( KOKKOS_HAVE_OPENMP )
#include <Kokkos_OpenMP.hpp>
#endif
#if defined( KOKKOS_HAVE_SERIAL )
#include <Kokkos_Serial.hpp>
#endif
#if defined( KOKKOS_HAVE_PTHREAD )
#include <Kokkos_Threads.hpp>
#endif
#include <Kokkos_Pair.hpp>
#include <Kokkos_View.hpp>
#include <Kokkos_Vectorization.hpp>
#include <Kokkos_Atomic.hpp>
#include <Kokkos_hwloc.hpp>
#include <iostream>
//----------------------------------------------------------------------------
namespace Kokkos {
struct InitArguments {
int num_threads;
int num_numa;
int device_id;
InitArguments() {
num_threads = -1;
num_numa = -1;
device_id = -1;
}
};
void initialize(int& narg, char* arg[]);
void initialize(const InitArguments& args = InitArguments());
/** \brief Finalize the spaces that were initialized via Kokkos::initialize */
void finalize();
/** \brief Finalize all known execution spaces */
void finalize_all();
void fence();
}
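// A minimal lifecycle sketch for the declarations above (hedged: the
// thread count is an illustrative assumption). Options may be passed
// either via the (narg,arg) overload or via InitArguments, whose -1
// defaults mean "unset":
//
//   int main( int narg , char * arg[] )
//   {
//     Kokkos::InitArguments args ;
//     args.num_threads = 4 ;
//     Kokkos::initialize( args );        // or: Kokkos::initialize( narg , arg );
//     // ... allocate Views and dispatch kernels here ...
//     Kokkos::fence();                   // wait for outstanding kernels
//     Kokkos::finalize();
//     return 0 ;
//   }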
#ifdef KOKKOS_HAVE_CXX11
namespace Kokkos {
namespace Impl {
// should only be used by kokkos_malloc and kokkos_free
struct MallocHelper
{
static void increment_ref_count( AllocationTracker const & tracker )
{
tracker.increment_ref_count();
}
static void decrement_ref_count( AllocationTracker const & tracker )
{
tracker.decrement_ref_count();
}
};
} // namespace Impl
/* Allocate memory from a memory space.
 * The allocation is tracked in the Kokkos memory tracking system, so
* leaked memory can be identified.
*/
template< class Arg = DefaultExecutionSpace>
void* kokkos_malloc(const std::string label, size_t count) {
typedef typename Arg::memory_space MemorySpace;
  Impl::AllocationTracker tracker = MemorySpace::allocate_and_track(label,count);
Impl::MallocHelper::increment_ref_count( tracker );
return tracker.alloc_ptr();
}
template< class Arg = DefaultExecutionSpace>
void* kokkos_malloc(const size_t& count) {
return kokkos_malloc<Arg>("DefaultLabel",count);
}
/* Free memory from a memory space.
*/
template< class Arg = DefaultExecutionSpace>
void kokkos_free(const void* ptr) {
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(ptr);
if (tracker.is_valid()) {
Impl::MallocHelper::decrement_ref_count( tracker );
}
}
template< class Arg = DefaultExecutionSpace>
const void* kokkos_realloc(const void* old_ptr, size_t size) {
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr);
tracker.reallocate(size);
return tracker.alloc_ptr();
}
} // namespace Kokkos
#endif
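// A hedged usage sketch for the tracked allocation interface above
// (KOKKOS_HAVE_CXX11 builds only; the label and size are illustrative
// assumptions):
//
//   double * p = static_cast<double*>(
//     Kokkos::kokkos_malloc( "my_buffer" , 100 * sizeof(double) ) );
//   // ... use p in the default execution space's memory space ...
//   Kokkos::kokkos_free( p );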
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void * kokkos_malloc( const size_t arg_alloc_size )
{
typedef typename Space::memory_space MemorySpace ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ;
RecordHost * const r = RecordHost::allocate( MemorySpace() , "kokkos_malloc" , arg_alloc_size );
RecordBase::increment( r );
return r->data();
}
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void kokkos_free( void * arg_alloc )
{
typedef typename Space::memory_space MemorySpace ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ;
RecordHost * const r = RecordHost::get_record( arg_alloc );
RecordBase::decrement( r );
}
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
{
typedef typename Space::memory_space MemorySpace ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ;
RecordHost * const r_old = RecordHost::get_record( arg_alloc );
RecordHost * const r_new = RecordHost::allocate( MemorySpace() , "kokkos_malloc" , arg_alloc_size );
Kokkos::Impl::DeepCopy<MemorySpace,MemorySpace>( r_new->data() , r_old->data()
, std::min( r_old->size() , r_new->size() ) );
RecordBase::increment( r_new );
RecordBase::decrement( r_old );
return r_new->data();
}
} // namespace Experimental
} // namespace Kokkos
#endif


@ -0,0 +1,170 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CORE_FWD_HPP
#define KOKKOS_CORE_FWD_HPP
//----------------------------------------------------------------------------
// Kokkos_Macros.hpp does introspection on configuration options
// and compiler environment then sets a collection of #define macros.
#include <Kokkos_Macros.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Forward declarations for class inter-relationships
namespace Kokkos {
class HostSpace ; ///< Memory space for main process and CPU execution spaces
#if defined( KOKKOS_HAVE_SERIAL )
class Serial ; ///< Execution space main process on CPU
#endif // defined( KOKKOS_HAVE_SERIAL )
#if defined( KOKKOS_HAVE_PTHREAD )
class Threads ; ///< Execution space with pthreads back-end
#endif
#if defined( KOKKOS_HAVE_OPENMP )
class OpenMP ; ///< OpenMP execution space
#endif
#if defined( KOKKOS_HAVE_CUDA )
class CudaSpace ; ///< Memory space on Cuda GPU
class CudaUVMSpace ; ///< Memory space on Cuda GPU with UVM
class CudaHostPinnedSpace ; ///< Memory space on Host accessible to Cuda GPU
class Cuda ; ///< Execution space for Cuda GPU
#endif
template<class ExecutionSpace, class MemorySpace>
struct Device;
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Set the default execution space.
/// Define Kokkos::DefaultExecutionSpace as per configuration option
/// or chosen from the enabled execution spaces in the following order:
/// Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
namespace Kokkos {
#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA )
typedef Cuda DefaultExecutionSpace ;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultExecutionSpace ;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Threads DefaultExecutionSpace ;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultExecutionSpace ;
#else
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
#endif
#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Threads DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_HAVE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_HAVE_PTHREAD )
typedef Threads DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_HAVE_SERIAL )
typedef Serial DefaultHostExecutionSpace ;
#else
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
#endif
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Detect the active execution space and define its memory space.
// This is used to verify whether a running kernel can access
// a given memory space.
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined (KOKKOS_HAVE_CUDA)
typedef Kokkos::CudaSpace ActiveExecutionMemorySpace ;
#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
typedef Kokkos::HostSpace ActiveExecutionMemorySpace ;
#else
typedef void ActiveExecutionMemorySpace ;
#endif
template< class ActiveSpace , class MemorySpace >
struct VerifyExecutionCanAccessMemorySpace {
enum {value = 0};
};
template< class Space >
struct VerifyExecutionCanAccessMemorySpace< Space , Space >
{
enum {value = 1};
KOKKOS_INLINE_FUNCTION static void verify(void) {}
KOKKOS_INLINE_FUNCTION static void verify(const void *) {}
};
} // namespace Impl
} // namespace Kokkos
#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify( DATA_PTR )
#define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify()
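// A hedged sketch of guarding a raw-pointer access with the macro above
// (the function and data space are illustrative assumptions):
//
//   KOKKOS_INLINE_FUNCTION
//   double load_host_value( const double * p )
//   {
//     // Fails at run time when the active execution space cannot
//     // address HostSpace memory (e.g. inside a Cuda kernel):
//     KOKKOS_RESTRICT_EXECUTION_TO_DATA( Kokkos::HostSpace , p );
//     return *p ;
//   }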
namespace Kokkos {
void fence();
}
#endif /* #ifndef KOKKOS_CORE_FWD_HPP */


@ -0,0 +1,268 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_HPP
#define KOKKOS_CUDA_HPP
#include <Kokkos_Core_fwd.hpp>
// If CUDA execution space is enabled then use this header file.
#if defined( KOKKOS_HAVE_CUDA )
#include <iosfwd>
#include <vector>
#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class CudaExec ;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/// \class Cuda
/// \brief Kokkos Execution Space that uses CUDA to run on GPUs.
///
/// An "execution space" represents a parallel execution model. It tells Kokkos
/// how to parallelize the execution of kernels in a parallel_for or
/// parallel_reduce. For example, the Threads execution space uses Pthreads or
/// C++11 threads on a CPU, the OpenMP execution space uses the OpenMP language
/// extensions, and the Serial execution space executes "parallel" kernels
/// sequentially. The Cuda execution space uses NVIDIA's CUDA programming
/// model to execute kernels in parallel on GPUs.
class Cuda {
public:
//! \name Type declarations that all Kokkos execution spaces must provide.
//@{
//! Tag this class as a kokkos execution space
typedef Cuda execution_space ;
#if defined( KOKKOS_USE_CUDA_UVM )
//! This execution space's preferred memory space.
typedef CudaUVMSpace memory_space ;
#else
//! This execution space's preferred memory space.
typedef CudaSpace memory_space ;
#endif
  //! This execution space's preferred device_type.
typedef Kokkos::Device<execution_space,memory_space> device_type;
//! The size_type best suited for this execution space.
typedef memory_space::size_type size_type ;
//! This execution space's preferred array layout.
typedef LayoutLeft array_layout ;
  //! This execution space's preferred scratch memory space.
typedef ScratchMemorySpace< Cuda > scratch_memory_space ;
//@}
//--------------------------------------------------
//! \name Functions that all Kokkos devices must implement.
//@{
/// \brief True if and only if this method is being called in a
/// thread-parallel function.
KOKKOS_INLINE_FUNCTION static int in_parallel() {
#if defined( __CUDA_ARCH__ )
return true;
#else
return false;
#endif
}
/** \brief Set the device in a "sleep" state.
*
* This function sets the device in a "sleep" state in which it is
* not ready for work. This may consume less resources than if the
* device were in an "awake" state, but it may also take time to
* bring the device from a sleep state to be ready for work.
*
* \return True if the device is in the "sleep" state, else false if
* the device is actively working and could not enter the "sleep"
* state.
*/
static bool sleep();
/// \brief Wake the device from the 'sleep' state so it is ready for work.
///
/// \return True if the device is in the "ready" state, else "false"
/// if the device is actively working (which also means that it's
/// awake).
static bool wake();
/// \brief Wait until all dispatched functors complete.
///
/// The parallel_for or parallel_reduce dispatch of a functor may
/// return asynchronously, before the functor completes. This
/// method does not return until all dispatched functors on this
/// device have completed.
static void fence();
//! Free any resources being consumed by the device.
static void finalize();
  //! Whether this execution space has been initialized.
static int is_initialized();
//! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
//@}
//--------------------------------------------------
//! \name Cuda space instances
~Cuda() {}
Cuda();
explicit Cuda( const int instance_id );
Cuda( const Cuda & ) = default ;
Cuda( Cuda && ) = default ;
Cuda & operator = ( const Cuda & ) = default ;
Cuda & operator = ( Cuda && ) = default ;
//--------------------------------------------------------------------------
//! \name Device-specific functions
//@{
struct SelectDevice {
int cuda_device_id ;
SelectDevice() : cuda_device_id(0) {}
explicit SelectDevice( int id ) : cuda_device_id( id ) {}
};
//! Initialize, telling the CUDA run-time library which device to use.
static void initialize( const SelectDevice = SelectDevice()
, const size_t num_instances = 1 );
/// \brief Cuda device architecture of the selected device.
///
/// This matches the __CUDA_ARCH__ specification.
static size_type device_arch();
//! Query device count.
static size_type detect_device_count();
/** \brief Detect the available devices and their architecture
* as defined by the __CUDA_ARCH__ specification.
*/
static std::vector<unsigned> detect_device_arch();
cudaStream_t cuda_stream() const { return m_stream ; }
int cuda_device() const { return m_device ; }
//@}
//--------------------------------------------------------------------------
private:
cudaStream_t m_stream ;
int m_device ;
};
} // namespace Kokkos
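// A hedged initialization sketch for the Cuda execution space (the
// device id is an illustrative assumption; Kokkos::initialize performs
// the equivalent steps when Cuda is the default execution space):
//
//   Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
//   // ... dispatch kernels to the Cuda execution space ...
//   Kokkos::Cuda::fence();      // wait for asynchronous kernels
//   Kokkos::Cuda::finalize();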
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::CudaSpace
, Kokkos::Cuda::scratch_memory_space
>
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) { }
KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
};
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::HostSpace
, Kokkos::Cuda::scratch_memory_space
>
{
enum { value = false };
inline static void verify( void ) { CudaSpace::access_error(); }
inline static void verify( const void * p ) { CudaSpace::access_error(p); }
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_View.hpp>
#include <KokkosExp_View.hpp>
#include <Cuda/KokkosExp_Cuda_View.hpp>
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_HPP */


@ -0,0 +1,656 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDASPACE_HPP
#define KOKKOS_CUDASPACE_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_CUDA )
#include <iosfwd>
#include <typeinfo>
#include <string>
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Cuda on-device memory management */
class CudaSpace {
public:
//! Tag this class as a kokkos memory space
typedef CudaSpace memory_space ;
typedef Kokkos::Cuda execution_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef unsigned int size_type ;
typedef Impl::CudaMallocAllocator allocator;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/*--------------------------------*/
  /** \brief Cuda-specific function to attach a texture object to an allocation.
* Output the texture object, base pointer, and offset from the input pointer.
*/
#if defined( __CUDACC__ )
static void texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
);
#endif
/*--------------------------------*/
CudaSpace();
CudaSpace( const CudaSpace & rhs ) = default ;
CudaSpace & operator = ( const CudaSpace & rhs ) = default ;
~CudaSpace() = default ;
/**\brief Allocate memory in the cuda space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate memory in the cuda space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/*--------------------------------*/
/** \brief Error reporting for HostSpace attempt to access CudaSpace */
static void access_error();
static void access_error( const void * const );
private:
int m_device ; ///< Which Cuda device
// friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > ;
};
namespace Impl {
/// \brief Initialize lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function initializes the locks to zero (unset).
void init_lock_array_cuda_space();
/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function retrieves the lock array pointer,
/// allocating the array first if it has not yet been allocated.
int* lock_array_cuda_space_ptr(bool deallocate = false);
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Cuda memory that is accessible to Host execution space
* through Cuda's unified virtual memory (UVM) runtime.
*/
class CudaUVMSpace {
public:
//! Tag this class as a kokkos memory space
typedef CudaUVMSpace memory_space ;
typedef Cuda execution_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef unsigned int size_type ;
/** \brief If UVM capability is available */
static bool available();
typedef Impl::CudaUVMAllocator allocator;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
  /** \brief Cuda-specific function to attach a texture object to an allocation.
* Output the texture object, base pointer, and offset from the input pointer.
*/
#if defined( __CUDACC__ )
static void texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
);
#endif
/*--------------------------------*/
CudaUVMSpace();
CudaUVMSpace( const CudaUVMSpace & rhs ) = default ;
CudaUVMSpace & operator = ( const CudaUVMSpace & rhs ) = default ;
~CudaUVMSpace() = default ;
/**\brief Allocate memory in the cuda space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate memory in the cuda space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/*--------------------------------*/
private:
int m_device ; ///< Which Cuda device
};
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Host memory that is accessible to Cuda execution space
* through Cuda's host-pinned memory allocation.
*/
class CudaHostPinnedSpace {
public:
//! Tag this class as a kokkos memory space
/** \brief Memory is in HostSpace so use the HostSpace::execution_space */
typedef HostSpace::execution_space execution_space ;
typedef CudaHostPinnedSpace memory_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef unsigned int size_type ;
typedef Impl::CudaHostAllocator allocator ;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/*--------------------------------*/
CudaHostPinnedSpace();
CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ;
CudaHostPinnedSpace & operator = ( const CudaHostPinnedSpace & rhs ) = default ;
~CudaHostPinnedSpace() = default ;
/**\brief Allocate memory in the cuda space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate memory in the cuda space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/*--------------------------------*/
};
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<> struct DeepCopy< CudaSpace , CudaSpace >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Cuda & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< CudaSpace , HostSpace >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Cuda & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< HostSpace , CudaSpace >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Cuda & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< CudaSpace , CudaUVMSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaSpace , CudaHostPinnedSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaUVMSpace , CudaSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaUVMSpace , CudaUVMSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaUVMSpace , CudaHostPinnedSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaUVMSpace , HostSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaHostPinnedSpace , CudaSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaHostPinnedSpace , CudaUVMSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaHostPinnedSpace , CudaHostPinnedSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaHostPinnedSpace , HostSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); }
};
template<> struct DeepCopy< HostSpace , CudaUVMSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); }
};
template<> struct DeepCopy< HostSpace , CudaHostPinnedSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); }
};
} // namespace Impl
} // namespace Kokkos
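// A hedged sketch of what the Impl::DeepCopy specializations above are
// used for (the pointers and byte count are illustrative; user code
// normally goes through Kokkos::deep_copy on Views instead):
//
//   // Host -> device copy of n bytes:
//   Kokkos::Impl::DeepCopy< Kokkos::CudaSpace , Kokkos::HostSpace >( dev_ptr , host_ptr , n );
//
//   // The same copy issued on a specific Cuda instance:
//   Kokkos::Impl::DeepCopy< Kokkos::CudaSpace , Kokkos::HostSpace >( cuda , dev_ptr , host_ptr , n );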
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** Running in CudaSpace attempting to access HostSpace: error */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::HostSpace >
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static void verify( void )
{ Kokkos::abort("Cuda code attempted to access HostSpace memory"); }
KOKKOS_INLINE_FUNCTION static void verify( const void * )
{ Kokkos::abort("Cuda code attempted to access HostSpace memory"); }
};
/** Running in CudaSpace accessing CudaUVMSpace: ok */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaUVMSpace >
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) { }
KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
};
/** Running in CudaSpace accessing CudaHostPinnedSpace: ok */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace >
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) { }
KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
};
/** Running in CudaSpace attempting to access an unknown space: error */
template< class OtherSpace >
struct VerifyExecutionCanAccessMemorySpace<
typename enable_if< ! is_same<Kokkos::CudaSpace,OtherSpace>::value , Kokkos::CudaSpace >::type ,
OtherSpace >
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static void verify( void )
{ Kokkos::abort("Cuda code attempted to access unknown Space memory"); }
KOKKOS_INLINE_FUNCTION static void verify( const void * )
{ Kokkos::abort("Cuda code attempted to access unknown Space memory"); }
};
//----------------------------------------------------------------------------
/** Running in HostSpace attempting to access CudaSpace: error */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaSpace >
{
enum { value = false };
inline static void verify( void ) { CudaSpace::access_error(); }
inline static void verify( const void * p ) { CudaSpace::access_error(p); }
};
/** Running in HostSpace accessing CudaUVMSpace is OK */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaUVMSpace >
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
};
/** Running in HostSpace accessing CudaHostPinnedSpace is OK */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace >
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) {}
KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<>
class SharedAllocationRecord< Kokkos::CudaSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
friend class SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ;
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
static ::cudaTextureObject_t
attach_texture_object( const unsigned sizeof_alias
, void * const alloc_ptr
, const size_t alloc_size );
static RecordBase s_root_record ;
::cudaTextureObject_t m_tex_obj ;
const Kokkos::CudaSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {}
SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
std::string get_label() const ;
static SharedAllocationRecord * allocate( const Kokkos::CudaSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
);
template< typename AliasType >
inline
::cudaTextureObject_t attach_texture_object()
{
static_assert( ( std::is_same< AliasType , int >::value ||
std::is_same< AliasType , ::int2 >::value ||
std::is_same< AliasType , ::int4 >::value )
, "Cuda texture fetch only supported for alias types of int, ::int2, or ::int4" );
if ( m_tex_obj == 0 ) {
m_tex_obj = attach_texture_object( sizeof(AliasType)
, (void*) RecordBase::m_alloc_ptr
, RecordBase::m_alloc_size );
}
return m_tex_obj ;
}
template< typename AliasType >
inline
int attach_texture_object_offset( const AliasType * const ptr )
{
// Texture object is attached to the entire allocation range
return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
}
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::CudaSpace & , bool detail = false );
};
template<>
class SharedAllocationRecord< Kokkos::CudaUVMSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
static RecordBase s_root_record ;
::cudaTextureObject_t m_tex_obj ;
const Kokkos::CudaUVMSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {}
SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
std::string get_label() const ;
static SharedAllocationRecord * allocate( const Kokkos::CudaUVMSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
);
template< typename AliasType >
inline
::cudaTextureObject_t attach_texture_object()
{
static_assert( ( std::is_same< AliasType , int >::value ||
std::is_same< AliasType , ::int2 >::value ||
std::is_same< AliasType , ::int4 >::value )
, "Cuda texture fetch only supported for alias types of int, ::int2, or ::int4" );
if ( m_tex_obj == 0 ) {
m_tex_obj = SharedAllocationRecord< Kokkos::CudaSpace , void >::
attach_texture_object( sizeof(AliasType)
, (void*) RecordBase::m_alloc_ptr
, RecordBase::m_alloc_size );
}
return m_tex_obj ;
}
template< typename AliasType >
inline
int attach_texture_object_offset( const AliasType * const ptr )
{
// Texture object is attached to the entire allocation range
return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
}
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::CudaUVMSpace & , bool detail = false );
};
template<>
class SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
static RecordBase s_root_record ;
const Kokkos::CudaHostPinnedSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() : RecordBase(), m_space() {}
SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
std::string get_label() const ;
static SharedAllocationRecord * allocate( const Kokkos::CudaHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
);
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::CudaHostPinnedSpace & , bool detail = false );
};
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
#endif /* #define KOKKOS_CUDASPACE_HPP */


@ -0,0 +1,497 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_EXECPOLICY_HPP
#define KOKKOS_EXECPOLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_Tags.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
/** \brief Execution policy for work over a range of an integral type.
*
* Valid template argument options:
*
* With a specified execution space:
* < ExecSpace , WorkTag , { IntConst | IntType } >
* < ExecSpace , WorkTag , void >
* < ExecSpace , { IntConst | IntType } , void >
* < ExecSpace , void , void >
*
* With the default execution space:
* < WorkTag , { IntConst | IntType } , void >
* < WorkTag , void , void >
* < { IntConst | IntType } , void , void >
* < void , void , void >
*
* IntType is a fundamental integral type
* IntConst is an Impl::integral_constant< IntType , Blocking >
*
* Blocking is the granularity of partitioning the range among threads.
*/
template< class Arg0 = void , class Arg1 = void , class Arg2 = void
, class ExecSpace =
          // If the first argument is an execution space use it,
          // otherwise use the default execution space.
typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0
, Kokkos::DefaultExecutionSpace >::type
>
class RangePolicy {
private:
// Default integral type and blocking factor:
typedef int DefaultIntType ;
enum { DefaultIntValue = 8 };
enum { Arg0_Void = Impl::is_same< Arg0 , void >::value };
enum { Arg1_Void = Impl::is_same< Arg1 , void >::value };
enum { Arg2_Void = Impl::is_same< Arg2 , void >::value };
enum { Arg0_ExecSpace = Impl::is_execution_space< Arg0 >::value };
enum { Arg0_IntConst = Impl::is_integral_constant< Arg0 >::value };
enum { Arg1_IntConst = Impl::is_integral_constant< Arg1 >::value };
enum { Arg2_IntConst = Impl::is_integral_constant< Arg2 >::value };
enum { Arg0_IntType = Impl::is_integral< Arg0 >::value };
enum { Arg1_IntType = Impl::is_integral< Arg1 >::value };
enum { Arg2_IntType = Impl::is_integral< Arg2 >::value };
enum { Arg0_WorkTag = ! Arg0_ExecSpace && ! Arg0_IntConst && ! Arg0_IntType && ! Arg0_Void };
enum { Arg1_WorkTag = Arg0_ExecSpace && ! Arg1_IntConst && ! Arg1_IntType && ! Arg1_Void };
enum { ArgOption_OK = Impl::StaticAssert< (
( Arg0_ExecSpace && Arg1_WorkTag && ( Arg2_IntConst || Arg2_IntType ) ) ||
( Arg0_ExecSpace && Arg1_WorkTag && Arg2_Void ) ||
( Arg0_ExecSpace && ( Arg1_IntConst || Arg1_IntType ) && Arg2_Void ) ||
( Arg0_ExecSpace && Arg1_Void && Arg2_Void ) ||
( Arg0_WorkTag && ( Arg1_IntConst || Arg1_IntType ) && Arg2_Void ) ||
( Arg0_WorkTag && Arg1_Void && Arg2_Void ) ||
( ( Arg0_IntConst || Arg0_IntType ) && Arg1_Void && Arg2_Void ) ||
( Arg0_Void && Arg1_Void && Arg2_Void )
) >::value };
// The work argument tag is the first or second argument
typedef typename Impl::if_c< Arg0_WorkTag , Arg0 ,
typename Impl::if_c< Arg1_WorkTag , Arg1 , void
>::type >::type
WorkTag ;
enum { Granularity = Arg0_IntConst ? unsigned(Impl::is_integral_constant<Arg0>::integral_value) : (
Arg1_IntConst ? unsigned(Impl::is_integral_constant<Arg1>::integral_value) : (
Arg2_IntConst ? unsigned(Impl::is_integral_constant<Arg2>::integral_value) : (
unsigned(DefaultIntValue) ))) };
// Only accept the integral type if the blocking is a power of two
typedef typename Impl::enable_if< Impl::is_power_of_two< Granularity >::value ,
typename Impl::if_c< Arg0_IntType , Arg0 ,
typename Impl::if_c< Arg1_IntType , Arg1 ,
typename Impl::if_c< Arg2_IntType , Arg2 ,
typename Impl::if_c< Arg0_IntConst , typename Impl::is_integral_constant<Arg0>::integral_type ,
typename Impl::if_c< Arg1_IntConst , typename Impl::is_integral_constant<Arg1>::integral_type ,
typename Impl::if_c< Arg2_IntConst , typename Impl::is_integral_constant<Arg2>::integral_type ,
DefaultIntType
>::type >::type >::type
>::type >::type >::type
>::type
IntType ;
enum { GranularityMask = IntType(Granularity) - 1 };
ExecSpace m_space ;
IntType m_begin ;
IntType m_end ;
public:
//! Tag this class as an execution policy
typedef ExecSpace execution_space ;
typedef RangePolicy execution_policy ;
typedef WorkTag work_tag ;
typedef IntType member_type ;
KOKKOS_INLINE_FUNCTION const execution_space & space() const { return m_space ; }
KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
KOKKOS_INLINE_FUNCTION member_type end() const { return m_end ; }
inline RangePolicy() : m_space(), m_begin(0), m_end(0) {}
/** \brief Total range */
inline
RangePolicy( const member_type work_begin
, const member_type work_end
)
: m_space()
, m_begin( work_begin < work_end ? work_begin : 0 )
, m_end( work_begin < work_end ? work_end : 0 )
{}
/** \brief Total range */
inline
RangePolicy( const execution_space & work_space
, const member_type work_begin
, const member_type work_end
)
: m_space( work_space )
, m_begin( work_begin < work_end ? work_begin : 0 )
, m_end( work_begin < work_end ? work_end : 0 )
{}
/** \brief Subrange for a partition's rank and size.
*
* Typically used to partition a range over a group of threads.
*/
struct WorkRange {
typedef RangePolicy::work_tag work_tag ;
typedef RangePolicy::member_type member_type ;
KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
KOKKOS_INLINE_FUNCTION member_type end() const { return m_end ; }
/** \brief Subrange for a partition's rank and size.
*
* Typically used to partition a range over a group of threads.
*/
KOKKOS_INLINE_FUNCTION
WorkRange( const RangePolicy & range
, const int part_rank
, const int part_size
)
: m_begin(0), m_end(0)
{
if ( part_size ) {
// Split evenly among partitions, then round up to the granularity.
const member_type work_part =
( ( ( ( range.end() - range.begin() ) + ( part_size - 1 ) ) / part_size )
+ GranularityMask ) & ~member_type(GranularityMask);
m_begin = range.begin() + work_part * part_rank ;
m_end = m_begin + work_part ;
if ( range.end() < m_begin ) m_begin = range.end() ;
if ( range.end() < m_end ) m_end = range.end() ;
}
}
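      // Worked example (illustrative): for range [0,100), part_size = 4,
      // and Granularity = 8 : work_part = ((100+3)/4 + 7) & ~7 = 32 ,
      // giving partitions [0,32) [32,64) [64,96) [96,100) , where the
      // last is clamped to range.end().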
private:
member_type m_begin ;
member_type m_end ;
WorkRange();
WorkRange & operator = ( const WorkRange & );
};
};
} // namespace Kokkos
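// A hedged dispatch sketch using RangePolicy (the lambda body and the
// names x, y, a, n are illustrative assumptions):
//
//   // Iterate i = 0 .. n-1 in the default execution space:
//   Kokkos::parallel_for( Kokkos::RangePolicy<>( 0 , n ) ,
//     KOKKOS_LAMBDA( const int i ) { y(i) = a * x(i) + y(i); } );
//
//   // The same iteration pinned to a specific execution space:
//   Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , n ) , functor );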
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
/** \brief Execution policy for parallel work over a league of teams of threads.
*
* The work functor is called for each thread of each team such that
* the team's member threads are guaranteed to be concurrent.
*
* The team's threads have access to team shared scratch memory and
* team collective operations.
*
* If the WorkTag is non-void then the first calling argument of the
* work functor's parentheses operator is 'const WorkTag &'.
* This allows a functor to have multiple work member functions.
*
* template argument option with specified execution space:
* < ExecSpace , WorkTag >
* < ExecSpace , void >
*
* template argument option with default execution space:
* < WorkTag , void >
* < void , void >
*/
template< class Arg0 = void
, class Arg1 = void
, class ExecSpace =
// If the first argument is not an execution space
// then use the default execution space.
typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0
, Kokkos::DefaultExecutionSpace >::type
>
class TeamPolicy {
private:
enum { Arg0_ExecSpace = Impl::is_execution_space< Arg0 >::value };
enum { Arg1_Void = Impl::is_same< Arg1 , void >::value };
enum { ArgOption_OK = Impl::StaticAssert< ( Arg0_ExecSpace || Arg1_Void ) >::value };
typedef typename Impl::if_c< Arg0_ExecSpace , Arg1 , Arg0 >::type WorkTag ;
public:
//! Tag this class as an execution policy
typedef TeamPolicy execution_policy ;
typedef ExecSpace execution_space ;
typedef WorkTag work_tag ;
//----------------------------------------
/** \brief Query maximum team size for a given functor.
*
* This size takes into account execution space concurrency limitations and
* scratch memory space limitations for reductions, team reduce/scan, and
* team shared memory.
*/
template< class FunctorType >
static int team_size_max( const FunctorType & );
/** \brief Query recommended team size for a given functor.
*
* This size takes into account execution space concurrency limitations and
* scratch memory space limitations for reductions, team reduce/scan, and
* team shared memory.
*/
template< class FunctorType >
static int team_size_recommended( const FunctorType & );
template< class FunctorType >
static int team_size_recommended( const FunctorType & , const int&);
//----------------------------------------
/** \brief Construct policy with the given instance of the execution space */
TeamPolicy( const execution_space & , int league_size_request , int team_size_request );
/** \brief Construct policy with the default instance of the execution space */
TeamPolicy( int league_size_request , int team_size_request );
/** \brief The actual league size (number of teams) of the policy.
*
* This may be smaller than the requested league size due to limitations
* of the execution space.
*/
KOKKOS_INLINE_FUNCTION int league_size() const ;
/** \brief The actual team size (number of threads per team) of the policy.
*
* This may be smaller than the requested team size due to limitations
* of the execution space.
*/
KOKKOS_INLINE_FUNCTION int team_size() const ;
/** \brief Parallel execution of a functor calls the functor once with
* each member of the execution policy.
*/
struct member_type {
/** \brief Handle to the currently executing team shared scratch memory */
KOKKOS_INLINE_FUNCTION
typename execution_space::scratch_memory_space team_shmem() const ;
/** \brief Rank of this team within the league of teams */
KOKKOS_INLINE_FUNCTION int league_rank() const ;
/** \brief Number of teams in the league */
KOKKOS_INLINE_FUNCTION int league_size() const ;
/** \brief Rank of this thread within this team */
KOKKOS_INLINE_FUNCTION int team_rank() const ;
/** \brief Number of threads in this team */
KOKKOS_INLINE_FUNCTION int team_size() const ;
/** \brief Barrier among the threads of this team */
KOKKOS_INLINE_FUNCTION void team_barrier() const ;
/** \brief Intra-team reduction. Returns join of all values of the team members. */
template< class JoinOp >
KOKKOS_INLINE_FUNCTION
typename JoinOp::value_type team_reduce( const typename JoinOp::value_type
, const JoinOp & ) const ;
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const ;
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const ;
};
};
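/// A usage sketch added for illustration (the league/team sizes and the
/// lambda body are hypothetical): each team member computes its ranks.
/// \code
///   typedef Kokkos::TeamPolicy< Kokkos::DefaultExecutionSpace > policy_type ;
///   typedef policy_type::member_type member_type ;
///   Kokkos::parallel_for( policy_type( 8 /*league*/ , 4 /*team*/ ) ,
///     KOKKOS_LAMBDA( const member_type & member ) {
///       const int i = member.league_rank() * member.team_size()
///                   + member.team_rank();
///       member.team_barrier();
///       (void) i ;
///     } );
/// \endcode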
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
template<typename iType, class TeamMemberType>
struct TeamThreadRangeBoundariesStruct {
private:
KOKKOS_INLINE_FUNCTION static
iType ibegin( const iType & arg_begin
, const iType & arg_end
, const iType & arg_rank
, const iType & arg_size
)
{
return arg_begin + ( ( arg_end - arg_begin + arg_size - 1 ) / arg_size ) * arg_rank ;
}
KOKKOS_INLINE_FUNCTION static
iType iend( const iType & arg_begin
, const iType & arg_end
, const iType & arg_rank
, const iType & arg_size
)
{
const iType end_ = arg_begin + ( ( arg_end - arg_begin + arg_size - 1 ) / arg_size ) * ( arg_rank + 1 );
return end_ < arg_end ? end_ : arg_end ;
}
public:
typedef iType index_type;
const iType start;
const iType end;
enum {increment = 1};
const TeamMemberType& thread;
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread
, const iType& arg_end
)
: start( ibegin( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
, end( iend( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
, thread( arg_thread )
{}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread
, const iType& arg_begin
, const iType& arg_end
)
: start( ibegin( arg_begin , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
, end( iend( arg_begin , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
, thread( arg_thread )
{}
};
template<typename iType, class TeamMemberType>
struct ThreadVectorRangeBoundariesStruct {
typedef iType index_type;
enum {start = 0};
const iType end;
enum {increment = 1};
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const TeamMemberType& thread, const iType& count):
end( count )
{}
};
template<class TeamMemberType>
struct ThreadSingleStruct {
const TeamMemberType& team_member;
KOKKOS_INLINE_FUNCTION
ThreadSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){}
};
template<class TeamMemberType>
struct VectorSingleStruct {
const TeamMemberType& team_member;
KOKKOS_INLINE_FUNCTION
VectorSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){}
};
} // namespace Impl
/** \brief Execution policy for parallel work over the threads within a team.
 *
 * The range is split over all threads in a team. The mapping scheme depends on the architecture.
 * This policy is used together with a parallel pattern as a nested layer within a kernel launched
 * with the TeamPolicy. This variant expects a single count, so the range is [0,count).
 */
*/
template<typename iType, class TeamMemberType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& count);
/** \brief Execution policy for parallel work over the threads within a team.
 *
 * The range is split over all threads in a team. The mapping scheme depends on the architecture.
 * This policy is used together with a parallel pattern as a nested layer within a kernel launched
 * with the TeamPolicy. This variant expects a begin and an end, so the range is [begin,end).
 */
*/
template<typename iType, class TeamMemberType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& begin, const iType& end);
/** \brief Execution policy for a vector parallel loop.
*
 * The range is split over all vector lanes in a thread. The mapping scheme depends on the architecture.
 * This policy is used together with a parallel pattern as a nested layer within a kernel launched
 * with the TeamPolicy. This variant expects a single count, so the range is [0,count).
*/
template<typename iType, class TeamMemberType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(const TeamMemberType&, const iType& count);
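/// A nested-parallelism sketch added for illustration (the extents m and n
/// and the lambda bodies are hypothetical): TeamThreadRange distributes
/// rows over the threads of a team, and ThreadVectorRange distributes
/// columns over the vector lanes of each thread.
/// \code
///   KOKKOS_LAMBDA( const member_type & member ) {
///     Kokkos::parallel_for( Kokkos::TeamThreadRange( member , m ) ,
///       [&]( const int i ) {
///         Kokkos::parallel_for( Kokkos::ThreadVectorRange( member , n ) ,
///           [&]( const int j ) { /* body over (i,j) */ } );
///       } );
///   }
/// \endcode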
} // namespace Kokkos
#endif /* #define KOKKOS_EXECPOLICY_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -0,0 +1,270 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HOSTSPACE_HPP
#define KOKKOS_HOSTSPACE_HPP
#include <cstring>
#include <string>
#include <iosfwd>
#include <typeinfo>
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/KokkosExp_SharedAlloc.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
/// \brief Initialize lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function initializes the locks to zero (unset).
void init_lock_array_host_space();
/// \brief Acquire a lock for the address
///
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
bool lock_address_host_space(void* ptr);
/// \brief Release lock for the address
///
/// This function releases the lock for the hash value derived
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address.
void unlock_address_host_space(void* ptr);
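/// A host-side usage sketch added for illustration (the pointer 'ptr' and
/// the update are hypothetical): spin until the lock is acquired, perform
/// the read-modify-write, then release.
/// \code
///   while ( ! Kokkos::Impl::lock_address_host_space( ptr ) ) { /* retry */ }
///   // ... perform the arbitrary-size atomic update on *ptr ...
///   Kokkos::Impl::unlock_address_host_space( ptr );
/// \endcode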
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
/// \class HostSpace
/// \brief Memory management for host memory.
///
/// HostSpace is a memory space that governs host memory. "Host"
/// memory means the usual CPU-accessible memory.
class HostSpace {
public:
//! Tag this class as a kokkos memory space
typedef HostSpace memory_space ;
typedef size_t size_type ;
/// \typedef execution_space
/// \brief Default execution space for this memory space.
///
/// Every memory space has a default execution space. This is
/// useful for things like initializing a View (which happens in
/// parallel using the View's default execution space).
#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Kokkos::Threads execution_space ;
#elif defined( KOKKOS_HAVE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
#elif defined( KOKKOS_HAVE_PTHREAD )
typedef Kokkos::Threads execution_space ;
#elif defined( KOKKOS_HAVE_SERIAL )
typedef Kokkos::Serial execution_space ;
#else
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
#endif
//! This memory space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY )
typedef Impl::PageAlignedAllocator allocator ;
#else
typedef Impl::AlignedAllocator allocator ;
#endif
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/*--------------------------------*/
/* Functions unique to the HostSpace */
static int in_parallel();
static void register_in_parallel( int (*)() );
/*--------------------------------*/
/**\brief Default memory space instance */
HostSpace();
HostSpace( const HostSpace & rhs ) = default ;
HostSpace & operator = ( const HostSpace & ) = default ;
~HostSpace() = default ;
/**\brief Non-default memory space instance to choose the allocation mechanism, if available */
enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
explicit
HostSpace( const AllocationMechanism & );
/**\brief Allocate memory in the host space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate memory in the host space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
private:
AllocationMechanism m_alloc_mech ;
friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > ;
};
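/// An allocation sketch added for illustration (the size and the
/// POSIX_MEMALIGN choice are hypothetical):
/// \code
///   Kokkos::HostSpace space( Kokkos::HostSpace::POSIX_MEMALIGN );
///   void * const ptr = space.allocate( 1024 );
///   // ... use the 1024-byte block ...
///   space.deallocate( ptr , 1024 );
/// \endcode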
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<>
class SharedAllocationRecord< Kokkos::HostSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
friend Kokkos::HostSpace ;
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
/**\brief Root record for tracked allocations from this HostSpace instance */
static RecordBase s_root_record ;
const Kokkos::HostSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() = default ;
SharedAllocationRecord( const Kokkos::HostSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
inline
std::string get_label() const
{
return std::string( RecordBase::head()->m_label );
}
KOKKOS_INLINE_FUNCTION static
SharedAllocationRecord * allocate( const Kokkos::HostSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
#else
return (SharedAllocationRecord *) 0 ;
#endif
}
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::HostSpace & , bool detail = false );
};
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class , class > struct DeepCopy ;
template<>
struct DeepCopy<HostSpace,HostSpace> {
DeepCopy( void * dst , const void * src , size_t n );
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_HOSTSPACE_HPP */

View File

@ -0,0 +1,174 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Layout.hpp
/// \brief Declaration of various \c MemoryLayout options.
#ifndef KOKKOS_LAYOUT_HPP
#define KOKKOS_LAYOUT_HPP
#include <stddef.h>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
namespace Kokkos {
//----------------------------------------------------------------------------
/// \struct LayoutLeft
/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
/// striding of multi-indices.
///
/// This is an example of a \c MemoryLayout template parameter of
/// View. The memory layout describes how View maps from a
/// multi-index (i0, i1, ..., ik) to a memory location.
///
/// "Layout left" indicates a mapping where the leftmost index i0
/// refers to contiguous access, and strides increase for dimensions
/// going right from there (i1, i2, ...). This layout imitates how
/// Fortran stores multi-dimensional arrays. For the special case of
/// a two-dimensional array, "layout left" is also called "column
/// major."
struct LayoutLeft {
//! Tag this class as a kokkos array layout
typedef LayoutLeft array_layout ;
};
//----------------------------------------------------------------------------
/// \struct LayoutRight
/// \brief Memory layout tag indicating right-to-left (C or
/// lexicographical scheme) striding of multi-indices.
///
/// This is an example of a \c MemoryLayout template parameter of
/// View. The memory layout describes how View maps from a
/// multi-index (i0, i1, ..., ik) to a memory location.
///
/// "Right layout" indicates a mapping where the rightmost index ik
/// refers to contiguous access, and strides increase for dimensions
/// going left from there. This layout imitates how C stores
/// multi-dimensional arrays. For the special case of a
/// two-dimensional array, "layout right" is also called "row major."
struct LayoutRight {
//! Tag this class as a kokkos array layout
typedef LayoutRight array_layout ;
};
//----------------------------------------------------------------------------
/// \struct LayoutStride
/// \brief Memory layout tag indicating arbitrarily strided
/// multi-index mapping into contiguous memory.
struct LayoutStride {
//! Tag this class as a kokkos array layout
typedef LayoutStride array_layout ;
enum { MAX_RANK = 8 };
size_t dimension[ MAX_RANK ] ;
size_t stride[ MAX_RANK ] ;
/** \brief Compute strides from ordered dimensions.
*
* Values of order uniquely form the set [0..rank)
* and specify ordering of the dimensions.
* Order = {0,1,2,...} is LayoutLeft
* Order = {...,2,1,0} is LayoutRight
*/
template< typename iTypeOrder , typename iTypeDimen >
KOKKOS_INLINE_FUNCTION static
LayoutStride order_dimensions( int const rank
, iTypeOrder const * const order
, iTypeDimen const * const dimen )
{
LayoutStride tmp ;
// Verify valid rank order:
int check_input = MAX_RANK < rank ? 0 : int( 1 << rank ) - 1 ;
for ( int r = 0 ; r < MAX_RANK ; ++r ) {
  tmp.dimension[r] = 0 ;
  tmp.stride[r]    = 0 ;
  // Only the first 'rank' entries of 'order' are valid input; guard the read:
  if ( r < rank ) check_input &= ~int( 1 << order[r] );
}
if ( 0 == check_input ) {
size_t n = 1 ;
for ( int r = 0 ; r < rank ; ++r ) {
tmp.stride[ order[r] ] = n ;
n *= ( dimen[order[r]] );
tmp.dimension[r] = dimen[r];
}
}
return tmp ;
}
};
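/// A worked sketch of order_dimensions added for illustration (the rank-3
/// dimensions {2,3,4} are hypothetical): order {0,1,2} reproduces
/// LayoutLeft strides and order {2,1,0} reproduces LayoutRight strides.
/// \code
///   const int ord_left[3]  = { 0 , 1 , 2 };
///   const int ord_right[3] = { 2 , 1 , 0 };
///   const int dims[3]      = { 2 , 3 , 4 };
///   Kokkos::LayoutStride L = Kokkos::LayoutStride::order_dimensions( 3 , ord_left  , dims );
///   Kokkos::LayoutStride R = Kokkos::LayoutStride::order_dimensions( 3 , ord_right , dims );
///   // L.stride = { 1 , 2 , 6 , ... }  : strides grow left-to-right
///   // R.stride = { 12 , 4 , 1 , ... } : strides grow right-to-left
/// \endcode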
//----------------------------------------------------------------------------
/// \struct LayoutTileLeft
/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
/// striding of multi-indices by tiles.
///
/// This is an example of a \c MemoryLayout template parameter of
/// View. The memory layout describes how View maps from a
/// multi-index (i0, i1, ..., ik) to a memory location.
///
/// "Tiled layout" indicates a mapping to contiguously stored
/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two
/// dimensions. Indices are LayoutLeft within each tile, and the
/// tiles themselves are arranged using LayoutLeft. Note that the
/// dimensions <tt>ArgN0</tt> and <tt>ArgN1</tt> of the tiles must be
/// compile-time constants. This speeds up index calculations. If
/// both tile dimensions are powers of two, Kokkos can optimize
/// further.
template < unsigned ArgN0 , unsigned ArgN1 ,
bool IsPowerOfTwo = ( Impl::is_power_of_two<ArgN0>::value &&
Impl::is_power_of_two<ArgN1>::value )
>
struct LayoutTileLeft {
//! Tag this class as a kokkos array layout
typedef LayoutTileLeft<ArgN0,ArgN1,IsPowerOfTwo> array_layout ;
enum { N0 = ArgN0 };
enum { N1 = ArgN1 };
};
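/// A declaration sketch added for illustration: 4x4 tiles, a
/// power-of-two case the implementation can optimize.
/// \code
///   typedef Kokkos::LayoutTileLeft<4,4> TileLayout ;
///   // TileLayout::N0 == 4 and TileLayout::N1 == 4
/// \endcode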
} // namespace Kokkos
#endif // #ifndef KOKKOS_LAYOUT_HPP

View File

@ -0,0 +1,397 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_MACROS_HPP
#define KOKKOS_MACROS_HPP
//----------------------------------------------------------------------------
/** Pick up configure/build options via #define macros:
*
* KOKKOS_HAVE_CUDA Kokkos::Cuda execution and memory spaces
* KOKKOS_HAVE_PTHREAD Kokkos::Threads execution space
* KOKKOS_HAVE_QTHREAD Kokkos::Qthread execution space
* KOKKOS_HAVE_OPENMP Kokkos::OpenMP execution space
* KOKKOS_HAVE_HWLOC HWLOC library is available
* KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK inserts array bounds checks (expensive!)
* KOKKOS_HAVE_CXX11 enable C++11 features
*
* KOKKOS_HAVE_MPI negotiate MPI/execution space interactions
*
* KOKKOS_USE_CUDA_UVM Use CUDA UVM for Cuda memory space
*/
#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
#include <KokkosCore_config.h>
#endif
//----------------------------------------------------------------------------
/** Pick up compiler specific #define macros:
*
* Macros for known compilers evaluate to an integral version value
*
* KOKKOS_COMPILER_NVCC
* KOKKOS_COMPILER_GNU
* KOKKOS_COMPILER_INTEL
* KOKKOS_COMPILER_IBM
* KOKKOS_COMPILER_CRAYC
* KOKKOS_COMPILER_APPLECC
* KOKKOS_COMPILER_CLANG
* KOKKOS_COMPILER_PGI
*
* Macros for which compiler extension to use for atomics on intrinsic types
*
* KOKKOS_ATOMICS_USE_CUDA
* KOKKOS_ATOMICS_USE_GNU
* KOKKOS_ATOMICS_USE_INTEL
* KOKKOS_ATOMICS_USE_OPENMP31
*
* A suite of 'KOKKOS_HAVE_PRAGMA_...' are defined for internal use.
*
* Macros for marking functions to run in an execution space:
*
* KOKKOS_FUNCTION
* KOKKOS_INLINE_FUNCTION request compiler to inline
* KOKKOS_FORCEINLINE_FUNCTION force compiler to inline, use with care!
*/
//----------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ )
/* Compiling with a CUDA compiler.
*
* Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
* CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
*
* When generating device code the __CUDA_ARCH__ macro is defined as:
* __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
*/
#include <cuda_runtime.h>
#include <cuda.h>
#if ! defined( CUDA_VERSION )
#error "#include <cuda.h> did not define CUDA_VERSION"
#endif
#if ( CUDA_VERSION < 6050 )
// CUDA supports C++11 in device code (unofficially) starting with
// version 6.5. This includes the auto type specifier and lambdas
// defined inside device code.
#error "Cuda version 6.5 or greater required"
#endif
#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
/* Compiling with CUDA compiler for device code. */
#error "Cuda device capability >= 3.0 is required"
#endif
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */
/*--------------------------------------------------------------------------*/
/* Language info: C++, CUDA, OPENMP */
#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
// Compiling Cuda code to 'ptx'
#define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__
#define KOKKOS_INLINE_FUNCTION __device__ __host__ inline
#define KOKKOS_FUNCTION __device__ __host__
#endif /* #if defined( __CUDA_ARCH__ ) */
#if defined( _OPENMP )
/* Compiling with OpenMP.
* The value of _OPENMP is an integer value YYYYMM
* where YYYY and MM are the year and month designation
* of the supported OpenMP API version.
*/
#endif /* #if defined( _OPENMP ) */
/*--------------------------------------------------------------------------*/
/* Mapping compiler built-ins to KOKKOS_COMPILER_*** macros */
#if defined( __NVCC__ )
// NVIDIA compiler is being used.
// Code is parsed and separated into host and device code.
// Host code is compiled again with another compiler.
// Device code is compiled to 'ptx'.
#define KOKKOS_COMPILER_NVCC __NVCC__
#else
#if defined( KOKKOS_HAVE_CXX11 ) && ! defined( KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA )
// CUDA (including version 6.5) does not support giving lambdas as
// arguments to global functions. Thus it is not currently possible
// to dispatch lambdas from the host.
#define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif /* #if defined( __NVCC__ ) */
#if defined( KOKKOS_HAVE_CXX11 ) && !defined (KOKKOS_LAMBDA)
#define KOKKOS_LAMBDA [=]
#endif
#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */
/* Intel compiler for host code */
#if defined( __INTEL_COMPILER )
#define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
#elif defined( __ICC )
// Old define
#define KOKKOS_COMPILER_INTEL __ICC
#elif defined( __ECC )
// Very old define
#define KOKKOS_COMPILER_INTEL __ECC
#endif
/* CRAY compiler for host code */
#if defined( _CRAYC )
#define KOKKOS_COMPILER_CRAYC _CRAYC
#endif
#if defined( __IBMCPP__ )
// IBM C++
#define KOKKOS_COMPILER_IBM __IBMCPP__
#elif defined( __IBMC__ )
#define KOKKOS_COMPILER_IBM __IBMC__
#endif
#if defined( __APPLE_CC__ )
#define KOKKOS_COMPILER_APPLECC __APPLE_CC__
#endif
#if defined (__clang__) && !defined (KOKKOS_COMPILER_INTEL)
#define KOKKOS_COMPILER_CLANG __clang_major__*100+__clang_minor__*10+__clang_patchlevel__
#endif
#if ! defined( __clang__ ) && ! defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
#define KOKKOS_COMPILER_GNU __GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__
#endif
#if defined( __PGIC__ ) && ! defined( __GNUC__ )
#define KOKKOS_COMPILER_PGI __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__
#endif
#endif /* #if ! defined( __CUDA_ARCH__ ) */
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Intel compiler macros */
#if defined( KOKKOS_COMPILER_INTEL )
#define KOKKOS_HAVE_PRAGMA_UNROLL 1
#define KOKKOS_HAVE_PRAGMA_IVDEP 1
#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
#define KOKKOS_HAVE_PRAGMA_VECTOR 1
#define KOKKOS_HAVE_PRAGMA_SIMD 1
#if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_ENABLE_ASM ) && ! defined( _WIN32 )
#define KOKKOS_ENABLE_ASM 1
#endif
#if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#if !defined (_WIN32)
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#else
#define KOKKOS_FORCEINLINE_FUNCTION inline
#endif
#endif
#if defined( __MIC__ )
// Compiling for Xeon Phi
#endif
#endif
/*--------------------------------------------------------------------------*/
/* Cray compiler macros */
#if defined( KOKKOS_COMPILER_CRAYC )
#endif
/*--------------------------------------------------------------------------*/
/* IBM Compiler macros */
#if defined( KOKKOS_COMPILER_IBM )
#define KOKKOS_HAVE_PRAGMA_UNROLL 1
//#define KOKKOS_HAVE_PRAGMA_IVDEP 1
//#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_HAVE_PRAGMA_VECTOR 1
//#define KOKKOS_HAVE_PRAGMA_SIMD 1
#endif
/*--------------------------------------------------------------------------*/
/* CLANG compiler macros */
#if defined( KOKKOS_COMPILER_CLANG )
//#define KOKKOS_HAVE_PRAGMA_UNROLL 1
//#define KOKKOS_HAVE_PRAGMA_IVDEP 1
//#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_HAVE_PRAGMA_VECTOR 1
//#define KOKKOS_HAVE_PRAGMA_SIMD 1
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#endif
/*--------------------------------------------------------------------------*/
/* GNU Compiler macros */
#if defined( KOKKOS_COMPILER_GNU )
//#define KOKKOS_HAVE_PRAGMA_UNROLL 1
//#define KOKKOS_HAVE_PRAGMA_IVDEP 1
//#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_HAVE_PRAGMA_VECTOR 1
//#define KOKKOS_HAVE_PRAGMA_SIMD 1
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#if ! defined( KOKKOS_ENABLE_ASM ) && \
! ( defined( __powerpc) || \
defined(__powerpc__) || \
defined(__powerpc64__) || \
defined(__POWERPC__) || \
defined(__ppc__) || \
defined(__ppc64__) || \
defined(__PGIC__) )
#define KOKKOS_ENABLE_ASM 1
#endif
#endif
/*--------------------------------------------------------------------------*/
#if defined( KOKKOS_COMPILER_PGI )
#define KOKKOS_HAVE_PRAGMA_UNROLL 1
#define KOKKOS_HAVE_PRAGMA_IVDEP 1
//#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
#define KOKKOS_HAVE_PRAGMA_VECTOR 1
//#define KOKKOS_HAVE_PRAGMA_SIMD 1
#endif
/*--------------------------------------------------------------------------*/
#if defined( KOKKOS_COMPILER_NVCC )
#if defined(__CUDA_ARCH__ )
#define KOKKOS_HAVE_PRAGMA_UNROLL 1
#endif
#endif
//----------------------------------------------------------------------------
/** Define function marking macros if compiler specific macros are undefined: */
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline
#endif
#if ! defined( KOKKOS_INLINE_FUNCTION )
#define KOKKOS_INLINE_FUNCTION inline
#endif
#if ! defined( KOKKOS_FUNCTION )
#define KOKKOS_FUNCTION /**/
#endif
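/* An annotation sketch added for illustration (the function is
 * hypothetical): mark a free function with the macros above so it can be
 * called from host code and, when CUDA is enabled, from device code.
 *
 *   KOKKOS_INLINE_FUNCTION
 *   double square( const double x ) { return x * x ; }
 *
 * Under nvcc this expands to '__device__ __host__ inline'; in a host-only
 * build it falls back to plain 'inline'.
 */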
//----------------------------------------------------------------------------
/** Determine the default execution space for parallel dispatch.
* There is zero or one default execution space specified.
*/
#if 1 < ( ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
#error "More than one KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* specified" ;
#endif
/** If a default is not specified then choose from the enabled execution spaces.
* Priority: CUDA, OPENMP, THREADS, SERIAL
*/
#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA )
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
#elif defined ( KOKKOS_HAVE_CUDA )
#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA
#elif defined ( KOKKOS_HAVE_OPENMP )
#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP
#elif defined ( KOKKOS_HAVE_PTHREAD )
#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS
#else
#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL
#endif
//----------------------------------------------------------------------------
/** Determine for what space the code is being compiled: */
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined (KOKKOS_HAVE_CUDA)
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
#else
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_MACROS_HPP */

View File

@ -0,0 +1,116 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_MEMORYTRAITS_HPP
#define KOKKOS_MEMORYTRAITS_HPP
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
/** \brief Memory access traits for views, an extension point.
*
* These traits should be orthogonal. If there are dependencies then
* the MemoryTraits template must detect and enforce dependencies.
*
* A zero value is the default for a View, indicating that none of
* these traits are present.
*/
enum MemoryTraitsFlags
{ Unmanaged = 0x01
, RandomAccess = 0x02
, Atomic = 0x04
};
template < unsigned T >
struct MemoryTraits {
//! Tag this class as a kokkos memory traits:
typedef MemoryTraits memory_traits ;
enum { Unmanaged = T & unsigned(Kokkos::Unmanaged) };
enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) };
enum { Atomic = T & unsigned(Kokkos::Atomic) };
};
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
typedef Kokkos::MemoryTraits<0> MemoryManaged ;
typedef Kokkos::MemoryTraits< Kokkos::Unmanaged > MemoryUnmanaged ;
typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryRandomAccess ;
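/** A combination sketch added for illustration (the typedef name is
 *  hypothetical): flags are bitwise-or'ed, so an unmanaged view with
 *  atomic element access would use
 *
 *    typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::Atomic >
 *      MemoryUnmanagedAtomic ;
 */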
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief Memory alignment settings
*
* Sets global value for memory alignment. Must be a power of two!
 * Enables compatibility of views from different devices with static strides.
 * Override the default via the KOKKOS_MEMORY_ALIGNMENT compiler define.
*/
enum { MEMORY_ALIGNMENT =
#if defined( KOKKOS_MEMORY_ALIGNMENT )
( 1 << Kokkos::Impl::power_of_two< KOKKOS_MEMORY_ALIGNMENT >::value )
#else
( 1 << Kokkos::Impl::power_of_two< 128 >::value )
#endif
, MEMORY_ALIGNMENT_THRESHOLD = 4
};
} //namespace Impl
} // namespace Kokkos
#endif /* #ifndef KOKKOS_MEMORYTRAITS_HPP */

View File

@ -0,0 +1,175 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMP_HPP
#define KOKKOS_OPENMP_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP )
#include <omp.h>
#include <cstddef>
#include <iosfwd>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/// \class OpenMP
/// \brief Kokkos device for multicore processors in the host memory space.
class OpenMP {
public:
//------------------------------------
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as a kokkos execution space
typedef OpenMP execution_space ;
typedef HostSpace memory_space ;
//! This execution space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef LayoutRight array_layout ;
typedef HostSpace::size_type size_type ;
typedef ScratchMemorySpace< OpenMP > scratch_memory_space ;
//@}
//------------------------------------
//! \name Functions that all Kokkos devices must implement.
//@{
inline static bool in_parallel() { return omp_in_parallel(); }
/** \brief Set the device in a "sleep" state. A noop for OpenMP. */
static bool sleep();
/** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */
static bool wake();
/** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
static void fence() {}
/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
/// \brief Free any resources being consumed by the device.
static void finalize();
/** \brief Initialize the device.
*
* 1) If the hardware locality library is enabled and OpenMP has not
* already bound threads then bind OpenMP threads to maximize
* core utilization and group for memory hierarchy locality.
*
* 2) Allocate a HostThread for each OpenMP thread to hold its
* topology and fan in/out data.
*/
static void initialize( unsigned thread_count = 0 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 );
static int is_initialized();
//@}
//------------------------------------
/** \brief This execution space has a topological thread pool which can be queried.
*
* All threads within a pool have a common memory space for which they are cache coherent.
* depth = 0 gives the number of threads in the whole pool.
* depth = 1 gives the number of threads in a NUMA region, typically sharing L3 cache.
* depth = 2 gives the number of threads at the finest granularity, typically sharing L1 cache.
*/
inline static int thread_pool_size( int depth = 0 );
/** \brief The rank of the executing thread in this thread pool */
KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
//------------------------------------
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
KOKKOS_INLINE_FUNCTION static
unsigned hardware_thread_id() { return thread_pool_rank(); }
};
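/* A query sketch added for illustration (valid after
 * Kokkos::OpenMP::initialize() has been called):
 *
 *   const int pool = Kokkos::OpenMP::thread_pool_size(0); // whole pool
 *   const int numa = Kokkos::OpenMP::thread_pool_size(1); // per NUMA region
 *   const int fine = Kokkos::OpenMP::thread_pool_size(2); // finest grouping
 */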
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::OpenMP::memory_space
, Kokkos::OpenMP::scratch_memory_space
>
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
#include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
/*--------------------------------------------------------------------------*/
#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP ) */
#endif /* #ifndef KOKKOS_OPENMP_HPP */

View File

@ -0,0 +1,498 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
/// \file Kokkos_Pair.hpp
/// \brief Declaration and definition of Kokkos::pair.
///
/// This header file declares and defines Kokkos::pair and its related
/// nonmember functions.
#ifndef KOKKOS_PAIR_HPP
#define KOKKOS_PAIR_HPP
#include <Kokkos_Macros.hpp>
#include <utility>
namespace Kokkos {
/// \struct pair
/// \brief Replacement for std::pair that works on CUDA devices.
///
/// The instance methods of std::pair, including its constructors, are
/// not marked as <tt>__device__</tt> functions. Thus, they cannot be
/// called on a CUDA device, such as an NVIDIA GPU. This struct
/// implements the same interface as std::pair, but can be used on a
/// CUDA device as well as on the host.
template <class T1, class T2>
struct pair
{
//! The first template parameter of this class.
typedef T1 first_type;
//! The second template parameter of this class.
typedef T2 second_type;
//! The first element of the pair.
first_type first;
//! The second element of the pair.
second_type second;
/// \brief Default constructor.
///
/// This calls the default constructors of T1 and T2. It won't
/// compile if those default constructors are not defined and
/// public.
KOKKOS_FORCEINLINE_FUNCTION
pair()
: first(), second()
{}
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
pair(first_type const& f, second_type const& s)
: first(f), second(s)
{}
/// \brief Copy constructor.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Assignment operator.
///
/// This calls the assignment operators of T1 and T2. It won't
/// compile if the assignment operators are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair<T1, T2> & operator=(const pair<U,V> &p)
{
first = p.first;
second = p.second;
return *this;
}
// from std::pair<U,V>
template <class U, class V>
pair( const std::pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Return the std::pair version of this object.
///
/// This is <i>not</i> a device function; you may not call it on a
/// CUDA device. It is meant to be called on the host, if the user
/// wants an std::pair instead of a Kokkos::pair.
///
/// \note This is not a conversion operator, since defining a
/// conversion operator made the relational operators have
/// ambiguous definitions.
std::pair<T1,T2> to_std_pair() const
{ return std::make_pair(first,second); }
};
template <class T1, class T2>
struct pair<T1&, T2&>
{
//! The first template parameter of this class.
typedef T1& first_type;
//! The second template parameter of this class.
typedef T2& second_type;
//! The first element of the pair.
first_type first;
//! The second element of the pair.
second_type second;
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
pair(first_type f, second_type s)
: first(f), second(s)
{}
/// \brief Copy constructor.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
// from std::pair<U,V>
template <class U, class V>
pair( const std::pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Assignment operator.
///
/// This calls the assignment operators of T1 and T2. It won't
/// compile if the assignment operators are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair<first_type, second_type> & operator=(const pair<U,V> &p)
{
first = p.first;
second = p.second;
return *this;
}
/// \brief Return the std::pair version of this object.
///
/// This is <i>not</i> a device function; you may not call it on a
/// CUDA device. It is meant to be called on the host, if the user
/// wants an std::pair instead of a Kokkos::pair.
///
/// \note This is not a conversion operator, since defining a
/// conversion operator made the relational operators have
/// ambiguous definitions.
std::pair<T1,T2> to_std_pair() const
{ return std::make_pair(first,second); }
};
template <class T1, class T2>
struct pair<T1, T2&>
{
//! The first template parameter of this class.
typedef T1 first_type;
//! The second template parameter of this class.
typedef T2& second_type;
//! The first element of the pair.
first_type first;
//! The second element of the pair.
second_type second;
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
pair(first_type const& f, second_type s)
: first(f), second(s)
{}
/// \brief Copy constructor.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
// from std::pair<U,V>
template <class U, class V>
pair( const std::pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Assignment operator.
///
/// This calls the assignment operators of T1 and T2. It won't
/// compile if the assignment operators are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair<first_type, second_type> & operator=(const pair<U,V> &p)
{
first = p.first;
second = p.second;
return *this;
}
/// \brief Return the std::pair version of this object.
///
/// This is <i>not</i> a device function; you may not call it on a
/// CUDA device. It is meant to be called on the host, if the user
/// wants an std::pair instead of a Kokkos::pair.
///
/// \note This is not a conversion operator, since defining a
/// conversion operator made the relational operators have
/// ambiguous definitions.
std::pair<T1,T2> to_std_pair() const
{ return std::make_pair(first,second); }
};
template <class T1, class T2>
struct pair<T1&, T2>
{
//! The first template parameter of this class.
typedef T1& first_type;
//! The second template parameter of this class.
typedef T2 second_type;
//! The first element of the pair.
first_type first;
//! The second element of the pair.
second_type second;
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
pair(first_type f, second_type const& s)
: first(f), second(s)
{}
/// \brief Copy constructor.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
// from std::pair<U,V>
template <class U, class V>
pair( const std::pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Assignment operator.
///
/// This calls the assignment operators of T1 and T2. It won't
/// compile if the assignment operators are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair<first_type, second_type> & operator=(const pair<U,V> &p)
{
first = p.first;
second = p.second;
return *this;
}
/// \brief Return the std::pair version of this object.
///
/// This is <i>not</i> a device function; you may not call it on a
/// CUDA device. It is meant to be called on the host, if the user
/// wants an std::pair instead of a Kokkos::pair.
///
/// \note This is not a conversion operator, since defining a
/// conversion operator made the relational operators have
/// ambiguous definitions.
std::pair<T1,T2> to_std_pair() const
{ return std::make_pair(first,second); }
};
//! Equality operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return lhs.first==rhs.first && lhs.second==rhs.second; }
//! Inequality operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(lhs==rhs); }
//! Less-than operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator< (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); }
//! Less-than-or-equal-to operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(rhs<lhs); }
//! Greater-than operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator> (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return rhs<lhs; }
//! Greater-than-or-equal-to operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(lhs<rhs); }
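// Note: these operators give the same lexicographic ordering as
// std::pair's relational operators; for example, pair(1,5) < pair(2,0)
// and pair(1,5) < pair(1,7) both hold.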
/// \brief Return a new pair.
///
/// This is a "nonmember constructor" for Kokkos::pair. It works just
/// like std::make_pair.
template <class T1,class T2>
KOKKOS_FORCEINLINE_FUNCTION
pair<T1,T2> make_pair (T1 x, T2 y)
{ return ( pair<T1,T2>(x,y) ); }
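// A minimal usage sketch (illustrative only, not part of the interface):
//   Kokkos::pair<int, float> p = Kokkos::make_pair( 1 , 2.5f );
//   std::pair<int, float>  q = p.to_std_pair(); // host-only call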
/// \brief Return a pair of references to the input arguments.
///
/// This is analogous to std::tie (new in C++11).  You can use it to
/// assign to two variables at once, from the result of a function
/// that returns a pair. For example (<tt>__device__</tt> and
/// <tt>__host__</tt> attributes omitted for brevity):
/// \code
/// // Declaration of the function to call.
/// // First return value: operation count.
/// // Second return value: whether all operations succeeded.
/// Kokkos::pair<int, bool> someFunction ();
///
/// // Code that uses Kokkos::tie.
/// int myFunction () {
/// int count = 0;
/// bool success = false;
///
/// // This assigns to both count and success.
/// Kokkos::tie (count, success) = someFunction ();
///
/// if (! success) {
/// // ... Some operation failed;
/// // take corrective action ...
/// }
/// return count;
/// }
/// \endcode
///
/// The line that uses tie() could have been written like this:
/// \code
/// Kokkos::pair<int, bool> result = someFunction ();
/// count = result.first;
/// success = result.second;
/// \endcode
///
/// Using tie() saves two lines of code and avoids a copy of each
/// element of the pair. The latter could be significant if one or
/// both elements of the pair are more substantial objects than \c int
/// or \c bool.
template <class T1,class T2>
KOKKOS_FORCEINLINE_FUNCTION
pair<T1 &,T2 &> tie (T1 & x, T2 & y)
{ return ( pair<T1 &,T2 &>(x,y) ); }
//
// Specialization of Kokkos::pair for a \c void second argument. This
// is not actually a "pair"; it only contains one element, the first.
//
template <class T1>
struct pair<T1,void>
{
typedef T1 first_type;
typedef void second_type;
first_type first;
enum { second = 0 };
KOKKOS_FORCEINLINE_FUNCTION
pair()
: first()
{}
KOKKOS_FORCEINLINE_FUNCTION
pair(const first_type & f)
: first(f)
{}
KOKKOS_FORCEINLINE_FUNCTION
pair(const first_type & f, int)
: first(f)
{}
template <class U>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,void> &p)
: first(p.first)
{}
template <class U>
KOKKOS_FORCEINLINE_FUNCTION
pair<T1, void> & operator=(const pair<U,void> &p)
{
first = p.first;
return *this;
}
};
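// Illustrative usage sketch of this specialization: only 'first' is
// stored, and 'second' is the enum constant 0 rather than a data member.
//   Kokkos::pair<int, void> p( 42 );  // p.first == 42 , p.second == 0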
//
// Specialization of relational operators for Kokkos::pair<T1,void>.
//
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return lhs.first==rhs.first; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(lhs==rhs); }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator< (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return lhs.first<rhs.first; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(rhs<lhs); }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator> (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return rhs<lhs; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(lhs<rhs); }
} // namespace Kokkos
#endif //KOKKOS_PAIR_HPP

View File

@ -0,0 +1,908 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Parallel.hpp
/// \brief Declaration of parallel operators
#ifndef KOKKOS_PARALLEL_HPP
#define KOKKOS_PARALLEL_HPP
#include <cstddef>
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_View.hpp>
#include <Kokkos_ExecPolicy.hpp>
#ifdef KOKKOSP_ENABLE_PROFILING
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <typeinfo>
#endif
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#ifdef KOKKOS_HAVE_DEBUG
#include<iostream>
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
/** \brief Given a Functor and an Execution Policy, query an execution space.
 *
 *  If the Policy has an execution_space, use that;
 *  else if the Functor has an execution_space, use that;
 *  else if the Functor has a device_type, use that (for backward compatibility);
 *  else use the default.
*/
template< class Functor
, class Policy
, class EnableFunctor = void
, class EnablePolicy = void
>
struct FunctorPolicyExecutionSpace {
typedef Kokkos::DefaultExecutionSpace execution_space ;
};
template< class Functor , class Policy >
struct FunctorPolicyExecutionSpace
< Functor , Policy
, typename enable_if_type< typename Functor::device_type >::type
, typename enable_if_type< typename Policy ::execution_space >::type
>
{
typedef typename Policy ::execution_space execution_space ;
};
template< class Functor , class Policy >
struct FunctorPolicyExecutionSpace
< Functor , Policy
, typename enable_if_type< typename Functor::execution_space >::type
, typename enable_if_type< typename Policy ::execution_space >::type
>
{
typedef typename Policy ::execution_space execution_space ;
};
template< class Functor , class Policy , class EnableFunctor >
struct FunctorPolicyExecutionSpace
< Functor , Policy
, EnableFunctor
, typename enable_if_type< typename Policy::execution_space >::type
>
{
typedef typename Policy ::execution_space execution_space ;
};
template< class Functor , class Policy , class EnablePolicy >
struct FunctorPolicyExecutionSpace
< Functor , Policy
, typename enable_if_type< typename Functor::device_type >::type
, EnablePolicy
>
{
typedef typename Functor::device_type execution_space ;
};
template< class Functor , class Policy , class EnablePolicy >
struct FunctorPolicyExecutionSpace
< Functor , Policy
, typename enable_if_type< typename Functor::execution_space >::type
, EnablePolicy
>
{
typedef typename Functor::execution_space execution_space ;
};
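// Illustrative example of the selection rules above, assuming a
// hypothetical functor type that is not part of this header:
//   struct MyFunctor {
//     typedef Kokkos::DefaultHostExecutionSpace execution_space ;
//     void operator()( int ) const {}
//   };
//   // With a policy type that lacks an execution_space typedef,
//   // FunctorPolicyExecutionSpace< MyFunctor , void >::execution_space
//   // is MyFunctor::execution_space; with no hints from either the
//   // functor or the policy it falls back to Kokkos::DefaultExecutionSpace.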
//----------------------------------------------------------------------------
/// \class ParallelFor
/// \brief Implementation of the ParallelFor operator that has a
/// partial specialization for the device.
///
/// This is an implementation detail of parallel_for. Users should
/// skip this and go directly to the nonmember function parallel_for.
template< class FunctorType , class ExecPolicy > class ParallelFor ;
/// \class ParallelReduce
/// \brief Implementation detail of parallel_reduce.
///
/// This is an implementation detail of parallel_reduce. Users should
/// skip this and go directly to the nonmember function parallel_reduce.
template< class FunctorType , class ExecPolicy > class ParallelReduce ;
/// \class ParallelScan
/// \brief Implementation detail of parallel_scan.
///
/// This is an implementation detail of parallel_scan. Users should
/// skip this and go directly to the documentation of the nonmember
/// template function Kokkos::parallel_scan.
template< class FunctorType , class ExecPolicy > class ParallelScan ;
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
/** \brief Execute \c functor in parallel according to the execution \c policy.
*
* A "functor" is a class containing the function to execute in parallel,
* data needed for that execution, and an optional \c execution_space
* typedef. Here is an example functor for parallel_for:
*
* \code
* class FunctorType {
* public:
* typedef ... execution_space ;
* void operator() ( WorkType iwork ) const ;
* };
* \endcode
*
 * In the above example, \c WorkType is any integer type for which a
 * valid conversion from \c size_t to \c WorkType exists.  Its
 * <tt>operator()</tt> method defines the operation to parallelize,
 * over the range of integer indices <tt>iwork=[0,work_count-1]</tt>.
 * Each call corresponds to a single iteration \c iwork of a \c for loop.
 * If \c execution_space is not defined, DefaultExecutionSpace will be used.
*/
template< class ExecPolicy , class FunctorType >
inline
void parallel_for( const ExecPolicy & policy
, const FunctorType & functor
, const std::string& str = ""
, typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
)
{
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelFor< FunctorType , ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID);
}
#endif
}
template< class FunctorType >
inline
void parallel_for( const size_t work_count
, const FunctorType & functor
, const std::string& str = ""
)
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > policy ;
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelFor< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID);
}
#endif
}
template< class ExecPolicy , class FunctorType >
inline
void parallel_for( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor )
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_for kernel: " << str << std::endl;
#endif
parallel_for(policy,functor,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_for kernel: " << str << std::endl;
#endif
(void) str;
}
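// A minimal dispatch sketch (illustrative; assumes a hypothetical
// functor 'Scale' with a void operator()(int) const, a view 'x' it
// captures, and a problem size 'n'):
//   Kokkos::parallel_for( Kokkos::RangePolicy<>( 0 , n ) , Scale( x ) );
//   Kokkos::parallel_for( n , Scale( x ) );  // equivalent shorthand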
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/** \brief Parallel reduction
*
* Example of a parallel_reduce functor for a POD (plain old data) value type:
* \code
* class FunctorType { // For POD value type
* public:
* typedef ... execution_space ;
* typedef <podType> value_type ;
* void operator()( <intType> iwork , <podType> & update ) const ;
* void init( <podType> & update ) const ;
* void join( volatile <podType> & update ,
* volatile const <podType> & input ) const ;
*
* typedef true_type has_final ;
* void final( <podType> & update ) const ;
* };
* \endcode
*
* Example of a parallel_reduce functor for an array of POD (plain old data) values:
* \code
* class FunctorType { // For array of POD value
* public:
* typedef ... execution_space ;
* typedef <podType> value_type[] ;
* void operator()( <intType> , <podType> update[] ) const ;
* void init( <podType> update[] ) const ;
* void join( volatile <podType> update[] ,
* volatile const <podType> input[] ) const ;
*
* typedef true_type has_final ;
* void final( <podType> update[] ) const ;
* };
* \endcode
*/
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const std::string& str = ""
, typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
)
{
// typedef typename
// Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
// execution_space ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view ;
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType , ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , result_view );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const std::string& str = ""
)
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view ;
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) , result_view );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
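// A minimal dispatch sketch (illustrative; 'SumFunctor' is a
// hypothetical functor with 'typedef double value_type ;' and an
// operator()( int , double & ) const that accumulates into its
// second argument):
//   double sum = 0 ;
//   Kokkos::parallel_reduce( n , SumFunctor( x ) , sum );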
// general policy and view output
template< class ExecPolicy , class FunctorType , class ViewType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<
( Impl::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
#endif
)>::type * = 0 )
{
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , Impl::CopyWithoutTracking::apply(result_view) );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// general policy and POD or array-of-POD output
template< class ExecPolicy , class FunctorType >
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
#ifdef KOKKOS_HAVE_CUDA
, typename Impl::enable_if<
( ! Impl::is_integral< ExecPolicy >::value &&
! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type>::type result_ref
, const std::string& str = ""
, typename Impl::enable_if<! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value >::type* = 0
)
#else
, typename Impl::enable_if<
( ! Impl::is_integral< ExecPolicy >::value)
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type
>::type result_ref
, const std::string& str = ""
)
#endif
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result_ref )
, ValueTraits::value_count( functor )
);
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , Impl::CopyWithoutTracking::apply(result_view) );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy and view output
template< class FunctorType , class ViewType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<( Impl::is_view<ViewType>::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value
#endif
)>::type * = 0 )
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > ExecPolicy ;
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , ExecPolicy(0,work_count) , Impl::CopyWithoutTracking::apply(result_view) );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy and POD or array-of-POD output
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, typename Kokkos::Impl::FunctorValueTraits<
typename Impl::if_c<Impl::is_execution_policy<FunctorType>::value ||
Impl::is_integral<FunctorType>::value,
void,FunctorType>::type
, void >::reference_type result
, const std::string& str = ""
, typename Impl::enable_if< true
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value
#endif
>::type * = 0 )
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ;
typedef typename
Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef Kokkos::RangePolicy< execution_space > policy ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result )
, ValueTraits::value_count( functor )
);
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) , Impl::CopyWithoutTracking::apply(result_view) );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType * result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType & result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
/// \fn parallel_scan
/// \tparam ExecutionPolicy The execution policy type.
/// \tparam FunctorType The scan functor type.
///
/// \param policy [in] The execution policy.
/// \param functor [in] The scan functor.
///
/// This function implements a parallel scan pattern. The scan can
/// be either inclusive or exclusive, depending on how you implement
/// the scan functor.
///
/// A scan functor looks almost exactly like a reduce functor, except
/// that its operator() takes a third \c bool argument, \c final_pass,
/// which indicates whether this is the last pass of the scan
/// operation. We will show below how to use the \c final_pass
/// argument to control whether the scan is inclusive or exclusive.
///
/// Here is the minimum required interface of a scan functor for a POD
/// (plain old data) value type \c PodType. That is, the result is a
/// View of zero or more PodType. It is also possible for the result
/// to be an array of (same-sized) arrays of PodType, but we do not
/// show the required interface for that here.
/// \code
/// template< class ExecPolicy , class FunctorType >
/// class ScanFunctor {
/// public:
///   // The Kokkos execution space type
/// typedef ... execution_space;
/// // Type of an entry of the array containing the result;
/// // also the type of each of the entries combined using
/// // operator() or join().
/// typedef PodType value_type;
///
/// void operator () (const ExecPolicy::member_type & i, value_type& update, const bool final_pass) const;
/// void init (value_type& update) const;
///   void join (volatile value_type& update, volatile const value_type& input) const;
/// };
/// \endcode
///
/// Here is an example of a functor which computes an inclusive plus-scan
/// of an array of \c int, in place. If given an array [1, 2, 3, 4], this
/// scan will overwrite that array with [1, 3, 6, 10].
///
/// \code
/// template<class SpaceType>
/// class InclScanFunctor {
/// public:
/// typedef SpaceType execution_space;
/// typedef int value_type;
/// typedef typename SpaceType::size_type size_type;
///
/// InclScanFunctor( Kokkos::View<value_type*, execution_space> x
/// , Kokkos::View<value_type*, execution_space> y ) : m_x(x), m_y(y) {}
///
/// void operator () (const size_type i, value_type& update, const bool final_pass) const {
/// update += m_x(i);
/// if (final_pass) {
/// m_y(i) = update;
/// }
/// }
/// void init (value_type& update) const {
/// update = 0;
/// }
/// void join (volatile value_type& update, volatile const value_type& input) const {
/// update += input;
/// }
///
/// private:
/// Kokkos::View<value_type*, execution_space> m_x;
/// Kokkos::View<value_type*, execution_space> m_y;
/// };
/// \endcode
///
/// Here is an example of a functor which computes an <i>exclusive</i>
/// scan of an array of \c int, in place.  In operator(), note that
/// the final_pass test and the update have switched places, and note
/// the use of a temporary.  If given an array [1, 2, 3, 4], this scan
/// will overwrite that array with [0, 1, 3, 6].
///
/// \code
/// template<class SpaceType>
/// class ExclScanFunctor {
/// public:
/// typedef SpaceType execution_space;
/// typedef int value_type;
/// typedef typename SpaceType::size_type size_type;
///
/// ExclScanFunctor (Kokkos::View<value_type*, execution_space> x) : x_ (x) {}
///
/// void operator () (const size_type i, value_type& update, const bool final_pass) const {
/// const value_type x_i = x_(i);
/// if (final_pass) {
/// x_(i) = update;
/// }
/// update += x_i;
/// }
/// void init (value_type& update) const {
/// update = 0;
/// }
/// void join (volatile value_type& update, volatile const value_type& input) const {
/// update += input;
/// }
///
/// private:
/// Kokkos::View<value_type*, execution_space> x_;
/// };
/// \endcode
///
/// Here is an example of a functor which builds on the above
/// exclusive scan example, to compute an offsets array from a
/// population count array, in place. We assume that the pop count
/// array has an extra entry at the end to store the final count. If
/// given an array [1, 2, 3, 4, 0], this scan will overwrite that
/// array with [0, 1, 3, 6, 10].
///
/// \code
/// template<class SpaceType>
/// class OffsetScanFunctor {
/// public:
/// typedef SpaceType execution_space;
/// typedef int value_type;
/// typedef typename SpaceType::size_type size_type;
///
///   // last_index_ is the last valid index (zero-based) of x.
///   // If x has length zero, then last_index_ won't be used anyway.
/// OffsetScanFunctor( Kokkos::View<value_type*, execution_space> x
/// , Kokkos::View<value_type*, execution_space> y )
/// : m_x(x), m_y(y), last_index_ (x.dimension_0 () == 0 ? 0 : x.dimension_0 () - 1)
/// {}
///
/// void operator () (const size_type i, int& update, const bool final_pass) const {
/// if (final_pass) {
/// m_y(i) = update;
/// }
/// update += m_x(i);
/// // The last entry of m_y gets the final sum.
/// if (final_pass && i == last_index_) {
/// m_y(i+1) = update;
/// }
/// }
/// void init (value_type& update) const {
/// update = 0;
/// }
/// void join (volatile value_type& update, volatile const value_type& input) const {
/// update += input;
/// }
///
/// private:
/// Kokkos::View<value_type*, execution_space> m_x;
/// Kokkos::View<value_type*, execution_space> m_y;
/// const size_type last_index_;
/// };
/// \endcode
///
template< class ExecutionPolicy , class FunctorType >
inline
void parallel_scan( const ExecutionPolicy & policy
, const FunctorType & functor
, const std::string& str = ""
, typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0
)
{
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Impl::ParallelScan< FunctorType , ExecutionPolicy > scan( Impl::CopyWithoutTracking::apply(functor) , policy );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelScan(kpID);
}
#endif
}
template< class FunctorType >
inline
void parallel_scan( const size_t work_count
, const FunctorType & functor
, const std::string& str = "" )
{
typedef typename
Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef Kokkos::RangePolicy< execution_space > policy ;
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelScan< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelScan(kpID);
}
#endif
}
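// A minimal dispatch sketch (illustrative), using the InclScanFunctor
// example documented above with views 'x' and 'y' of length n:
//   Kokkos::parallel_scan( n , InclScanFunctor< Kokkos::DefaultExecutionSpace >( x , y ) );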
template< class ExecutionPolicy , class FunctorType >
inline
void parallel_scan( const std::string& str
, const ExecutionPolicy & policy
, const FunctorType & functor)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl;
#endif
parallel_scan(policy,functor,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_scan kernel: " << str << std::endl;
#endif
(void) str;
}
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Enable = void >
struct FunctorTeamShmemSize
{
static inline size_t value( const FunctorType & , int ) { return 0 ; }
};
template< class FunctorType >
struct FunctorTeamShmemSize< FunctorType , typename Impl::enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type >
{
static inline size_t value( const FunctorType & f , int team_size ) { return f.team_shmem_size( team_size ) ; }
};
template< class FunctorType >
struct FunctorTeamShmemSize< FunctorType , typename Impl::enable_if< 0 < sizeof( & FunctorType::shmem_size ) >::type >
{
static inline size_t value( const FunctorType & f , int team_size ) { return f.shmem_size( team_size ) ; }
};
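// Illustrative sketch of how a functor opts in (hypothetical functor,
// not part of this header). Defining either hook routes through the
// matching specialization above; otherwise the request is 0 bytes:
//   struct MyTeamFunctor {
//     size_t team_shmem_size( int team_size ) const
//       { return team_size * sizeof(double) ; }
//   };
//   // FunctorTeamShmemSize< MyTeamFunctor >::value( f , team_size )
//   // == team_size * sizeof(double)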
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* KOKKOS_PARALLEL_HPP */

View File

@ -0,0 +1,165 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_QTHREAD_HPP
#define KOKKOS_QTHREAD_HPP
#include <cstddef>
#include <iosfwd>
#include <Kokkos_Core.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <impl/Kokkos_Tags.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class QthreadExec ;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Execution space supported by Qthread */
class Qthread {
public:
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as an execution space
typedef Qthread execution_space ;
typedef Kokkos::HostSpace memory_space ;
//! This execution space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef Kokkos::LayoutRight array_layout ;
typedef memory_space::size_type size_type ;
typedef ScratchMemorySpace< Qthread > scratch_memory_space ;
//@}
/*------------------------------------------------------------------------*/
/** \brief Initialization will construct one or more instances */
static Qthread & instance( int = 0 );
/** \brief Set the execution space to a "sleep" state.
*
 * This function puts the execution space into a "sleep" state in which
 * it is not ready for work.  This may consume fewer resources than the
 * "ready" state, but it may also take time to transition back to "ready".
*
* \return True if enters or is in the "sleep" state.
* False if functions are currently executing.
*/
bool sleep();
/** \brief Wake from the sleep state.
*
* \return True if enters or is in the "ready" state.
* False if functions are currently executing.
*/
static bool wake();
/** \brief Wait until all dispatched functors complete.
*
* The parallel_for or parallel_reduce dispatch of a functor may
* return asynchronously, before the functor completes. This
* method does not return until all dispatched functors on this
* device have completed.
*/
static void fence();
/*------------------------------------------------------------------------*/
static void initialize( int thread_count );
static void finalize();
/** \brief Print configuration information to the given output stream. */
static void print_configuration( std::ostream & , const bool detail = false );
int shepherd_size() const ;
int shepherd_worker_size() const ;
};
/*--------------------------------------------------------------------------*/
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::Qthread::memory_space
, Kokkos::Qthread::scratch_memory_space
>
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
#include <Kokkos_Parallel.hpp>
#include <Qthread/Kokkos_QthreadExec.hpp>
#include <Qthread/Kokkos_Qthread_Parallel.hpp>
#endif /* #ifndef KOKKOS_QTHREAD_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -0,0 +1,125 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_SCRATCHSPACE_HPP
#define KOKKOS_SCRATCHSPACE_HPP
#include <stdio.h>
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Tags.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Scratch memory space associated with an execution space.
*
*/
template< class ExecSpace >
class ScratchMemorySpace {
public:
  // Alignment of memory chunks returned by 'get_shmem';
  // must be a power of two
enum { ALIGN = 8 };
private:
mutable char * m_iter ;
char * m_end ;
ScratchMemorySpace();
ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
enum { MASK = ALIGN - 1 }; // Alignment used by View::shmem_size
public:
//! Tag this class as a memory space
typedef ScratchMemorySpace memory_space ;
typedef ExecSpace execution_space ;
//! This execution space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef typename ExecSpace::array_layout array_layout ;
typedef typename ExecSpace::size_type size_type ;
template< typename IntType >
KOKKOS_INLINE_FUNCTION static
IntType align( const IntType & size )
{ return ( size + MASK ) & ~MASK ; }
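  // Worked example: with ALIGN = 8, align(13) == 16 and align(16) == 16.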
template< typename IntType >
KOKKOS_INLINE_FUNCTION
void* get_shmem (const IntType& size) const {
void* tmp = m_iter ;
if (m_end < (m_iter += align (size))) {
m_iter -= align (size); // put it back like it was
#ifdef KOKKOS_HAVE_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end-m_iter));
#endif // KOKKOS_HAVE_DEBUG
tmp = 0;
}
return tmp;
}
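  // Typical use inside a team kernel (illustrative sketch; 'member' is
  // a hypothetical team-member handle whose team_shmem() returns this
  // scratch space):
  //   double * buf = (double *) member.team_shmem().get_shmem( n * sizeof(double) );
  //   if ( buf == 0 ) { /* request exceeded the remaining capacity */ }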
template< typename IntType >
KOKKOS_INLINE_FUNCTION
ScratchMemorySpace( void * ptr , const IntType & size )
: m_iter( (char *) ptr )
, m_end( m_iter + size )
{}
};
} // namespace Kokkos
#endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -0,0 +1,892 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Serial.hpp
/// \brief Declaration and definition of Kokkos::Serial device.
#ifndef KOKKOS_SERIAL_HPP
#define KOKKOS_SERIAL_HPP
#include <cstddef>
#include <iosfwd>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#if defined( KOKKOS_HAVE_SERIAL )
namespace Kokkos {
/// \class Serial
/// \brief Kokkos device for non-parallel execution
///
/// A "device" represents a parallel execution model. It tells Kokkos
/// how to parallelize the execution of kernels in a parallel_for or
/// parallel_reduce. For example, the Threads device uses Pthreads or
/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language
/// extensions, and the Cuda device uses NVIDIA's CUDA programming
/// model. The Serial device executes "parallel" kernels
/// sequentially. This is useful if you really do not want to use
/// threads, or if you want to explore different combinations of MPI
/// and shared-memory parallel programming models.
class Serial {
public:
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as an execution space:
typedef Serial execution_space ;
//! The size_type typedef best suited for this device.
typedef HostSpace::size_type size_type ;
//! This device's preferred memory space.
typedef HostSpace memory_space ;
//! This execution space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
//! This device's preferred array layout.
typedef LayoutRight array_layout ;
/// \brief Scratch memory space
typedef ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
//@}
/// \brief True if and only if this method is being called in a
/// thread-parallel function.
///
/// For the Serial device, this method <i>always</i> returns false,
/// because parallel_for or parallel_reduce with the Serial device
/// always execute sequentially.
inline static int in_parallel() { return false ; }
/** \brief Set the device in a "sleep" state.
*
* This function sets the device in a "sleep" state in which it is
* not ready for work. This may consume less resources than if the
* device were in an "awake" state, but it may also take time to
* bring the device from a sleep state to be ready for work.
*
* \return True if the device is in the "sleep" state, else false if
* the device is actively working and could not enter the "sleep"
* state.
*/
static bool sleep();
/// \brief Wake the device from the 'sleep' state so it is ready for work.
///
/// \return True if the device is in the "ready" state, else "false"
/// if the device is actively working (which also means that it's
/// awake).
static bool wake();
/// \brief Wait until all dispatched functors complete.
///
/// The parallel_for or parallel_reduce dispatch of a functor may
/// return asynchronously, before the functor completes. This
/// method does not return until all dispatched functors on this
/// device have completed.
static void fence() {}
static void initialize( unsigned threads_count = 1 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 ,
bool allow_asynchronous_threadpool = false) {
(void) threads_count;
(void) use_numa_count;
(void) use_cores_per_numa;
(void) allow_asynchronous_threadpool;
// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
}
static int is_initialized() { return 1 ; }
//! Free any resources being consumed by the device.
static void finalize() {}
//! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false ) {}
//--------------------------------------------------------------------------
inline static int thread_pool_size( int = 0 ) { return 1 ; }
KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
//--------------------------------------------------------------------------
static void * scratch_memory_resize( unsigned reduce_size , unsigned shared_size );
//--------------------------------------------------------------------------
};
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::Serial::memory_space
, Kokkos::Serial::scratch_memory_space
>
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
};
namespace SerialImpl {
struct Sentinel {
void * m_scratch ;
unsigned m_reduce_end ;
unsigned m_shared_end ;
Sentinel();
~Sentinel();
static Sentinel & singleton();
};
inline
unsigned align( unsigned n );
}
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class SerialTeamMember {
private:
typedef Kokkos::ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
const scratch_memory_space m_space ;
const int m_league_rank ;
const int m_league_size ;
SerialTeamMember & operator = ( const SerialTeamMember & );
public:
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_shmem() const { return m_space ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast(const ValueType& , const int& ) const {}
template< class ValueType, class JoinOp >
KOKKOS_INLINE_FUNCTION
ValueType team_reduce( const ValueType & value , const JoinOp & ) const
{
return value ;
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
{
const Type tmp = global_accum ? *global_accum : Type(0) ;
if ( global_accum ) { *global_accum += value ; }
return tmp ;
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & ) const
{ return Type(0); }
//----------------------------------------
// Execution space specific:
SerialTeamMember( int arg_league_rank
, int arg_league_size
, int arg_shared_size
);
};
} // namespace Impl
/*
* < Kokkos::Serial , WorkArgTag >
* < WorkArgTag , Impl::enable_if< Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type >
*
*/
template< class Arg0 , class Arg1 >
class TeamPolicy< Arg0 , Arg1 , Kokkos::Serial >
{
private:
const int m_league_size ;
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicy execution_policy ;
//! Execution space of this execution policy:
typedef Kokkos::Serial execution_space ;
typedef typename
Impl::if_c< ! Impl::is_same< Kokkos::Serial , Arg0 >::value , Arg0 , Arg1 >::type
work_tag ;
//----------------------------------------
template< class FunctorType >
static
int team_size_max( const FunctorType & ) { return 1 ; }
template< class FunctorType >
static
int team_size_recommended( const FunctorType & ) { return 1 ; }
template< class FunctorType >
static
int team_size_recommended( const FunctorType & , const int& ) { return 1 ; }
//----------------------------------------
inline int team_size() const { return 1 ; }
inline int league_size() const { return m_league_size ; }
/** \brief Specify league size, request team size */
TeamPolicy( execution_space & , int league_size_request , int /* team_size_request */ , int vector_length_request = 1 )
: m_league_size( league_size_request )
{ (void) vector_length_request; }
TeamPolicy( int league_size_request , int /* team_size_request */ , int vector_length_request = 1 )
: m_league_size( league_size_request )
{ (void) vector_length_request; }
typedef Impl::SerialTeamMember member_type ;
};
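// A minimal dispatch sketch (illustrative; 'MyTeamFunctor' is a
// hypothetical functor whose operator() takes a member_type):
//   Kokkos::TeamPolicy< Kokkos::Serial > policy( league_size , 1 );
//   Kokkos::parallel_for( policy , MyTeamFunctor() );
// On Serial every team has exactly one member, so the requested team
// size is ignored and team_size() is always 1.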
} /* namespace Kokkos */
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
public:
// work tag is void
template< class PType >
inline
ParallelFor( typename Impl::enable_if<
( Impl::is_same< PType , Policy >::value &&
Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy )
{
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( i );
}
}
// work tag is non-void
template< class PType >
inline
ParallelFor( typename Impl::enable_if<
( Impl::is_same< PType , Policy >::value &&
! Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy )
{
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i );
}
}
};
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
{
public:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
// Work tag is void
template< class ViewType , class PType >
ParallelReduce( typename Impl::enable_if<
( Impl::is_view< ViewType >::value &&
Impl::is_same< typename ViewType::memory_space , HostSpace >::value &&
Impl::is_same< PType , Policy >::value &&
Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy
, const ViewType & result
)
{
pointer_type result_ptr = result.ptr_on_device();
if ( ! result_ptr ) {
result_ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
}
reference_type update = ValueInit::init( functor , result_ptr );
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( i , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , result_ptr );
}
// Work tag is non-void
template< class ViewType , class PType >
ParallelReduce( typename Impl::enable_if<
( Impl::is_view< ViewType >::value &&
Impl::is_same< typename ViewType::memory_space , HostSpace >::value &&
Impl::is_same< PType , Policy >::value &&
! Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy
, const ViewType & result
)
{
pointer_type result_ptr = result.ptr_on_device();
if ( ! result_ptr ) {
result_ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
}
typename ValueTraits::reference_type update = ValueInit::init( functor , result_ptr );
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , result_ptr );
}
};
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
// work tag is void
template< class PType >
inline
ParallelScan( typename Impl::enable_if<
( Impl::is_same< PType , Policy >::value &&
Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy )
{
pointer_type result_ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
reference_type update = ValueInit::init( functor , result_ptr );
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( i , update , true );
}
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
}
// work tag is non-void
template< class PType >
inline
ParallelScan( typename Impl::enable_if<
( Impl::is_same< PType , Policy >::value &&
! Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy )
{
pointer_type result_ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
reference_type update = ValueInit::init( functor , result_ptr );
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update , true );
}
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
}
};
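/* Usage sketch (hypothetical names): a scan functor for the loops above.
 * The third argument is the 'final' flag, which this serial
 * specialization always passes as 'true'.
 *
 * \code
 * struct PrefixSum {
 *   typedef int value_type ;
 *   KOKKOS_INLINE_FUNCTION
 *   void operator()( const int i , int & update , const bool final ) const
 *   {
 *     // When final == true , 'update' holds the prefix sum at index i.
 *     update += i ;
 *   }
 * };
 * \endcode
 */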
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > >
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member )
{ functor( member ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member )
{ functor( TagType() , member ); }
public:
ParallelFor( const FunctorType & functor
, const Policy & policy )
{
const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
Kokkos::Serial::scratch_memory_resize( 0 , shared_size );
for ( int ileague = 0 ; ileague < policy.league_size() ; ++ileague ) {
ParallelFor::template driver< typename Policy::work_tag >
( functor , typename Policy::member_type(ileague,policy.league_size(),shared_size) );
// functor( typename Policy::member_type(ileague,policy.league_size(),shared_size) );
}
}
};
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > >
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
private:
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member
, reference_type update )
{ functor( member , update ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member
, reference_type update )
{ functor( TagType() , member , update ); }
public:
template< class ViewType >
ParallelReduce( const FunctorType & functor
, const Policy & policy
, const ViewType & result
)
{
const int reduce_size = ValueTraits::value_size( functor );
const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
void * const scratch_reduce = Kokkos::Serial::scratch_memory_resize( reduce_size , shared_size );
const pointer_type result_ptr =
result.ptr_on_device() ? result.ptr_on_device()
: (pointer_type) scratch_reduce ;
reference_type update = ValueInit::init( functor , result_ptr );
for ( int ileague = 0 ; ileague < policy.league_size() ; ++ileague ) {
ParallelReduce::template driver< typename Policy::work_tag >
( functor , typename Policy::member_type(ileague,policy.league_size(),shared_size) , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
}
};
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType,SerialTeamMember> {
typedef iType index_type;
const iType begin ;
const iType end ;
enum {increment = 1};
const SerialTeamMember& thread;
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_count)
: begin(0)
, end(arg_count)
, thread(arg_thread)
{}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_begin, const iType & arg_end )
: begin( arg_begin )
, end( arg_end)
, thread( arg_thread )
{}
};
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType,SerialTeamMember> {
typedef iType index_type;
enum {start = 0};
const iType end;
enum {increment = 1};
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const SerialTeamMember& thread, const iType& count):
end( count )
{}
};
} // namespace Impl
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & begin , const iType & end )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>(thread,begin,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >
ThreadVectorRange(const Impl::SerialTeamMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::SerialTeamMember> PerTeam(const Impl::SerialTeamMember& thread) {
return Impl::ThreadSingleStruct<Impl::SerialTeamMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::SerialTeamMember> PerThread(const Impl::SerialTeamMember& thread) {
return Impl::VectorSingleStruct<Impl::SerialTeamMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
 * The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
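/* Usage sketch: inside a team functor, with 'member' the team handle and
 * 'data' / 'n' hypothetical:
 *
 * \code
 * Kokkos::parallel_for( Kokkos::TeamThreadRange( member , n ) ,
 *   [&]( const int i ) { data[i] = 2 * i ; } );
 * \endcode
 */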
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
 * val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
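/* Usage sketch ('member' , 'data' , 'n' hypothetical): a team-wide sum
 * using the overload above.
 *
 * \code
 * double team_sum = 0 ;
 * Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
 *   [&]( const int i , double & update ) { update += data[i] ; } , team_sum );
 * \endcode
 */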
#ifdef KOKKOS_HAVE_CXX11
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
 * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
 * The input value of init_result is used as the initializer for temporary variables of ValueType. Therefore
 * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
 * '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
}
#endif // KOKKOS_HAVE_CXX11
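/* Usage sketch for the join variant above: a max-reduction, where
 * init_result must be the identity of the join operation (here a very
 * small value; 'member' , 'data' , 'n' hypothetical).
 *
 * \code
 * double team_max = -1.0e300 ;
 * Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
 *   [&]( const int i , double & val ) { if ( val < data[i] ) val = data[i] ; } ,
 *   []( double & dst , const double & src ) { if ( dst < src ) dst = src ; } ,
 *   team_max );
 * \endcode
 */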
} //namespace Kokkos
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = result;
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
 * (i.e. team_size==1) the operator is only called once with final==true. scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,scan_val,true);
}
}
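/* Usage sketch: an exclusive prefix sum over the vector range
 * ('member' , 'n' , 'in' , 'out' hypothetical):
 *
 * \code
 * Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member , n ) ,
 *   [&]( const int i , float & val , const bool final ) {
 *     if ( final ) { out[i] = val ; }  // exclusive prefix at i
 *     val += in[i] ;
 *   } );
 * \endcode
 */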
} // namespace Kokkos
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
lambda(val);
}
}
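/* Usage sketch: run a lambda once per team or once per thread, optionally
 * broadcasting a value ('member' , 'counter' , 'val' hypothetical). In this
 * serial specialization every overload simply invokes the lambda.
 *
 * \code
 * Kokkos::single( Kokkos::PerTeam( member ) , [&]() { ++counter ; } );
 * Kokkos::single( Kokkos::PerThread( member ) , [&]( double & v ) { v = 42.0 ; } , val );
 * \endcode
 */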
#endif // defined( KOKKOS_HAVE_SERIAL )
#endif /* #define KOKKOS_SERIAL_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -0,0 +1,376 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#ifndef KOKKOS_TASKPOLICY_HPP
#define KOKKOS_TASKPOLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
struct FutureValueTypeIsVoidError {};
template < class ExecSpace , class ResultType , class FunctorType >
class TaskMember ;
template< class ExecPolicy , class ResultType , class FunctorType >
class TaskForEach ;
template< class ExecPolicy , class ResultType , class FunctorType >
class TaskReduce ;
template< class ExecPolicy , class ResultType , class FunctorType >
struct TaskScan ;
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
/**\brief States of a task */
enum TaskState
{ TASK_STATE_NULL = 0 ///< Does not exist
, TASK_STATE_CONSTRUCTING = 1 ///< Is under construction
, TASK_STATE_WAITING = 2 ///< Is waiting for execution
, TASK_STATE_EXECUTING = 4 ///< Is executing
, TASK_STATE_COMPLETE = 8 ///< Execution is complete
};
/**
*
* Future< space > // value_type == void
* Future< value > // space == Default
* Future< value , space >
*
*/
template< class Arg1 = void , class Arg2 = void >
class Future {
private:
template< class , class , class > friend class Impl::TaskMember ;
template< class > friend class TaskPolicy ;
template< class , class > friend class Future ;
// Argument #2, if not void, must be the space.
enum { Arg1_is_space = Kokkos::Impl::is_execution_space< Arg1 >::value };
enum { Arg2_is_space = Kokkos::Impl::is_execution_space< Arg2 >::value };
enum { Arg2_is_void = Kokkos::Impl::is_same< Arg2 , void >::value };
struct ErrorNoExecutionSpace {};
enum { Opt1 = Arg1_is_space && Arg2_is_void
, Opt2 = ! Arg1_is_space && Arg2_is_void
, Opt3 = ! Arg1_is_space && Arg2_is_space
, OptOK = Kokkos::Impl::StaticAssert< Opt1 || Opt2 || Opt3 , ErrorNoExecutionSpace >::value
};
typedef typename
Kokkos::Impl::if_c< Opt2 || Opt3 , Arg1 , void >::type
ValueType ;
typedef typename
Kokkos::Impl::if_c< Opt1 , Arg1 , typename
Kokkos::Impl::if_c< Opt2 , Kokkos::DefaultExecutionSpace , typename
Kokkos::Impl::if_c< Opt3 , Arg2 , void
>::type >::type >::type
ExecutionSpace ;
typedef Impl::TaskMember< ExecutionSpace , void , void > TaskRoot ;
typedef Impl::TaskMember< ExecutionSpace , ValueType , void > TaskValue ;
TaskRoot * m_task ;
public:
typedef ValueType value_type;
typedef ExecutionSpace execution_space ;
//----------------------------------------
KOKKOS_INLINE_FUNCTION
TaskState get_task_state() const
{ return 0 != m_task ? m_task->get_state() : TASK_STATE_NULL ; }
//----------------------------------------
explicit
Future( TaskRoot * task )
: m_task(0)
{ TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( task ) ); }
//----------------------------------------
KOKKOS_INLINE_FUNCTION
~Future() { TaskRoot::assign( & m_task , 0 ); }
//----------------------------------------
KOKKOS_INLINE_FUNCTION
Future() : m_task(0) {}
KOKKOS_INLINE_FUNCTION
Future( const Future & rhs )
: m_task(0)
{ TaskRoot::assign( & m_task , rhs.m_task ); }
KOKKOS_INLINE_FUNCTION
Future & operator = ( const Future & rhs )
{ TaskRoot::assign( & m_task , rhs.m_task ); return *this ; }
//----------------------------------------
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future( const Future<A1,A2> & rhs )
: m_task(0)
{ TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); }
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future & operator = ( const Future<A1,A2> & rhs )
{ TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); return *this ; }
//----------------------------------------
typedef typename TaskValue::get_result_type get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const
{ return static_cast<TaskValue*>( m_task )->get(); }
};
namespace Impl {
template< class T >
struct is_future : public Kokkos::Impl::bool_< false > {};
template< class Arg0 , class Arg1 >
struct is_future< Kokkos::Experimental::Future<Arg0,Arg1> > : public Kokkos::Impl::bool_< true > {};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
/** \brief If the template argument is an execution space then create() produces a serial task in that space */
template< class Arg0 = Kokkos::DefaultExecutionSpace >
class TaskPolicy {
public:
typedef typename Arg0::execution_space execution_space ;
//----------------------------------------
/** \brief Create a serial task with storage for dependences.
*
* Postcondition: Task is in the 'constructing' state.
*/
template< class FunctorType >
Future< typename FunctorType::value_type , execution_space >
create( const FunctorType & functor
, const unsigned dependence_capacity /* = default */ ) const ;
/** \brief Create a foreach task with storage for dependences. */
template< class ExecPolicy , class FunctorType >
Future< typename FunctorType::value_type , execution_space >
create_foreach( const ExecPolicy & policy
, const FunctorType & functor
, const unsigned dependence_capacity /* = default */ ) const ;
/** \brief Create a reduce task with storage for dependences. */
template< class ExecPolicy , class FunctorType >
Future< typename FunctorType::value_type , execution_space >
create_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const unsigned dependence_capacity /* = default */ ) const ;
/** \brief Create a scan task with storage for dependences. */
template< class ExecPolicy , class FunctorType >
Future< typename FunctorType::value_type , execution_space >
create_scan( const ExecPolicy & policy
, const FunctorType & functor
, const unsigned dependence_capacity /* = default */ ) const ;
/** \brief Set dependence that 'after' cannot start execution
* until 'before' has completed.
*
 * Precondition: The 'after' task must be in the 'constructing' state.
*/
template< class TA , class TB >
void set_dependence( const Future<TA,execution_space> & after
, const Future<TB,execution_space> & before ) const ;
/** \brief Spawn a task in the 'Constructing' state
*
* Precondition: Task is in the 'constructing' state.
* Postcondition: Task is waiting, executing, or complete.
*/
template< class T >
const Future<T,execution_space> &
spawn( const Future<T,execution_space> & ) const ;
//----------------------------------------
/** \brief Query dependence of an executing task */
template< class FunctorType >
Future< execution_space >
get_dependence( FunctorType * , const int ) const ;
//----------------------------------------
/** \brief Clear current dependences of an executing task
* in preparation for setting new dependences and
* respawning.
*
* Precondition: The functor must be a task in the executing state.
*/
template< class FunctorType >
void clear_dependence( FunctorType * ) const ;
/** \brief Set dependence that 'after' cannot start execution
* until 'before' has completed.
*
* The 'after' functor must be in the executing state
*/
template< class FunctorType , class TB >
void set_dependence( FunctorType * after
, const Future<TB,execution_space> & before ) const ;
/** \brief Respawn (reschedule) an executing task to be called again
* after all dependences have completed.
*/
template< class FunctorType >
void respawn( FunctorType * ) const ;
};
//----------------------------------------------------------------------------
/** \brief Create and spawn a single-thread task */
template< class ExecSpace , class FunctorType >
inline
Future< typename FunctorType::value_type , ExecSpace >
spawn( TaskPolicy<ExecSpace> & policy , const FunctorType & functor )
{ return policy.spawn( policy.create( functor ) ); }
/** \brief Create and spawn a single-thread task with dependences */
template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 >
inline
Future< typename FunctorType::value_type , ExecSpace >
spawn( TaskPolicy<ExecSpace> & policy
, const FunctorType & functor
, const Future<Arg0,Arg1> & before_0
, const Future<Arg0,Arg1> & before_1 )
{
Future< typename FunctorType::value_type , ExecSpace > f ;
f = policy.create( functor , 2 );
policy.add_dependence( f , before_0 );
policy.add_dependence( f , before_1 );
policy.spawn( f );
return f ;
}
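/* Usage sketch of the free functions above ('MyTask' is a hypothetical
 * functor whose 'value_type' is double; wait() is declared at the end of
 * this header):
 *
 * \code
 * TaskPolicy< Kokkos::Serial > policy ;
 * Future< double , Kokkos::Serial > f = spawn( policy , MyTask() );
 * wait( policy );
 * const double v = f.get();
 * \endcode
 */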
//----------------------------------------------------------------------------
/** \brief Create and spawn a parallel_for task */
template< class ExecSpace , class ParallelPolicyType , class FunctorType >
inline
Future< typename FunctorType::value_type , ExecSpace >
spawn_foreach( TaskPolicy<ExecSpace> & task_policy
, const ParallelPolicyType & parallel_policy
, const FunctorType & functor )
{ return task_policy.spawn( task_policy.create_foreach( parallel_policy , functor ) ); }
/** \brief Create and spawn a parallel_reduce task */
template< class ExecSpace , class ParallelPolicyType , class FunctorType >
inline
Future< typename FunctorType::value_type , ExecSpace >
spawn_reduce( TaskPolicy<ExecSpace> & task_policy
, const ParallelPolicyType & parallel_policy
, const FunctorType & functor )
{ return task_policy.spawn( task_policy.create_reduce( parallel_policy , functor ) ); }
//----------------------------------------------------------------------------
/** \brief Respawn a task functor with dependences */
template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 >
inline
void respawn( TaskPolicy<ExecSpace> & policy
, FunctorType * functor
, const Future<Arg0,Arg1> & before_0
, const Future<Arg0,Arg1> & before_1
)
{
policy.clear_dependence( functor );
policy.add_dependence( functor , before_0 );
policy.add_dependence( functor , before_1 );
policy.respawn( functor );
}
//----------------------------------------------------------------------------
template< class ExecSpace >
void wait( TaskPolicy< ExecSpace > & );
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_TASKPOLICY_HPP */
@ -0,0 +1,217 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_THREADS_HPP
#define KOKKOS_THREADS_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_PTHREAD )
#include <cstddef>
#include <iosfwd>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class ThreadsExec ;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Execution space for a pool of Pthreads or C11 threads on a CPU. */
class Threads {
public:
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as a kokkos execution space
typedef Threads execution_space ;
typedef Kokkos::HostSpace memory_space ;
//! This execution space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef Kokkos::LayoutRight array_layout ;
typedef memory_space::size_type size_type ;
typedef ScratchMemorySpace< Threads > scratch_memory_space ;
//@}
/*------------------------------------------------------------------------*/
//! \name Static functions that all Kokkos devices must implement.
//@{
/// \brief True if and only if this method is being called in a
/// thread-parallel function.
static int in_parallel();
/** \brief Set the device in a "sleep" state.
*
* This function sets the device in a "sleep" state in which it is
* not ready for work. This may consume less resources than if the
* device were in an "awake" state, but it may also take time to
* bring the device from a sleep state to be ready for work.
*
* \return True if the device is in the "sleep" state, else false if
* the device is actively working and could not enter the "sleep"
* state.
*/
static bool sleep();
/// \brief Wake the device from the 'sleep' state so it is ready for work.
///
/// \return True if the device is in the "ready" state, else "false"
/// if the device is actively working (which also means that it's
/// awake).
static bool wake();
/// \brief Wait until all dispatched functors complete.
///
/// The parallel_for or parallel_reduce dispatch of a functor may
/// return asynchronously, before the functor completes. This
/// method does not return until all dispatched functors on this
/// device have completed.
static void fence();
/// \brief Free any resources being consumed by the device.
///
/// For the Threads device, this terminates spawned worker threads.
static void finalize();
/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
//@}
/*------------------------------------------------------------------------*/
/*------------------------------------------------------------------------*/
//! \name Space-specific functions
//@{
/** \brief Initialize the device in the "ready to work" state.
*
* The device is initialized in a "ready to work" or "awake" state.
* This state reduces latency and thus improves performance when
* dispatching work. However, the "awake" state consumes resources
* even when no work is being done. You may call sleep() to put
* the device in a "sleeping" state that does not consume as many
* resources, but it will take time (latency) to awaken the device
 * again (via the wake() method) so that it is ready for work.
*
 * Teams of threads are distributed as evenly as possible across
 * the requested number of NUMA regions and cores per NUMA region.
 * A team will not be split across NUMA regions.
*
 * If the 'use_' arguments are not supplied then hwloc is queried
 * and all available cores are used.
*/
static void initialize( unsigned threads_count = 0 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 ,
bool allow_asynchronous_threadpool = false );
static int is_initialized();
static Threads & instance( int = 0 );
//----------------------------------------
static int thread_pool_size( int depth = 0 );
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
static int thread_pool_rank();
#else
KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
#endif
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
//@}
//----------------------------------------
};
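/* Usage sketch: bring the thread pool up before dispatching work and tear
 * it down at exit (the thread count here is hypothetical):
 *
 * \code
 * Kokkos::Threads::initialize( 16 );
 * // ... parallel_for / parallel_reduce dispatches on Kokkos::Threads ...
 * Kokkos::Threads::finalize();
 * \endcode
 */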
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::Threads::memory_space
, Kokkos::Threads::scratch_memory_space
>
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>
#include <Threads/Kokkos_ThreadsExec.hpp>
#include <Threads/Kokkos_ThreadsTeam.hpp>
#include <Threads/Kokkos_Threads_Parallel.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
#endif /* #define KOKKOS_THREADS_HPP */
@ -0,0 +1,53 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Vectorization.hpp
/// \brief Declaration and definition of Kokkos::Vectorization interface.
#ifndef KOKKOS_VECTORIZATION_HPP
#define KOKKOS_VECTORIZATION_HPP
#if defined( KOKKOS_HAVE_CUDA )
#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
#endif
#endif
File diff suppressed because it is too large
@ -0,0 +1,140 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HWLOC_HPP
#define KOKKOS_HWLOC_HPP
#include <utility>
namespace Kokkos {
/** \brief Minimal subset of logical 'hwloc' functionality available
* from http://www.open-mpi.org/projects/hwloc/.
*
* The calls are NOT thread safe in order to avoid mutexes,
* memory allocations, or other actions which could give the
* runtime system an opportunity to migrate the threads or
* touch allocated memory during the function calls.
*
* All calls to these functions should be performed by a thread
* when it has guaranteed exclusive access; e.g., for OpenMP
* within a 'critical' region.
*/
namespace hwloc {
/** \brief Query if hwloc is available */
bool available();
/** \brief Query number of available NUMA regions.
* This will be less than the hardware capacity
* if the MPI process is pinned to a NUMA region.
*/
unsigned get_available_numa_count();
/** \brief Query number of available cores per NUMA region.
* This will be less than the hardware capacity
* if the MPI process is pinned to a set of cores.
*/
unsigned get_available_cores_per_numa();
/** \brief Query number of available "hard" threads per core; i.e., hyperthreads */
unsigned get_available_threads_per_core();
} /* namespace hwloc */
} /* namespace Kokkos */
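/* Usage sketch: size a thread pool from the queries above.
 *
 * \code
 * unsigned nthreads = 1 ;
 * if ( Kokkos::hwloc::available() ) {
 *   nthreads = Kokkos::hwloc::get_available_numa_count()
 *            * Kokkos::hwloc::get_available_cores_per_numa()
 *            * Kokkos::hwloc::get_available_threads_per_core();
 * }
 * \endcode
 */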
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Internal functions for binding persistent spawned threads.
namespace Kokkos {
namespace hwloc {
/** \brief Recommend mapping of threads onto cores.
*
* If thread_count == 0 then choose and set a value.
* If use_numa_count == 0 then choose and set a value.
* If use_cores_per_numa == 0 then choose and set a value.
*
 * Returns 0 if asynchronous.
 * Returns 1 if synchronous, in which case threads_coord[0] is the process core.
*/
unsigned thread_mapping( const char * const label ,
const bool allow_async ,
unsigned & thread_count ,
unsigned & use_numa_count ,
unsigned & use_cores_per_numa ,
std::pair<unsigned,unsigned> threads_coord[] );
/** \brief Query core-coordinate of the current thread
* with respect to the core_topology.
*
* As long as the thread is running within the
 * process binding, the following conditions hold:
*
* core_coordinate.first < core_topology.first
* core_coordinate.second < core_topology.second
*/
std::pair<unsigned,unsigned> get_this_thread_coordinate();
/** \brief Bind the current thread to a core. */
bool bind_this_thread( const std::pair<unsigned,unsigned> );
/** \brief Bind the current thread to one of the cores in the list.
* Set that entry to (~0,~0) and return the index.
* If binding fails return ~0.
*/
unsigned bind_this_thread( const unsigned coordinate_count ,
std::pair<unsigned,unsigned> coordinate[] );
/** \brief Unbind the current thread back to the original process binding */
bool unbind_this_thread();
} /* namespace hwloc */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_HWLOC_HPP */
lib/kokkos/core/src/Makefile Executable file
@ -0,0 +1,118 @@
KOKKOS_PATH = ../..
PREFIX ?= /usr/local/lib/kokkos
default: messages build-lib
echo "End Build"
include $(KOKKOS_PATH)/Makefile.kokkos
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = nvcc_wrapper
CXXFLAGS ?= -O3
LINK = nvcc_wrapper
LINKFLAGS ?=
else
CXX ?= g++
CXXFLAGS ?= -O3
LINK ?= g++
LINKFLAGS ?=
endif
PWD = $(shell pwd)
KOKKOS_HEADERS_INCLUDE = $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
KOKKOS_HEADERS_INCLUDE_IMPL = $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
KOKKOS_HEADERS_INCLUDE_IMPL += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp)
KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)
CONDITIONAL_COPIES =
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
KOKKOS_HEADERS_CUDA += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
CONDITIONAL_COPIES += copy-cuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
KOKKOS_HEADERS_THREADS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
CONDITIONAL_COPIES += copy-threads
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
KOKKOS_HEADERS_OPENMP += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
CONDITIONAL_COPIES += copy-openmp
endif
messages:
echo "Start Build"
build-makefile-kokkos:
rm -f Makefile.kokkos
echo "#Global Settings used to generate this library" >> Makefile.kokkos
echo "KOKKOS_PATH = $(PREFIX)" >> Makefile.kokkos
echo "KOKKOS_DEVICES = $(KOKKOS_DEVICES)" >> Makefile.kokkos
echo "KOKKOS_ARCH = $(KOKKOS_ARCH)" >> Makefile.kokkos
echo "KOKKOS_DEBUG = $(KOKKOS_DEBUG)" >> Makefile.kokkos
echo "KOKKOS_USE_TPLS = $(KOKKOS_USE_TPLS)" >> Makefile.kokkos
echo "KOKKOS_CXX_STANDARD = $(KOKKOS_CXX_STANDARD)" >> Makefile.kokkos
echo "KOKKOS_CUDA_OPTIONS = $(KOKKOS_CUDA_OPTIONS)" >> Makefile.kokkos
echo "CXX ?= $(CXX)" >> Makefile.kokkos
echo "" >> Makefile.kokkos
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> Makefile.kokkos
echo "KOKKOS_HEADERS = $(KOKKOS_HEADERS)" >> Makefile.kokkos
echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
echo "" >> Makefile.kokkos
echo "#Variables used in application Makefiles" >> Makefile.kokkos
echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos
echo "KOKKOS_LINK_DEPENDS = $(KOKKOS_LINK_DEPENDS)" >> Makefile.kokkos
echo "KOKKOS_LIBS = $(KOKKOS_LIBS)" >> Makefile.kokkos
echo "KOKKOS_LDFLAGS = $(KOKKOS_LDFLAGS)" >> Makefile.kokkos
sed \
-e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \
-e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \
-e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \
-e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \
-e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \
-e 's|= KokkosCore_config.h|= $(PREFIX)/include/KokkosCore_config.h|g' Makefile.kokkos \
> Makefile.kokkos.tmp
mv -f Makefile.kokkos.tmp Makefile.kokkos
build-lib: build-makefile-kokkos $(KOKKOS_LINK_DEPENDS)
mkdir:
mkdir -p $(PREFIX)
mkdir -p $(PREFIX)/include
mkdir -p $(PREFIX)/lib
mkdir -p $(PREFIX)/include/impl
copy-cuda: mkdir
mkdir -p $(PREFIX)/include/Cuda
cp $(KOKKOS_HEADERS_CUDA) $(PREFIX)/include/Cuda
copy-threads: mkdir
mkdir -p $(PREFIX)/include/Threads
cp $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads
copy-openmp: mkdir
mkdir -p $(PREFIX)/include/OpenMP
cp $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP
install: mkdir $(CONDITIONAL_COPIES) build-lib
cp $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
cp $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
cp Makefile.kokkos $(PREFIX)
cp libkokkos.a $(PREFIX)/lib
cp KokkosCore_config.h $(PREFIX)/include
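# Usage sketch (prefix path hypothetical): build and install the library,
# then have an application Makefile include $(PREFIX)/Makefile.kokkos:
#   make install PREFIX=/opt/kokkos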
clean: kokkos-clean
rm Makefile.kokkos
@ -0,0 +1,496 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMP_PARALLEL_HPP
#define KOKKOS_OPENMP_PARALLEL_HPP
#include <omp.h>
#include <Kokkos_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, const PType & range )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( iwork );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, const PType & range )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( typename PType::work_tag() , iwork );
}
}
public:
inline
ParallelFor( const FunctorType & functor
, const Policy & policy )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
driver( functor , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() ) );
}
/* END #pragma omp parallel */
}
};
} // namespace Impl
} // namespace Kokkos
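/* Usage sketch: the specialization above is reached through the ordinary
 * front end ('SomeFunctor' and 'N' hypothetical):
 *
 * \code
 * Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , N ) ,
 *                       SomeFunctor() );
 * \endcode
 */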
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, reference_type update
, const PType & range )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( iwork , update );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, reference_type update
, const PType & range )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( typename PType::work_tag() , iwork , update );
}
}
public:
//----------------------------------------
template< class ViewType >
inline
ParallelReduce( typename Impl::enable_if<
( Impl::is_view< ViewType >::value &&
Impl::is_same< typename ViewType::memory_space , HostSpace >::value
), const FunctorType & >::type functor
, const Policy & policy
, const ViewType & result_view )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , 0 );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
driver( functor
, ValueInit::init( functor , exec.scratch_reduce() )
, typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
);
}
/* END #pragma omp parallel */
{
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
if ( result_view.ptr_on_device() ) {
const int n = ValueTraits::value_count( functor );
for ( int j = 0 ; j < n ; ++j ) { result_view.ptr_on_device()[j] = ptr[j] ; }
}
}
}
};
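/* Usage sketch: per the enable_if above, the result argument must be a
 * view in HostSpace ('functor' and 'N' hypothetical):
 *
 * \code
 * Kokkos::View< double , Kokkos::HostSpace > result( "result" );
 * Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , N ) ,
 *                          functor , result );
 * \endcode
 */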
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , WorkTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, reference_type update
, const PType & range
, const bool final )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( iwork , update , final );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, reference_type update
, const PType & range
, const bool final )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( typename PType::work_tag() , iwork , update , final );
}
}
public:
//----------------------------------------
inline
ParallelScan( const FunctorType & functor
, const Policy & policy )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( functor ) , 0 );
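// Scratch layout per thread is [ inter-thread offset | local partial ] ,
// each of value_size: the first parallel pass accumulates each thread's
// partial into the second half, and the serial loop below converts the
// partials into exclusive per-thread offsets in the first half.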
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
driver( functor
, ValueInit::init( functor , pointer_type( exec.scratch_reduce() ) + ValueTraits::value_count( functor ) )
, typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
, false );
}
/* END #pragma omp parallel */
{
const unsigned thread_count = OpenMPexec::pool_size();
const unsigned value_count = ValueTraits::value_count( functor );
pointer_type ptr_prev = 0 ;
for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
if ( ptr_prev ) {
for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
ValueJoin::join( functor , ptr + value_count , ptr );
}
else {
ValueInit::init( functor , ptr );
}
ptr_prev = ptr ;
}
}
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
driver( functor
, ValueOps::reference( pointer_type( exec.scratch_reduce() ) )
, typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
, true );
}
/* END #pragma omp parallel */
}
//----------------------------------------
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > >
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > Policy ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member )
{ functor( member ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member )
{ functor( TagType() , member ); }
public:
inline
ParallelFor( const FunctorType & functor ,
const Policy & policy )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
OpenMPexec::resize_scratch( 0 , team_reduce_size + team_shmem_size );
#pragma omp parallel
{
typename Policy::member_type member( * OpenMPexec::get_thread_omp() , policy , team_shmem_size );
for ( ; member.valid() ; member.next() ) {
ParallelFor::template driver< typename Policy::work_tag >( functor , member );
}
}
/* END #pragma omp parallel */
}
void wait() {}
};
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > >
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, const typename PType::member_type & member
, reference_type update )
{ functor( member , update ); }
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, const typename PType::member_type & member
, reference_type update )
{ functor( typename PType::work_tag() , member , update ); }
public:
inline
ParallelReduce( const FunctorType & functor ,
const Policy & policy )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
reference_type update = ValueInit::init( functor , exec.scratch_reduce() );
for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
ParallelReduce::template driver< Policy >( functor , member , update );
}
}
/* END #pragma omp parallel */
{
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag , reference_type > Join ;
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
Join::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
}
}
template< class ViewType >
inline
ParallelReduce( const FunctorType & functor ,
const Policy & policy ,
const ViewType & result )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
reference_type update = ValueInit::init( functor , exec.scratch_reduce() );
for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
ParallelReduce::template driver< Policy >( functor , member , update );
}
}
/* END #pragma omp parallel */
{
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
const int n = ValueTraits::value_count( functor );
for ( int j = 0 ; j < n ; ++j ) { result.ptr_on_device()[j] = ptr[j] ; }
}
}
void wait() {}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* KOKKOS_OPENMP_PARALLEL_HPP */

View File

@@ -0,0 +1,364 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <stdio.h>
#include <limits>
#include <iostream>
#include <vector>
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <iostream>
#ifdef KOKKOS_HAVE_OPENMP
namespace Kokkos {
namespace Impl {
namespace {
KOKKOS_INLINE_FUNCTION
int kokkos_omp_in_parallel();
int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
KOKKOS_INLINE_FUNCTION
int kokkos_omp_in_parallel()
{
#ifndef __CUDA_ARCH__
return omp_in_parallel() && ! kokkos_omp_in_critical_region ;
#else
return 0;
#endif
}
bool s_using_hwloc = false;
} // namespace
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
OpenMPexec::Pool OpenMPexec::m_pool;
void OpenMPexec::verify_is_process( const char * const label )
{
if ( omp_in_parallel() ) {
std::string msg( label );
msg.append( " ERROR: in parallel" );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
void OpenMPexec::verify_initialized( const char * const label )
{
if ( 0 == m_pool[0] ) {
std::string msg( label );
msg.append( " ERROR: not initialized" );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
void OpenMPexec::clear_scratch()
{
#pragma omp parallel
{
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
m_pool.at(rank_rev).clear();
}
/* END #pragma omp parallel */
}
void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
{
enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
enum { ALLOC_EXEC = ( sizeof(OpenMPexec) + ALIGN_MASK ) & ~ALIGN_MASK };
const size_t old_reduce_size = m_pool[0] ? m_pool[0]->m_scratch_reduce_end : 0 ;
const size_t old_thread_size = m_pool[0] ? m_pool[0]->m_scratch_thread_end - m_pool[0]->m_scratch_reduce_end : 0 ;
reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
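// e.g. with a hypothetical Kokkos::Impl::MEMORY_ALIGNMENT of 64, a
// request of 100 bytes is rounded up to 128 bytes.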
// Allocate if the requested sizes exceed the existing allocation:
const bool allocate = ( old_reduce_size < reduce_size ) ||
( old_thread_size < thread_size );
if ( allocate ) {
if ( reduce_size < old_reduce_size ) { reduce_size = old_reduce_size ; }
if ( thread_size < old_thread_size ) { thread_size = old_thread_size ; }
}
const size_t alloc_size = allocate ? ALLOC_EXEC + reduce_size + thread_size : 0 ;
const int pool_size = m_pool_topo[0] ;
if ( allocate ) {
clear_scratch();
#pragma omp parallel
{
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
const int rank = pool_size - ( rank_rev + 1 );
m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
}
/* END #pragma omp parallel */
}
}
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
//----------------------------------------------------------------------------
int OpenMP::is_initialized()
{ return 0 != Impl::OpenMPexec::m_pool[0]; }
void OpenMP::initialize( unsigned thread_count ,
unsigned use_numa_count ,
unsigned use_cores_per_numa )
{
// Before any other call to OMP, query the maximum number of threads
// and save the value for re-initialization unit testing.
// Using omp_get_max_threads() is problematic in conjunction with
// hwloc on Intel: an initial call into the OpenMP runtime without a
// preceding parallel region sets a process mask for a single core.
// Upon entering the first parallel region the runtime then binds
// threads to other cores and makes the process mask the aggregate of
// the thread masks. The intent seems to be to make serial code run
// fast if you compile with OpenMP enabled but never actually use a
// parallel region.
//static int omp_max_threads = omp_get_max_threads();
int nthreads = 0;
#pragma omp parallel
{
#pragma omp atomic
nthreads++;
}
static int omp_max_threads = nthreads;
const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
bool thread_spawn_failed = false ;
if ( ! is_initialized ) {
// Use hwloc thread pinning if concerned with locality:
// when spreading threads across multiple NUMA regions,
// or when hyperthreading is enabled.
Impl::s_using_hwloc = hwloc::available() && (
( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );
std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ];
// If hwloc is available then use its maximum value.
if ( thread_count == 0 ) {
thread_count = Impl::s_using_hwloc
? Kokkos::hwloc::get_available_numa_count() *
Kokkos::hwloc::get_available_cores_per_numa() *
Kokkos::hwloc::get_available_threads_per_core()
: omp_max_threads ;
}
if(Impl::s_using_hwloc)
hwloc::thread_mapping( "Kokkos::OpenMP::initialize" ,
false /* do not allow asynchronous */ ,
thread_count ,
use_numa_count ,
use_cores_per_numa ,
threads_coord );
// Spawn threads:
omp_set_num_threads( thread_count );
// Verify OMP interaction:
if ( int(thread_count) != omp_get_max_threads() ) {
thread_spawn_failed = true ;
}
// Verify spawning and bind threads:
#pragma omp parallel
{
#pragma omp critical
{
if ( int(thread_count) != omp_get_num_threads() ) {
thread_spawn_failed = true ;
}
// Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
// Call to 'new' may not be thread safe either.
// Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
const unsigned omp_rank = omp_get_thread_num();
const unsigned thread_r = Impl::s_using_hwloc ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord ) : omp_rank ;
Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ;
}
/* END #pragma omp critical */
}
/* END #pragma omp parallel */
if ( ! thread_spawn_failed ) {
Impl::OpenMPexec::m_pool_topo[0] = thread_count ;
Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
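// Worked example with hypothetical values: thread_count == 16,
// use_numa_count == 2, use_cores_per_numa == 4 with hwloc enabled
// yields m_pool_topo == { 16, 8, 2 }: 16 threads total, 8 threads per
// NUMA region, 2 threads per core.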
Impl::OpenMPexec::resize_scratch( 1024 , 1024 );
}
}
if ( is_initialized || thread_spawn_failed ) {
std::string msg("Kokkos::OpenMP::initialize ERROR");
if ( is_initialized ) { msg.append(" : already initialized"); }
if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); }
Kokkos::Impl::throw_runtime_exception(msg);
}
// Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
}
//----------------------------------------------------------------------------
void OpenMP::finalize()
{
Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
Impl::OpenMPexec::clear_scratch();
Impl::OpenMPexec::m_pool_topo[0] = 0 ;
Impl::OpenMPexec::m_pool_topo[1] = 0 ;
Impl::OpenMPexec::m_pool_topo[2] = 0 ;
omp_set_num_threads(1);
if ( Impl::s_using_hwloc ) {
hwloc::unbind_this_thread();
}
}
//----------------------------------------------------------------------------
void OpenMP::print_configuration( std::ostream & s , const bool detail )
{
Impl::OpenMPexec::verify_is_process( "OpenMP::print_configuration" );
s << "Kokkos::OpenMP" ;
#if defined( KOKKOS_HAVE_OPENMP )
s << " KOKKOS_HAVE_OPENMP" ;
#endif
#if defined( KOKKOS_HAVE_HWLOC )
const unsigned numa_count_ = Kokkos::hwloc::get_available_numa_count();
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]"
<< " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
;
#endif
const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
if ( is_initialized ) {
const int numa_count = Kokkos::Impl::OpenMPexec::m_pool_topo[0] / Kokkos::Impl::OpenMPexec::m_pool_topo[1] ;
const int core_per_numa = Kokkos::Impl::OpenMPexec::m_pool_topo[1] / Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
const int thread_per_core = Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
s << " thread_pool_topology[ " << numa_count
<< " x " << core_per_numa
<< " x " << thread_per_core
<< " ]"
<< std::endl ;
if ( detail ) {
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPexec::m_pool_topo[0] );
#pragma omp parallel
{
#pragma omp critical
{
coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
}
/* END #pragma omp critical */
}
/* END #pragma omp parallel */
for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
s << " thread omp_rank[" << i << "]"
<< " kokkos_rank[" << Impl::OpenMPexec::m_map_rank[ i ] << "]"
<< " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
<< std::endl ;
}
}
}
else {
s << " not initialized" << std::endl ;
}
}
} // namespace Kokkos
#endif //KOKKOS_HAVE_OPENMP

View File

@@ -0,0 +1,767 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMPEXEC_HPP
#define KOKKOS_OPENMPEXEC_HPP
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Kokkos_Atomic.hpp>
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
/** \brief Data for OpenMP thread execution */
class OpenMPexec {
public:
enum { MAX_THREAD_COUNT = 4096 };
struct Pool
{
Pool() : m_trackers() {}
AllocationTracker m_trackers[ MAX_THREAD_COUNT ];
OpenMPexec * operator[](int i)
{
return reinterpret_cast<OpenMPexec *>(m_trackers[i].alloc_ptr());
}
AllocationTracker & at(int i)
{
return m_trackers[i];
}
};
private:
static int m_pool_topo[ 4 ];
static int m_map_rank[ MAX_THREAD_COUNT ];
static Pool m_pool; // Indexed by: m_pool_rank_rev
friend class Kokkos::OpenMP ;
int const m_pool_rank ;
int const m_pool_rank_rev ;
int const m_scratch_exec_end ;
int const m_scratch_reduce_end ;
int const m_scratch_thread_end ;
int volatile m_barrier_state ;
OpenMPexec();
OpenMPexec( const OpenMPexec & );
OpenMPexec & operator = ( const OpenMPexec & );
static void clear_scratch();
public:
// Topology of a cache coherent thread pool:
// TOTAL = NUMA x GRAIN
// pool_size( depth = 0 )
// pool_size(0) = total number of threads
// pool_size(1) = number of threads per NUMA
// pool_size(2) = number of threads sharing finest grain memory hierarchy
inline static
int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
inline static
OpenMPexec * pool_rev( int pool_rank_rev ) { return m_pool[ pool_rank_rev ]; }
inline int pool_rank() const { return m_pool_rank ; }
inline int pool_rank_rev() const { return m_pool_rank_rev ; }
inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
inline
void state_wait( int state )
{ Impl::spinwait( m_barrier_state , state ); }
inline
void state_set( int state ) { m_barrier_state = state ; }
~OpenMPexec() {}
OpenMPexec( const int poolRank
, const int scratch_exec_size
, const int scratch_reduce_size
, const int scratch_thread_size )
: m_pool_rank( poolRank )
, m_pool_rank_rev( pool_size() - ( poolRank + 1 ) )
, m_scratch_exec_end( scratch_exec_size )
, m_scratch_reduce_end( m_scratch_exec_end + scratch_reduce_size )
, m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size )
, m_barrier_state(0)
{}
static void finalize();
static void initialize( const unsigned team_count ,
const unsigned threads_per_team ,
const unsigned numa_count ,
const unsigned cores_per_numa );
static void verify_is_process( const char * const );
static void verify_initialized( const char * const );
static void resize_scratch( size_t reduce_size , size_t thread_size );
inline static
OpenMPexec * get_thread_omp() { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class OpenMPexecTeamMember {
private:
enum { TEAM_REDUCE_SIZE = 512 };
/** \brief Thread states for team synchronization */
enum { Active = 0 , Rendezvous = 1 };
typedef Kokkos::OpenMP execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
Impl::OpenMPexec & m_exec ;
scratch_memory_space m_team_shared ;
int m_team_shmem ;
int m_team_base_rev ;
int m_team_rank_rev ;
int m_team_rank ;
int m_team_size ;
int m_league_rank ;
int m_league_end ;
int m_league_size ;
// Fan-in team threads, root of the fan-in which does not block returns true
inline
bool team_fan_in() const
{
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
}
if ( m_team_rank_rev ) {
m_exec.state_set( Rendezvous );
m_exec.state_wait( Rendezvous );
}
return 0 == m_team_rank_rev ;
}
inline
void team_fan_out() const
{
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
}
}
public:
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_shmem() const
{ return m_team_shared ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{}
#else
{
if ( 1 < m_team_size ) {
team_fan_in();
team_fan_out();
}
}
#endif
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast(ValueType& value, const int& thread_id) const
{
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ }
#else
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
, ValueType , void >::type type ;
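// If ValueType does not fit within TEAM_REDUCE_SIZE then 'type'
// collapses to 'void' and the dereferences below fail to compile,
// turning a scratch-buffer overflow into a compile-time error.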
type * const local_value = ((type*) m_exec.scratch_thread());
if(team_rank() == thread_id)
*local_value = value;
memory_fence();
team_barrier();
value = *local_value;
#endif
}
#ifdef KOKKOS_HAVE_CXX11
template< class ValueType, class JoinOp >
KOKKOS_INLINE_FUNCTION ValueType
team_reduce( const ValueType & value
, const JoinOp & op_in ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return ValueType(); }
#else
{
typedef ValueType value_type;
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
#endif
#else // KOKKOS_HAVE_CXX11
template< class JoinOp >
KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return typename JoinOp::value_type(); }
#else
{
typedef typename JoinOp::value_type value_type;
#endif
#endif // KOKKOS_HAVE_CXX11
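// The function body below is shared by both declarations above: the
// C++11 branch first wraps the lambda join in a JoinLambdaAdapter named
// 'op'; the pre-C++11 branch uses the JoinOp's value_type directly.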
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
, value_type , void >::type type ;
type * const local_value = ((type*) m_exec.scratch_thread());
// Set this thread's contribution
*local_value = value ;
// Fence to make sure the base team member has access:
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
type * const team_value = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
// Join to the team value:
for ( int i = 1 ; i < m_team_size ; ++i ) {
op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) );
}
// The base team member may "lap" the other team members,
// so copy the result to their local values before proceeding.
for ( int i = 1 ; i < m_team_size ; ++i ) {
*((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) = *team_value ;
}
// Fence to make sure all team members have access
memory_fence();
}
team_fan_out();
return *((type volatile const *)local_value);
}
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename ArgType >
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return ArgType(); }
#else
{
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
volatile type * const work_value = ((type*) m_exec.scratch_thread());
*work_value = value ;
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
// m_team_base[0] == highest ranking team member
// m_team_base[ m_team_size - 1 ] == lowest ranking team member
//
// 1) copy from lower to higher rank, initialize lowest rank to zero
// 2) prefix sum from lowest to highest rank, skipping lowest rank
type accum = 0 ;
if ( global_accum ) {
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
accum += val ;
}
accum = atomic_fetch_add( global_accum , accum );
}
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
const type offset = accum ;
accum += val ;
val = offset ;
}
memory_fence();
}
team_fan_out();
return *work_value ;
}
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
{ return this-> template team_scan<Type>( value , 0 ); }
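// Illustrative usage sketch ('member' and 'my_count' are hypothetical
// names, not from this file):
//   const int offset = member.team_scan( my_count );
//   // 'offset' is the exclusive prefix sum of 'my_count' in team rank
//   // order; the highest rank recovers the total as offset + my_count.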
//----------------------------------------
// Private for the driver
private:
typedef execution_space::scratch_memory_space space ;
public:
template< class Arg0 , class Arg1 >
inline
OpenMPexecTeamMember( Impl::OpenMPexec & exec
, const TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > & team
, const int shmem_size
)
: m_exec( exec )
, m_team_shared(0,0)
, m_team_shmem( shmem_size )
, m_team_base_rev(0)
, m_team_rank_rev(0)
, m_team_rank(0)
, m_team_size( team.team_size() )
, m_league_rank(0)
, m_league_end(0)
, m_league_size( team.league_size() )
{
const int pool_rank_rev = m_exec.pool_rank_rev();
const int pool_team_rank_rev = pool_rank_rev % team.team_alloc();
const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
const int league_iter_end = team.league_size() - pool_league_rank_rev * team.team_iter();
if ( pool_team_rank_rev < m_team_size && 0 < league_iter_end ) {
m_team_base_rev = team.team_alloc() * pool_league_rank_rev ;
m_team_rank_rev = pool_team_rank_rev ;
m_team_rank = m_team_size - ( m_team_rank_rev + 1 );
m_league_end = league_iter_end ;
m_league_rank = league_iter_end > team.team_iter() ? league_iter_end - team.team_iter() : 0 ;
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
}
}
bool valid() const
{ return m_league_rank < m_league_end ; }
void next()
{
if ( ++m_league_rank < m_league_end ) {
team_barrier();
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
}
}
static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
};
} // namespace Impl
template< class Arg0 , class Arg1 >
class TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP >
{
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicy execution_policy ;
//! Execution space of this execution policy.
typedef Kokkos::OpenMP execution_space ;
typedef typename
Impl::if_c< ! Impl::is_same< Kokkos::OpenMP , Arg0 >::value , Arg0 , Arg1 >::type
work_tag ;
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return execution_space::thread_pool_size(1); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType & )
{ return execution_space::thread_pool_size(2); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType &, const int& )
{ return execution_space::thread_pool_size(2); }
//----------------------------------------
private:
int m_league_size ;
int m_team_size ;
int m_team_alloc ;
int m_team_iter ;
inline void init( const int league_size_request
, const int team_size_request )
{
const int pool_size = execution_space::thread_pool_size(0);
const int team_max = execution_space::thread_pool_size(1);
const int team_grain = execution_space::thread_pool_size(2);
m_league_size = league_size_request ;
m_team_size = team_size_request < team_max ?
team_size_request : team_max ;
// Round team size up to a multiple of 'team_grain'
const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
const int team_count = pool_size / team_size_grain ;
// Constraint : pool_size = m_team_alloc * team_count
m_team_alloc = pool_size / team_count ;
// Maximum number of iterations each team will take:
m_team_iter = ( m_league_size + team_count - 1 ) / team_count ;
}
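// Worked example with hypothetical sizes, assuming team_max >= 6:
// pool_size == 16, team_grain == 2, league_size_request == 10, and
// team_size_request == 6 give m_team_size = 6, team_size_grain = 6,
// team_count = 16 / 6 = 2, m_team_alloc = 16 / 2 = 8, and
// m_team_iter = (10 + 2 - 1) / 2 = 5.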
public:
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
/** \brief Specify league size, request team size */
TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1)
{ init( league_size_request , team_size_request ); (void) vector_length_request; }
TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
{ init( league_size_request , team_size_request ); (void) vector_length_request; }
inline int team_alloc() const { return m_team_alloc ; }
inline int team_iter() const { return m_team_iter ; }
typedef Impl::OpenMPexecTeamMember member_type ;
};
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
inline
int OpenMP::thread_pool_size( int depth )
{
return Impl::OpenMPexec::pool_size(depth);
}
KOKKOS_INLINE_FUNCTION
int OpenMP::thread_pool_rank()
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Impl::OpenMPexec::m_map_rank[ omp_get_thread_num() ];
#else
return -1 ;
#endif
}
} // namespace Kokkos
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>
TeamThreadRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>
TeamThreadRange(const Impl::OpenMPexecTeamMember& thread, const iType& begin, const iType& end) {
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>(thread,begin,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >
ThreadVectorRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember> PerTeam(const Impl::OpenMPexecTeamMember& thread) {
return Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember> PerThread(const Impl::OpenMPexecTeamMember& thread) {
return Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = loop_boundaries.thread.team_reduce(result,join);
}
} //namespace Kokkos
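// Illustrative nested-parallelism sketch (functor body only; 'member',
// 'N', and 'sum' are hypothetical names, not from this file):
//   double sum = 0;
//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , N ) ,
//     [&] ( const int i , double & val ) { val += double(i) ; } , sum );
//   // every thread of the team receives the same 'sum' via team_reduce.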
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = result;
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,scan_val,true);
}
}
} // namespace Kokkos
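// Illustrative vector-scan sketch ('member', 'n', and 'data' are
// hypothetical names, not from this file):
//   Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member , n ) ,
//     [&] ( const int i , float & partial , const bool final ) {
//       const float v = data[i];
//       if ( final ) data[i] = partial ; // write the exclusive prefix
//       partial += v ;                   // contribute regardless of 'final'
//     } );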
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
if(single_struct.team_member.team_rank()==0) lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
if(single_struct.team_member.team_rank()==0) {
lambda(val);
}
single_struct.team_member.team_broadcast(val,0);
}
}
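// Illustrative 'single' usage ('member' and 'val' are hypothetical
// names, not from this file):
//   int val = 0;
//   Kokkos::single( Kokkos::PerTeam( member ) ,
//     [&] ( int & v ) { v = 42 ; } , val );
//   // team rank 0 executes the lambda; 'val' is then broadcast so all
//   // team members observe the same value.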
#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */

View File

@@ -0,0 +1,484 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_QTHREAD )
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <utility>
#include <Kokkos_Qthread.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Error.hpp>
// Defines to enable experimental Qthread functionality
#define QTHREAD_LOCAL_PRIORITY
#define CLONED_TASKS
#include <qthread/qthread.h>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
enum { MAXIMUM_QTHREAD_WORKERS = 1024 };
/** s_exec is indexed by the reverse rank of the workers
* for faster fan-in / fan-out lookups
* [ n - 1 , n - 2 , ... , 0 ]
*/
QthreadExec * s_exec[ MAXIMUM_QTHREAD_WORKERS ];
int s_number_shepherds = 0 ;
int s_number_workers_per_shepherd = 0 ;
int s_number_workers = 0 ;
inline
QthreadExec ** worker_exec()
{
return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL) + 1 );
}
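// e.g. with hypothetical counts of 2 shepherds x 4 workers each
// (s_number_workers == 8), shepherd 1 / local worker 2 maps to
// s_exec[ 8 - (1*4 + 2 + 1) ] == s_exec[1].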
const int s_base_size = QthreadExec::align_alloc( sizeof(QthreadExec) );
int s_worker_reduce_end = 0 ; /* End of worker reduction memory */
int s_worker_shared_end = 0 ; /* Total of worker scratch memory */
int s_worker_shared_begin = 0 ; /* Beginning of worker shared memory */
QthreadExecFunctionPointer volatile s_active_function = 0 ;
const void * volatile s_active_function_arg = 0 ;
} /* namespace */
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
void Qthread::initialize( int thread_count )
{
// Environment variable: QTHREAD_NUM_SHEPHERDS
// Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
// Environment variable: QTHREAD_HWPAR
{
char buffer[256];
snprintf(buffer,sizeof(buffer),"QTHREAD_HWPAR=%d",thread_count);
putenv(buffer);
}
const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
( thread_count == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) &&
( thread_count == qthread_num_workers() );
bool ok_symmetry = true ;
if ( ok_init ) {
Impl::s_number_shepherds = qthread_num_shepherds();
Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
Impl::s_number_workers = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ;
for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) {
ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) );
}
}
if ( ! ok_init || ! ok_symmetry ) {
std::ostringstream msg ;
msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD);
msg << " : qthread_num_workers = " << qthread_num_workers();
if ( ! ok_symmetry ) {
msg << " : qthread_num_workers_local = {" ;
for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) {
msg << " " << qthread_num_workers_local(i) ;
}
msg << " }" ;
}
Impl::s_number_workers = 0 ;
Impl::s_number_shepherds = 0 ;
Impl::s_number_workers_per_shepherd = 0 ;
if ( ok_init ) { qthread_finalize(); }
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
Impl::QthreadExec::resize_worker_scratch( 256 , 256 );
// Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
}
void Qthread::finalize()
{
Impl::QthreadExec::clear_workers();
if ( Impl::s_number_workers ) {
qthread_finalize();
}
Impl::s_number_workers = 0 ;
Impl::s_number_shepherds = 0 ;
Impl::s_number_workers_per_shepherd = 0 ;
}
void Qthread::print_configuration( std::ostream & s , const bool detail )
{
s << "Kokkos::Qthread {"
<< " num_shepherds(" << Impl::s_number_shepherds << ")"
<< " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
<< " }" << std::endl ;
}
Qthread & Qthread::instance( int )
{
static Qthread q ;
return q ;
}
void Qthread::fence()
{
}
int Qthread::shepherd_size() const { return Impl::s_number_shepherds ; }
int Qthread::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd ; }
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
aligned_t driver_exec_all( void * arg )
{
QthreadExec & exec = **worker_exec();
(*s_active_function)( exec , s_active_function_arg );
/*
fprintf( stdout
, "QthreadExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
, exec.worker_rank()
, exec.worker_size()
, exec.shepherd_rank()
, exec.shepherd_size()
, exec.shepherd_worker_rank()
, exec.shepherd_worker_size()
);
fflush(stdout);
*/
return 0 ;
}
aligned_t driver_resize_worker_scratch( void * arg )
{
static volatile int lock_begin = 0 ;
static volatile int lock_end = 0 ;
QthreadExec ** const exec = worker_exec();
//----------------------------------------
// Serialize allocation for thread safety
while ( ! atomic_compare_exchange_strong( & lock_begin , 0 , 1 ) ); // Spin wait to claim lock
const bool ok = 0 == *exec ;
if ( ok ) { *exec = (QthreadExec *) malloc( s_base_size + s_worker_shared_end ); }
lock_begin = 0 ; // release lock
if ( ok ) { new( *exec ) QthreadExec(); }
//----------------------------------------
// Wait for all calls to complete to ensure that each worker has executed.
if ( s_number_workers == 1 + atomic_fetch_add( & lock_end , 1 ) ) { lock_end = 0 ; }
while ( lock_end );
/*
fprintf( stdout
, "QthreadExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
, (**exec).worker_rank()
, (**exec).worker_size()
, (**exec).shepherd_rank()
, (**exec).shepherd_size()
, (**exec).shepherd_worker_rank()
, (**exec).shepherd_worker_size()
);
fflush(stdout);
*/
//----------------------------------------
if ( ! ok ) {
fprintf( stderr , "Kokkos::QthreadExec resize failed\n" );
fflush( stderr );
}
return 0 ;
}
void verify_is_process( const char * const label , bool not_active = false )
{
const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local(NULL);
const bool is_active = not_active && ( s_active_function || s_active_function_arg );
if ( not_process || is_active ) {
std::string msg( label );
msg.append( " : FAILED" );
if ( not_process ) msg.append(" : not called by main process");
if ( is_active ) msg.append(" : parallel execution in progress");
Kokkos::Impl::throw_runtime_exception( msg );
}
}
}
int QthreadExec::worker_per_shepherd()
{
return s_number_workers_per_shepherd ;
}
QthreadExec::QthreadExec()
{
const int shepherd_rank = qthread_shep();
const int shepherd_worker_rank = qthread_worker_local(NULL);
const int worker_rank = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank ;
m_worker_base = s_exec ;
m_shepherd_base = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
m_scratch_alloc = ( (unsigned char *) this ) + s_base_size ;
m_reduce_end = s_worker_reduce_end ;
m_shepherd_rank = shepherd_rank ;
m_shepherd_size = s_number_shepherds ;
m_shepherd_worker_rank = shepherd_worker_rank ;
m_shepherd_worker_size = s_number_workers_per_shepherd ;
m_worker_rank = worker_rank ;
m_worker_size = s_number_workers ;
m_worker_state = QthreadExec::Active ;
}
void QthreadExec::clear_workers()
{
for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
QthreadExec * const exec = s_exec[iwork] ;
s_exec[iwork] = 0 ;
free( exec );
}
}
void QthreadExec::shared_reset( Qthread::scratch_memory_space & space )
{
new( & space )
Qthread::scratch_memory_space(
((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin ,
s_worker_shared_end - s_worker_shared_begin
);
}
void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
{
const int exec_all_reduce_alloc = align_alloc( reduce_size );
const int shepherd_scan_alloc = align_alloc( 8 );
const int shepherd_shared_end = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
if ( s_worker_reduce_end < exec_all_reduce_alloc ||
s_worker_shared_end < shepherd_shared_end ) {
/*
fprintf( stdout , "QthreadExec::resize\n");
fflush(stdout);
*/
// Clear current worker memory before allocating new worker memory
clear_workers();
// Increase the buffers to an aligned allocation
s_worker_reduce_end = exec_all_reduce_alloc ;
s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
s_worker_shared_end = shepherd_shared_end ;
// Need to query which shepherd this main 'process' is running on...
const int main_shep = qthread_shep();
// Have each worker resize its memory for proper first-touch
#if 0
for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
}}
#else
// If this function is used before the 'qthread.task_policy' unit test,
// that unit test fails with a seg-fault within libqthread.so.
for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
if ( num_clone ) {
const int ret = qthread_fork_clones_to_local_priority
( driver_resize_worker_scratch /* function */
, NULL /* function data block */
, NULL /* pointer to return value feb */
, jshep /* shepherd number */
, num_clone - 1 /* number of instances - 1 */
);
assert(ret == QTHREAD_SUCCESS);
}
}
#endif
driver_resize_worker_scratch( NULL );
// Verify all workers allocated
bool ok = true ;
for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }
if ( ! ok ) {
std::ostringstream msg ;
msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
}
msg << " }" ;
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
}
}
void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
{
verify_is_process("QthreadExec::exec_all(...)",true);
/*
fprintf( stdout , "QthreadExec::exec_all\n");
fflush(stdout);
*/
s_active_function = func ;
s_active_function_arg = arg ;
// Need to query which shepherd this main 'process' is running on...
const int main_shep = qthread_shep();
#if 0
for ( int jshep = 0 , iwork = 0 ; jshep < s_number_shepherds ; ++jshep ) {
for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i , ++iwork ) {
qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
}}
#else
// If this function is used before the 'qthread.task_policy' unit test,
// that unit test fails with a seg-fault within libqthread.so.
for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
if ( num_clone ) {
const int ret = qthread_fork_clones_to_local_priority
( driver_exec_all /* function */
, NULL /* function data block */
, NULL /* pointer to return value feb */
, jshep /* shepherd number */
, num_clone - 1 /* number of instances - 1 */
);
assert(ret == QTHREAD_SUCCESS);
}
}
#endif
driver_exec_all( NULL );
s_active_function = 0 ;
s_active_function_arg = 0 ;
}
void * QthreadExec::exec_all_reduce_result()
{
return s_exec[0]->m_scratch_alloc ;
}
} /* namespace Impl */
} /* namespace Kokkos */
namespace Kokkos {
namespace Impl {
QthreadTeamPolicyMember::QthreadTeamPolicyMember()
: m_exec( **worker_exec() )
, m_team_shared(0,0)
, m_team_size( 1 ) // s_number_workers_per_shepherd )
, m_team_rank( 0 ) // m_exec.shepherd_worker_rank() )
, m_league_size(1)
, m_league_end(1)
, m_league_rank(0)
{
m_exec.shared_reset( m_team_shared );
}
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */

View File

@@ -0,0 +1,614 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_QTHREADEXEC_HPP
#define KOKKOS_QTHREADEXEC_HPP
#include <impl/Kokkos_spinwait.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
class QthreadExec ;
typedef void (*QthreadExecFunctionPointer)( QthreadExec & , const void * );
class QthreadExec {
private:
enum { Inactive = 0 , Active = 1 };
const QthreadExec * const * m_worker_base ;
const QthreadExec * const * m_shepherd_base ;
void * m_scratch_alloc ; ///< Scratch memory [ reduce , team , shared ]
int m_reduce_end ; ///< End of scratch reduction memory
int m_shepherd_rank ;
int m_shepherd_size ;
int m_shepherd_worker_rank ;
int m_shepherd_worker_size ;
/*
* m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
* m_worker_size = m_shepherd_size * m_shepherd_worker_size
*/
int m_worker_rank ;
int m_worker_size ;
int mutable volatile m_worker_state ;
friend class Kokkos::Qthread ;
~QthreadExec();
QthreadExec( const QthreadExec & );
QthreadExec & operator = ( const QthreadExec & );
public:
QthreadExec();
/** Execute the input function on all available Qthread workers */
static void exec_all( Qthread & , QthreadExecFunctionPointer , const void * );
//----------------------------------------
/** Barrier across all workers participating in the 'exec_all' */
void exec_all_barrier() const
{
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
}
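/* Note on the fan pattern above: with rev_rank = m_worker_size - (m_worker_rank+1)
 * and worker data stored in reverse-rank order, each worker whose low bits of
 * rev_rank are zero gathers from partners at rev_rank+1, rev_rank+2, rev_rank+4, ...
 * For example with four workers: rev_rank 2 waits on 3, rev_rank 0 (the root)
 * waits on 1 and 2, and the release then walks the same binary tree in reverse.
 */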
/** Barrier across workers within the shepherd with rank < team_rank */
void shepherd_barrier( const int team_size ) const
{
if ( m_shepherd_worker_rank < team_size ) {
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
}
}
//----------------------------------------
/** Reduce across all workers participating in the 'exec_all' */
template< class FunctorType , class ArgTag >
inline
void exec_all_reduce( const FunctorType & func ) const
{
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
const QthreadExec & fan = *m_worker_base[j];
Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
ValueJoin::join( func , m_scratch_alloc , fan.m_scratch_alloc );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
}
//----------------------------------------
/** Scan across all workers participating in the 'exec_all' */
template< class FunctorType , class ArgTag >
inline
void exec_all_scan( const FunctorType & func ) const
{
typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , ArgTag > ValueOps ;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
// Root thread scans across values before releasing threads
// Worker data is in reverse order, so m_worker_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
for ( int i = 1 ; i < m_worker_size ; ++i ) {
ValueOps::copy( func
, m_worker_base[i-1]->m_scratch_alloc
, m_worker_base[i]->m_scratch_alloc
);
}
ValueInit::init( func , m_worker_base[m_worker_size-1]->m_scratch_alloc );
// Join from lower ranking to higher ranking worker.
// Value at m_worker_base[ m_worker_size - 1 ] is zero so skip adding it to m_worker_base[ m_worker_size - 2 ].
for ( int i = m_worker_size - 1 ; --i ; ) {
ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
}
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
}
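/* Worked example of the scan above with four workers, where the worker of
 * rank r contributes v_r (data is in reverse order, so m_worker_base[0]
 * holds v3): the copy loop shifts the slots to { v2, v1, v0, v0 }, the init
 * resets the last slot to the identity { v2, v1, v0, 0 }, and the join loop
 * accumulates to { v0+v1+v2, v0+v1, v0, 0 } -- an exclusive prefix sum in
 * rank order.
 */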
//----------------------------------------
template< class Type>
inline
volatile Type * shepherd_team_scratch_value() const
{ return (volatile Type*)(((unsigned char *) m_scratch_alloc) + m_reduce_end); }
template< class Type >
inline
void shepherd_broadcast( Type & value , const int team_size , const int team_rank ) const
{
if ( m_shepherd_base ) {
Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value ; }
memory_fence();
shepherd_barrier( team_size );
value = *shared_value ;
}
}
template< class Type >
inline
Type shepherd_reduce( const int team_size , const Type & value ) const
{
*shepherd_team_scratch_value<Type>() = value ;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1 ; i < team_size ; ++i ) { // join every team member, not the fan-in stride 'n'
accum += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
}
for ( int i = 1 ; i < team_size ; ++i ) {
* m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
}
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
return *shepherd_team_scratch_value<Type>();
}
template< class JoinOp >
inline
typename JoinOp::value_type
shepherd_reduce( const int team_size
, const typename JoinOp::value_type & value
, const JoinOp & op ) const
{
typedef typename JoinOp::value_type Type ;
*shepherd_team_scratch_value<Type>() = value ;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1 ; i < team_size ; ++i ) {
op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
}
for ( int i = 1 ; i < team_size ; ++i ) {
* m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
}
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
return *shepherd_team_scratch_value<Type>();
}
template< class Type >
inline
Type shepherd_scan( const int team_size
, const Type & value
, Type * const global_value = 0 ) const
{
*shepherd_team_scratch_value<Type>() = value ;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
// Root thread scans across values before releasing threads
// Worker data is in reverse order, so m_shepherd_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1 ; i < team_size ; ++i ) {
const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
accum += tmp ;
* m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ;
}
* m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
global_value ? atomic_fetch_add( global_value , accum ) : 0 ;
// Join from lower ranking to higher ranking worker.
for ( int i = team_size ; --i ; ) {
* m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
}
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
return *shepherd_team_scratch_value<Type>();
}
//----------------------------------------
static inline
int align_alloc( int size )
{
enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64 bytes */ };
enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK ;
}
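/* E.g. align_alloc(1) == 64, align_alloc(64) == 64, and align_alloc(70) == 128:
 * sizes are rounded up to the 64-byte allocation grain. */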
void shared_reset( Qthread::scratch_memory_space & );
void * exec_all_reduce_value() const { return m_scratch_alloc ; }
static void * exec_all_reduce_result();
static void resize_worker_scratch( const int reduce_size , const int shared_size );
static void clear_workers();
//----------------------------------------
inline int worker_rank() const { return m_worker_rank ; }
inline int worker_size() const { return m_worker_size ; }
inline int shepherd_worker_rank() const { return m_shepherd_worker_rank ; }
inline int shepherd_worker_size() const { return m_shepherd_worker_size ; }
inline int shepherd_rank() const { return m_shepherd_rank ; }
inline int shepherd_size() const { return m_shepherd_size ; }
static int worker_per_shepherd();
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class QthreadTeamPolicyMember {
private:
typedef Kokkos::Qthread execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
Impl::QthreadExec & m_exec ;
scratch_memory_space m_team_shared ;
const int m_team_size ;
const int m_team_rank ;
const int m_league_size ;
const int m_league_end ;
int m_league_rank ;
public:
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_shmem() const { return m_team_shared ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{}
#else
{ m_exec.shepherd_barrier( m_team_size ); }
#endif
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value , int rank ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_broadcast<Type>( value , m_team_size , rank ); }
#endif
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_reduce<Type>( m_team_size , value ); }
#endif
template< typename JoinOp >
KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return typename JoinOp::value_type(); }
#else
{ return m_exec.template shepherd_reduce<JoinOp>( m_team_size , value , op ); }
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_scan<Type>( m_team_size , value ); }
#endif
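/* Worked example: with team_size == 4 and every rank contributing 10,
 * team_scan returns 0, 10, 20, 30 for ranks 0..3, and the highest rank
 * recovers the reduction total as 30 + 10 == 40. */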
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_scan<Type>( m_team_size , value , global_accum ); }
#endif
//----------------------------------------
// Private driver for task-team parallel
QthreadTeamPolicyMember();
//----------------------------------------
// Private for the driver: ( for ( member_type i(exec,team) ; i ; i.next_team() ) { ... } )
// Initialize
template< class Arg0 , class Arg1 >
QthreadTeamPolicyMember( Impl::QthreadExec & exec , const TeamPolicy<Arg0,Arg1,Qthread> & team )
: m_exec( exec )
, m_team_shared(0,0)
, m_team_size( team.m_team_size )
, m_team_rank( exec.shepherd_worker_rank() )
, m_league_size( team.m_league_size )
, m_league_end( team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
, m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
{
m_exec.shared_reset( m_team_shared );
}
// Continue
operator bool () const { return m_league_rank < m_league_end ; }
// iterate
void next_team() { ++m_league_rank ; m_exec.shared_reset( m_team_shared ); }
};
} // namespace Impl
template< class Arg0 , class Arg1 >
class TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread >
{
private:
const int m_league_size ;
const int m_team_size ;
const int m_shepherd_iter ;
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicy execution_policy ;
typedef Qthread execution_space ;
typedef typename
Impl::if_c< ! Impl::is_same< Kokkos::Qthread , Arg0 >::value , Arg0 , Arg1 >::type
work_tag ;
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return Qthread::instance().shepherd_worker_size(); }
template< class FunctorType >
static int team_size_recommended( const FunctorType & f )
{ return team_size_max( f ); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType & f , const int& )
{ return team_size_max( f ); }
//----------------------------------------
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
// One active team per shepherd
TeamPolicy( Kokkos::Qthread & q
, const int league_size
, const int team_size
)
: m_league_size( league_size )
, m_team_size( team_size < q.shepherd_worker_size()
? team_size : q.shepherd_worker_size() )
, m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
{
}
// One active team per shepherd
TeamPolicy( const int league_size
, const int team_size
)
: m_league_size( league_size )
, m_team_size( team_size < Qthread::instance().shepherd_worker_size()
? team_size : Qthread::instance().shepherd_worker_size() )
, m_shepherd_iter( ( league_size + Qthread::instance().shepherd_size() - 1 ) / Qthread::instance().shepherd_size() )
{
}
typedef Impl::QthreadTeamPolicyMember member_type ;
friend class Impl::QthreadTeamPolicyMember ;
};
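/* Illustrative usage sketch. The functor type, do_work(), and the league /
 * team sizes below are hypothetical placeholders, not part of this header:
 *
 * struct MyTeamFunctor {
 * KOKKOS_INLINE_FUNCTION
 * void operator()( const Kokkos::TeamPolicy< Kokkos::Qthread >::member_type & member ) const
 * { do_work( member.league_rank() , member.team_rank() ); }
 * };
 *
 * Kokkos::TeamPolicy< Kokkos::Qthread > policy( league_size , team_size );
 * Kokkos::parallel_for( policy , MyTeamFunctor() );
 */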
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_QTHREADEXEC_HPP */


@ -0,0 +1,643 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_QTHREAD_PARALLEL_HPP
#define KOKKOS_QTHREAD_PARALLEL_HPP
#include <vector>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <Qthread/Kokkos_QthreadExec.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ;
const FunctorType m_func ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( i );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i );
}
}
// Function is called once by every concurrent thread.
static void execute( QthreadExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
driver( self.m_func , typename Policy::WorkRange( self.m_policy , exec.worker_rank() , exec.worker_size() ) );
// All threads wait for completion.
exec.exec_all_barrier();
}
public:
ParallelFor( const FunctorType & functor
, const Policy & policy
)
: m_func( functor )
, m_policy( policy )
{
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
}
};
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( i , update );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update );
}
}
static void execute( QthreadExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
driver( self.m_func
, ValueInit::init( self.m_func , exec.exec_all_reduce_value() )
, typename Policy::WorkRange( self.m_policy , exec.worker_rank() , exec.worker_size() )
);
exec.template exec_all_reduce<FunctorType, typename Policy::work_tag >( self.m_func );
}
public:
template< class HostViewType >
ParallelReduce( const FunctorType & functor
, const Policy & policy
, const HostViewType & result_view )
: m_func( functor )
, m_policy( policy )
{
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data );
if ( result_view.ptr_on_device() ) {
const unsigned n = ValueTraits::value_count( m_func );
for ( unsigned i = 0 ; i < n ; ++i ) { result_view.ptr_on_device()[i] = data[i]; }
}
}
};
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelFor< FunctorType , TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > >
{
private:
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > Policy ;
const FunctorType m_func ;
const Policy m_team ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_func( member ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_func( TagType() , member ); }
static void execute( QthreadExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
typename Policy::member_type member( exec , self.m_team );
while ( member ) {
self.ParallelFor::template driver< typename Policy::work_tag >( member );
member.team_barrier();
member.next_team();
}
exec.exec_all_barrier();
}
public:
ParallelFor( const FunctorType & functor ,
const Policy & policy )
: m_func( functor )
, m_team( policy )
{
QthreadExec::resize_worker_scratch
( /* reduction memory */ 0
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
}
};
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelReduce< FunctorType , TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > >
{
private:
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_team ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_func( member , update ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_func( TagType() , member , update ); }
static void execute( QthreadExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
// Initialize thread-local value
reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
typename Policy::member_type member( exec , self.m_team );
while ( member ) {
self.ParallelReduce::template driver< typename Policy::work_tag >( member , update );
member.team_barrier();
member.next_team();
}
exec.template exec_all_reduce< FunctorType , typename Policy::work_tag >( self.m_func );
}
public:
template< class ViewType >
ParallelReduce( const FunctorType & functor ,
const Policy & policy ,
const ViewType & result )
: m_func( functor )
, m_team( policy )
{
QthreadExec::resize_worker_scratch
( /* reduction memory */ ValueTraits::value_size( functor )
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data );
const unsigned n = ValueTraits::value_count( m_func );
for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
}
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const bool final
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( i , update , final );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const bool final
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update , final );
}
}
static void execute( QthreadExec & exec , const void * arg )
{
const ParallelScan & self = * ((const ParallelScan *) arg );
const typename Policy::WorkRange range( self.m_policy , exec.worker_rank() , exec.worker_size() );
// Initialize thread-local value
reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
driver( self.m_func , update , false , range );
exec.template exec_all_scan< FunctorType , typename Policy::work_tag >( self.m_func );
driver( self.m_func , update , true , range );
exec.exec_all_barrier();
}
public:
ParallelScan( const FunctorType & functor
, const Policy & policy
)
: m_func( functor )
, m_policy( policy )
{
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::execute , this );
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>
TeamThreadRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count)
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>
TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread
, const iType & begin
, const iType & end
)
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>(thread,begin,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >
ThreadVectorRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember> PerTeam(const Impl::QthreadTeamPolicyMember& thread) {
return Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::QthreadTeamPolicyMember& thread) {
return Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
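/* Hypothetical usage inside a team functor ('member', 'N', and the view 'v'
 * are placeholders, and a C++11 lambda is assumed):
 *
 * Kokkos::parallel_for( Kokkos::TeamThreadRange( member , N ) ,
 * [&]( const int i ) { v(i) = 2 * v(i); } );
 */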
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
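/* Hypothetical usage: a team-wide sum ('member', 'N', and the view 'v' are
 * placeholders). Every member of the team receives the same 'team_sum'
 * through the final team_reduce:
 *
 * double team_sum = 0 ;
 * Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , N ) ,
 * [&]( const int i , double & val ) { val += v(i); } , team_sum );
 */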
#if defined( KOKKOS_HAVE_CXX11 )
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
}
#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */
} // namespace Kokkos
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = result;
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
* (i.e. team_size==1) the operator is only called once with final==true. scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,scan_val,true);
}
}
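/* Hypothetical usage: an exclusive prefix sum from 'in' into 'out' (both
 * placeholder views). The contribution of entry 'i' is added after any use
 * of 'val', and is added whether or not 'final' is true:
 *
 * Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member , N ) ,
 * [&]( const int i , int & val , const bool final ) {
 * if ( final ) { out(i) = val ; } // val == sum of in(0..i-1)
 * val += in(i);
 * } );
 */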
} // namespace Kokkos
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
if(single_struct.team_member.team_rank()==0) lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
if(single_struct.team_member.team_rank()==0) {
lambda(val);
}
single_struct.team_member.team_broadcast(val,0);
}
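/* Hypothetical sketch: compute once per team and broadcast the result
 * ('member' and compute_token() are placeholders):
 *
 * int token = 0 ;
 * Kokkos::single( Kokkos::PerTeam( member ) ,
 * [&]( int & val ) { val = compute_token(); } , token );
 * // Afterwards every member of the team holds the same 'token'.
 */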
} // namespace Kokkos
#endif /* #ifndef KOKKOS_QTHREAD_PARALLEL_HPP */


@ -0,0 +1,451 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_QTHREAD )
#include <stdio.h>
#include <stdlib.h>
#include <stdexcept>
#include <iostream>
#include <sstream>
#include <string>
#include <Kokkos_Atomic.hpp>
#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
typedef TaskMember< Kokkos::Qthread , void , void > Task ;
namespace {
inline
unsigned padded_sizeof_derived( unsigned sizeof_derived )
{
return sizeof_derived +
( sizeof_derived % sizeof(Task*) ? sizeof(Task*) - sizeof_derived % sizeof(Task*) : 0 );
}
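// For example, with sizeof(Task*) == 8 on a typical 64-bit build:
// padded_sizeof_derived(20) == 24 and padded_sizeof_derived(24) == 24,
// so the trailing TaskMember*[] dependence array is always pointer-aligned.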
// int lock_alloc_dealloc = 0 ;
} // namespace
void Task::deallocate( void * ptr )
{
// Counting on 'free' thread safety so lock/unlock not required.
// However, isolate calls here to mitigate future need to introduce lock/unlock.
// lock
// while ( ! Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 0 , 1 ) );
free( ptr );
// unlock
// Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 1 , 0 );
}
void * Task::allocate( const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity )
{
// Counting on 'malloc' thread safety so lock/unlock not required.
// However, isolate calls here to mitigate future need to introduce lock/unlock.
// lock
// while ( ! Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 0 , 1 ) );
void * const ptr = malloc( padded_sizeof_derived( arg_sizeof_derived ) + arg_dependence_capacity * sizeof(Task*) );
// unlock
// Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 1 , 0 );
return ptr ;
}
Task::~TaskMember()
{
}
Task::TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: m_dealloc( arg_dealloc )
, m_verify( arg_verify )
, m_apply_single( arg_apply_single )
, m_apply_team( arg_apply_team )
, m_active_count( & arg_active_count )
, m_qfeb(0)
, m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
, m_dep_capacity( arg_dependence_capacity )
, m_dep_size( 0 )
, m_ref_count( 0 )
, m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING )
{
qthread_empty( & m_qfeb ); // Set to full when complete
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
}
Task::TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: m_dealloc( arg_dealloc )
, m_verify( & Task::verify_type<void> )
, m_apply_single( arg_apply_single )
, m_apply_team( arg_apply_team )
, m_active_count( & arg_active_count )
, m_qfeb(0)
, m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
, m_dep_capacity( arg_dependence_capacity )
, m_dep_size( 0 )
, m_ref_count( 0 )
, m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING )
{
qthread_empty( & m_qfeb ); // Set to full when complete
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
}
//----------------------------------------------------------------------------
void Task::throw_error_add_dependence() const
{
std::cerr << "TaskMember< Qthread >::add_dependence ERROR"
<< " state(" << m_state << ")"
<< " dep_size(" << m_dep_size << ")"
<< std::endl ;
throw std::runtime_error("TaskMember< Qthread >::add_dependence ERROR");
}
void Task::throw_error_verify_type()
{
throw std::runtime_error("TaskMember< Qthread >::verify_type ERROR");
}
//----------------------------------------------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
{
static const char msg_error_header[] = "Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR" ;
static const char msg_error_count[] = ": negative reference count" ;
static const char msg_error_complete[] = ": destroy task that is not complete" ;
static const char msg_error_dependences[] = ": destroy task that has dependences" ;
static const char msg_error_exception[] = ": caught internal exception" ;
if ( rhs ) { Kokkos::atomic_fetch_add( & (*rhs).m_ref_count , 1 ); }
Task * const lhs_val = Kokkos::atomic_exchange( lhs , rhs );
if ( lhs_val ) {
const int count = Kokkos::atomic_fetch_add( & (*lhs_val).m_ref_count , -1 );
const char * msg_error = 0 ;
try {
if ( 1 == count ) {
// Reference count at zero, delete it
// Should only be deallocating a completed task
if ( (*lhs_val).m_state == Kokkos::Experimental::TASK_STATE_COMPLETE ) {
// A completed task should not have dependences...
for ( int i = 0 ; i < (*lhs_val).m_dep_size && 0 == msg_error ; ++i ) {
if ( (*lhs_val).m_dep[i] ) msg_error = msg_error_dependences ;
}
}
else {
msg_error = msg_error_complete ;
}
if ( 0 == msg_error ) {
// Get deletion function and apply it
const Task::function_dealloc_type d = (*lhs_val).m_dealloc ;
(*d)( lhs_val );
}
}
else if ( count <= 0 ) {
msg_error = msg_error_count ;
}
}
catch( ... ) {
if ( 0 == msg_error ) msg_error = msg_error_exception ;
}
if ( 0 != msg_error ) {
if ( no_throw ) {
std::cerr << msg_error_header << msg_error << std::endl ;
std::cerr.flush();
}
else {
std::string msg(msg_error_header);
msg.append(msg_error);
throw std::runtime_error( msg );
}
}
}
}
#endif
//----------------------------------------------------------------------------
aligned_t Task::qthread_func( void * arg )
{
Task * const task = reinterpret_cast< Task * >(arg);
// First member of the team change state to executing.
// Use compare-exchange to avoid race condition with a respawn.
Kokkos::atomic_compare_exchange_strong( & task->m_state
, int(Kokkos::Experimental::TASK_STATE_WAITING)
, int(Kokkos::Experimental::TASK_STATE_EXECUTING)
);
// It is a single thread's responsibility to close out
// this task's execution.
bool close_out = false ;
if ( task->m_apply_team ) {
Kokkos::Impl::QthreadTeamPolicyMember member ;
(*task->m_apply_team)( task , member );
fprintf( stdout
, "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n"
, qthread_shep()
, qthread_worker_local(NULL)
, reinterpret_cast<unsigned long>(task)
, member.team_rank()
, member.team_size()
);
fflush(stdout);
member.team_barrier();
close_out = member.team_rank() == 0 ;
}
else {
(*task->m_apply_single)( task );
close_out = true ;
}
if ( close_out ) {
// When dependent tasks run there would be a race
// condition between destroying this task and
// querying the active count pointer from this task.
int volatile * active_count = task->m_active_count ;
if ( task->m_state == ( Kokkos::Experimental::TASK_STATE_WAITING | Kokkos::Experimental::TASK_STATE_EXECUTING ) ) {
#if 0
fprintf( stdout
, "worker(%d.%d) task 0x%.12lx respawn\n"
, qthread_shep()
, qthread_worker_local(NULL)
, reinterpret_cast<unsigned long>(task)
);
fflush(stdout);
#endif
// Task respawned, set state to waiting and reschedule the task
task->m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
task->schedule();
}
else {
// Task did not respawn, is complete
task->m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
// Release dependences before allowing dependent tasks to run.
// Otherwise there is a thread race condition for removing dependences.
for ( int i = 0 ; i < task->m_dep_size ; ++i ) {
assign( & task->m_dep[i] , 0 );
}
// Set qthread FEB to full so that dependent tasks are allowed to execute.
// This 'task' may be deleted immediately following this function call.
qthread_fill( & task->m_qfeb );
}
// Decrement active task count before returning.
Kokkos::atomic_decrement( active_count );
}
#if 0
fprintf( stdout
, "worker(%d.%d) task 0x%.12lx return\n"
, qthread_shep()
, qthread_worker_local(NULL)
, reinterpret_cast<unsigned long>(task)
);
fflush(stdout);
#endif
return 0 ;
}
void Task::respawn()
{
// Change state from pure executing to ( waiting | executing )
// to avoid confusion with simply waiting.
Kokkos::atomic_compare_exchange_strong( & m_state
, int(Kokkos::Experimental::TASK_STATE_EXECUTING)
, int(Kokkos::Experimental::TASK_STATE_WAITING |
Kokkos::Experimental::TASK_STATE_EXECUTING)
);
}
void Task::schedule()
{
// Is waiting for execution
// Increment active task count before spawning.
Kokkos::atomic_increment( m_active_count );
// Spawn through qthread. Must malloc the precondition array and hand it to
// qthread, which will eventually free the allocation, so no memory is leaked.
// (Open question: does the malloc call need to be guarded for thread safety?)
aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );
qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
for ( int i = 0 ; i < m_dep_size ; ++i ) {
qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
}
if ( m_apply_single ) {
qthread_spawn( & Task::qthread_func /* function */
, this /* function argument */
, 0
, NULL
, m_dep_size , qprecon /* dependences */
, NO_SHEPHERD
, QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
);
}
else {
// If more than one shepherd exists, spawn on a shepherd other than this one
const int num_shepherd = qthread_num_shepherds();
const int num_worker_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
const int this_shepherd = qthread_shep();
int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;
fprintf( stdout
, "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n"
, qthread_shep()
, qthread_worker_local(NULL)
, reinterpret_cast<unsigned long>(this)
, spawn_shepherd
, num_worker_per_shepherd - 1
);
fflush(stdout);
qthread_spawn_cloneable
( & Task::qthread_func
, this
, 0
, NULL
, m_dep_size , qprecon /* dependences */
, spawn_shepherd
// , unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
, unsigned( QTHREAD_SPAWN_LOCAL_PRIORITY )
, num_worker_per_shepherd - 1
);
}
}
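/* Layout of the 'qprecon' array built above, e.g. with m_dep_size == 2 :
 * qprecon[0] == (aligned_t *) 2 -- the dependence count encoded as a pointer
 * qprecon[1] == & m_dep[0]->m_qfeb -- FEB filled when dependence 0 completes
 * qprecon[2] == & m_dep[1]->m_qfeb -- FEB filled when dependence 1 completes
 */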
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
namespace Kokkos {
namespace Experimental {
TaskPolicy< Kokkos::Qthread >::member_type &
TaskPolicy< Kokkos::Qthread >::member_single()
{
static member_type s ;
return s ;
}
void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
{
volatile int * const active_task_count = & policy.m_active_count ;
while ( *active_task_count ) qthread_yield();
}
} // namespace Experimental
} // namespace Kokkos
#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */


@ -0,0 +1,646 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#ifndef KOKKOS_QTHREAD_TASKPOLICY_HPP
#define KOKKOS_QTHREAD_TASKPOLICY_HPP
#include <string>
#include <typeinfo>
#include <stdexcept>
//----------------------------------------------------------------------------
// Defines to enable experimental Qthread functionality
#define QTHREAD_LOCAL_PRIORITY
#define CLONED_TASKS
#include <qthread.h>
#undef QTHREAD_LOCAL_PRIORITY
#undef CLONED_TASKS
//----------------------------------------------------------------------------
#include <Kokkos_Qthread.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_View.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<>
class TaskMember< Kokkos::Qthread , void , void >
{
public:
typedef void (* function_apply_single_type) ( TaskMember * );
typedef void (* function_apply_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
typedef void (* function_dealloc_type)( TaskMember * );
typedef TaskMember * (* function_verify_type) ( TaskMember * );
private:
const function_dealloc_type m_dealloc ; ///< Deallocation
const function_verify_type m_verify ; ///< Result type verification
const function_apply_single_type m_apply_single ; ///< Apply function
const function_apply_team_type m_apply_team ; ///< Apply function
int volatile * const m_active_count ; ///< Count of active tasks on this policy
aligned_t m_qfeb ; ///< Qthread full/empty bit
TaskMember ** const m_dep ; ///< Dependences
const int m_dep_capacity ; ///< Capacity of dependences
int m_dep_size ; ///< Actual count of dependences
int m_ref_count ; ///< Reference count
int m_state ; ///< State of the task
TaskMember() /* = delete */ ;
TaskMember( const TaskMember & ) /* = delete */ ;
TaskMember & operator = ( const TaskMember & ) /* = delete */ ;
static aligned_t qthread_func( void * arg );
static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity );
static void deallocate( void * );
void throw_error_add_dependence() const ;
static void throw_error_verify_type();
template < class DerivedTaskType >
static
void deallocate( TaskMember * t )
{
DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t);
ptr->~DerivedTaskType();
deallocate( (void *) ptr );
}
void schedule();
protected :
~TaskMember();
// Used by TaskMember< Qthread , ResultType , void >
TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
);
// Used for TaskMember< Qthread , void , void >
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
);
public:
template< typename ResultType >
KOKKOS_FUNCTION static
TaskMember * verify_type( TaskMember * t )
{
enum { check_type = ! Kokkos::Impl::is_same< ResultType , void >::value };
if ( check_type && t != 0 ) {
// Verify that t->m_verify is this function
const function_verify_type self = & TaskMember::template verify_type< ResultType > ;
if ( t->m_verify != self ) {
t = 0 ;
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
throw_error_verify_type();
#endif
}
}
return t ;
}
//----------------------------------------
/* Inheritance Requirements on task types:
* typedef FunctorType::value_type value_type ;
* class DerivedTaskType
* : public TaskMember< Qthread , value_type , FunctorType >
* { ... };
* class TaskMember< Qthread , value_type , FunctorType >
* : public TaskMember< Qthread , value_type , void >
* , public Functor
* { ... };
* If value_type != void
* class TaskMember< Qthread , value_type , void >
* : public TaskMember< Qthread , void , void >
*
* Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
*
*/
/** \brief Allocate and construct a single-thread task */
template< class DerivedTaskType >
static
TaskMember * create_single( const typename DerivedTaskType::functor_type & arg_functor
, volatile int & arg_active_count
, const unsigned arg_dependence_capacity )
{
typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename functor_type::value_type value_type ;
DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
, & TaskMember::template apply_single< functor_type , value_type >
, 0
, arg_active_count
, sizeof(DerivedTaskType)
, arg_dependence_capacity
, arg_functor );
return static_cast< TaskMember * >( task );
}
/** \brief Allocate and construct a team-thread task */
template< class DerivedTaskType >
static
TaskMember * create_team( const typename DerivedTaskType::functor_type & arg_functor
, volatile int & arg_active_count
, const unsigned arg_dependence_capacity )
{
typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename functor_type::value_type value_type ;
DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
, 0
, & TaskMember::template apply_team< functor_type , value_type >
, arg_active_count
, sizeof(DerivedTaskType)
, arg_dependence_capacity
, arg_functor );
return static_cast< TaskMember * >( task );
}
void respawn();
void spawn()
{
m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
schedule();
}
//----------------------------------------
typedef FutureValueTypeIsVoidError get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const { return get_result_type() ; }
KOKKOS_INLINE_FUNCTION
Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }
//----------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
static
void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false );
#else
KOKKOS_INLINE_FUNCTION static
void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {}
#endif
KOKKOS_INLINE_FUNCTION
TaskMember * get_dependence( int i ) const
{ return ( Kokkos::Experimental::TASK_STATE_EXECUTING == m_state && 0 <= i && i < m_dep_size ) ? m_dep[i] : (TaskMember*) 0 ; }
KOKKOS_INLINE_FUNCTION
int get_dependence() const
{ return m_dep_size ; }
KOKKOS_INLINE_FUNCTION
void clear_dependence()
{
for ( int i = 0 ; i < m_dep_size ; ++i ) assign( m_dep + i , 0 );
m_dep_size = 0 ;
}
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskMember * before )
{
if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == m_state ||
Kokkos::Experimental::TASK_STATE_EXECUTING == m_state ) &&
m_dep_size < m_dep_capacity ) {
assign( m_dep + m_dep_size , before );
++m_dep_size ;
}
else {
throw_error_add_dependence();
}
}
//----------------------------------------
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
// TaskMember< Kokkos::Qthread , ResultType , FunctorType >
// : public TaskMember< Kokkos::Qthread , ResultType , void >
// , public FunctorType
// { ... };
derived_type & m = * static_cast< derived_type * >( t );
Kokkos::Impl::FunctorApply< FunctorType , void , ResultType & >::apply( (FunctorType &) m , & m.m_result );
}
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
// TaskMember< Kokkos::Qthread , ResultType , FunctorType >
// : public TaskMember< Kokkos::Qthread , ResultType , void >
// , public FunctorType
// { ... };
derived_type & m = * static_cast< derived_type * >( t );
Kokkos::Impl::FunctorApply< FunctorType , void , void >::apply( (FunctorType &) m );
}
//----------------------------------------
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_team( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t
, Kokkos::Impl::QthreadTeamPolicyMember & member )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
derived_type & m = * static_cast< derived_type * >( t );
m.FunctorType::apply( member , m.m_result );
}
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_team( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t
, Kokkos::Impl::QthreadTeamPolicyMember & member )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
derived_type & m = * static_cast< derived_type * >( t );
m.FunctorType::apply( member );
}
};
//----------------------------------------------------------------------------
/** \brief Base class for tasks with a result value in the Qthread execution space.
*
* The FunctorType must be void because this class is accessed by the
* Future class for the task and result value.
*
* Must be derived from TaskMember<S,void,void> 'root class' so the Future class
* can correctly static_cast from the 'root class' to this class.
*/
template < class ResultType >
class TaskMember< Kokkos::Qthread , ResultType , void >
: public TaskMember< Kokkos::Qthread , void , void >
{
public:
ResultType m_result ;
typedef const ResultType & get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const { return m_result ; }
protected:
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_single_type function_apply_single_type ;
typedef task_root_type::function_apply_team_type function_apply_team_type ;
inline
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: task_root_type( & task_root_type::template verify_type< ResultType >
, arg_dealloc
, arg_apply_single
, arg_apply_team
, arg_active_count
, arg_sizeof_derived
, arg_dependence_capacity )
, m_result()
{}
};
template< class ResultType , class FunctorType >
class TaskMember< Kokkos::Qthread , ResultType , FunctorType >
: public TaskMember< Kokkos::Qthread , ResultType , void >
, public FunctorType
{
public:
typedef FunctorType functor_type ;
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_single_type function_apply_single_type ;
typedef task_root_type::function_apply_team_type function_apply_team_type ;
inline
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
, const functor_type & arg_functor
)
: task_base_type( arg_dealloc
, arg_apply_single
, arg_apply_team
, arg_active_count
, arg_sizeof_derived
, arg_dependence_capacity )
, functor_type( arg_functor )
{}
};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
void wait( TaskPolicy< Kokkos::Qthread > & );
template<>
class TaskPolicy< Kokkos::Qthread >
{
public:
typedef Kokkos::Qthread execution_space ;
typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ;
private:
typedef Impl::TaskMember< execution_space , void , void > task_root_type ;
TaskPolicy & operator = ( const TaskPolicy & ) /* = delete */ ;
template< class FunctorType >
static inline
const task_root_type * get_task_root( const FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
}
template< class FunctorType >
static inline
task_root_type * get_task_root( FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< task_root_type * >( static_cast< task_type * >(f) );
}
const unsigned m_default_dependence_capacity ;
volatile int m_active_count_root ;
volatile int & m_active_count ;
public:
KOKKOS_INLINE_FUNCTION
TaskPolicy()
: m_default_dependence_capacity(4)
, m_active_count_root(0)
, m_active_count( m_active_count_root )
{}
KOKKOS_INLINE_FUNCTION
explicit
TaskPolicy( const unsigned arg_default_dependence_capacity )
: m_default_dependence_capacity( arg_default_dependence_capacity )
, m_active_count_root(0)
, m_active_count( m_active_count_root )
{}
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy & rhs )
: m_default_dependence_capacity( rhs.m_default_dependence_capacity )
, m_active_count_root(0)
, m_active_count( rhs.m_active_count )
{}
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy & rhs
, const unsigned arg_default_dependence_capacity )
: m_default_dependence_capacity( arg_default_dependence_capacity )
, m_active_count_root(0)
, m_active_count( rhs.m_active_count )
{}
//----------------------------------------
template< class ValueType >
const Future< ValueType , execution_space > &
spawn( const Future< ValueType , execution_space > & f ) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
f.m_task->spawn();
#endif
return f ;
}
// Create single-thread task
template< class FunctorType >
Future< typename FunctorType::value_type , execution_space >
create( const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create_single< task_type >
( functor
, m_active_count
, ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
)
#endif
);
}
// Create thread-team task
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create_team( const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create_team< task_type >
( functor
, m_active_count
, ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
)
#endif
);
}
// Add dependence
template< class A1 , class A2 , class A3 , class A4 >
void add_dependence( const Future<A1,A2> & after
, const Future<A3,A4> & before
, typename Kokkos::Impl::enable_if
< Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
&&
Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
after.m_task->add_dependence( before.m_task );
#endif
}
//----------------------------------------
// Functions for an executing task functor to query dependences,
// set new dependences, and respawn itself.
template< class FunctorType >
Future< void , execution_space >
get_dependence( const FunctorType * task_functor , int i ) const
{
return Future<void,execution_space>(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
get_task_root(task_functor)->get_dependence(i)
#endif
);
}
template< class FunctorType >
int get_dependence( const FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return get_task_root(task_functor)->get_dependence(); }
#else
{ return 0 ; }
#endif
template< class FunctorType >
void clear_dependence( FunctorType * task_functor ) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
get_task_root(task_functor)->clear_dependence();
#endif
}
template< class FunctorType , class A3 , class A4 >
void add_dependence( FunctorType * task_functor
, const Future<A3,A4> & before
, typename Kokkos::Impl::enable_if
< Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
get_task_root(task_functor)->add_dependence( before.m_task );
#endif
}
template< class FunctorType >
void respawn( FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->respawn(); }
#else
{}
#endif
static member_type & member_single();
friend void wait( TaskPolicy< Kokkos::Qthread > & );
};
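// Usage sketch ( hedged ; 'MyFunctor' is a hypothetical functor with the
// required 'value_type' typedef , not part of this header ) :
//
//   TaskPolicy< Kokkos::Qthread > policy ;
//   Future< MyFunctor::value_type , Kokkos::Qthread > f =
//     policy.spawn( policy.create( MyFunctor() ) );
//   Kokkos::Experimental::wait( policy ); // block until all tasks complete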
} /* namespace Experimental */
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_QTHREAD_TASK_HPP */

@@ -0,0 +1,28 @@
# This Qthreads back-end uses an experimental branch of the Qthreads repository with special #define options.
# Cloning repository and branch:
git clone https://github.com/stelleg/qthreads qthreads-with-clone
cd qthreads-with-clone
# Added to .git/config
#
# [branch "cloned_tasks"]
# remote = origin
# merge = refs/heads/cloned_tasks
#
git branch cloned_tasks
git checkout cloned_tasks
git pull
sh autogen.sh
# Configure with 'hwloc' installation:
./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR}
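# Build and install (standard autotools flow; assumes configure succeeded):
make
make install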

@@ -0,0 +1,758 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD )
#include <stdint.h>
#include <limits>
#include <utility>
#include <iostream>
#include <sstream>
#include <Kokkos_Threads.hpp>
#include <Kokkos_hwloc.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Error.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
ThreadsExec s_threads_process ;
ThreadsExec * s_threads_exec[ ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
pthread_t s_threads_pid[ ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
std::pair<unsigned,unsigned> s_threads_coord[ ThreadsExec::MAX_THREAD_COUNT ];
int s_thread_pool_size[3] = { 0 , 0 , 0 };
unsigned s_current_reduce_size = 0 ;
unsigned s_current_shared_size = 0 ;
void (* volatile s_current_function)( ThreadsExec & , const void * );
const void * volatile s_current_function_arg = 0 ;
struct Sentinel {
Sentinel()
{
HostSpace::register_in_parallel( ThreadsExec::in_parallel );
}
~Sentinel()
{
if ( s_thread_pool_size[0] ||
s_thread_pool_size[1] ||
s_thread_pool_size[2] ||
s_current_reduce_size ||
s_current_shared_size ||
s_current_function ||
s_current_function_arg ||
s_threads_exec[0] ) {
std::cerr << "ERROR : Process exiting without calling Kokkos::Threads::terminate()" << std::endl ;
}
}
};
inline
unsigned fan_size( const unsigned rank , const unsigned size )
{
const unsigned rank_rev = size - ( rank + 1 );
unsigned count = 0 ;
for ( unsigned n = 1 ; ( rank_rev + n < size ) && ! ( rank_rev & n ) ; n <<= 1 ) { ++count ; }
return count ;
}
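// Worked example (illustrative): for a pool of size 8 the root thread
// (rank 7, rank_rev 0) has fan_size 3 == log2(8); rank 5 (rank_rev 2) has
// fan_size 1; any thread with odd rank_rev has fan_size 0.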
} // namespace
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void execute_function_noop( ThreadsExec & , const void * ) {}
void ThreadsExec::driver(void)
{
ThreadsExec this_thread ;
while ( ThreadsExec::Active == this_thread.m_pool_state ) {
(*s_current_function)( this_thread , s_current_function_arg );
// Deactivate thread and wait for reactivation
this_thread.m_pool_state = ThreadsExec::Inactive ;
wait_yield( this_thread.m_pool_state , ThreadsExec::Inactive );
}
}
ThreadsExec::ThreadsExec()
: m_pool_base(0)
, m_scratch()
, m_scratch_reduce_end(0)
, m_scratch_thread_end(0)
, m_numa_rank(0)
, m_numa_core_rank(0)
, m_pool_rank(0)
, m_pool_size(0)
, m_pool_fan_size(0)
, m_pool_state( ThreadsExec::Terminating )
{
if ( & s_threads_process != this ) {
// A spawned thread
ThreadsExec * const nil = 0 ;
// Which entry in 's_threads_exec', possibly determined from hwloc binding
const int entry = ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0])
? ((size_t)s_current_function_arg)
: size_t(Kokkos::hwloc::bind_this_thread( s_thread_pool_size[0] , s_threads_coord ));
// Given a good entry set this thread in the 's_threads_exec' array
if ( entry < s_thread_pool_size[0] &&
nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) {
const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();
m_numa_rank = coord.first ;
m_numa_core_rank = coord.second ;
m_pool_base = s_threads_exec ;
m_pool_rank = s_thread_pool_size[0] - ( entry + 1 );
m_pool_size = s_thread_pool_size[0] ;
m_pool_fan_size = fan_size( m_pool_rank , m_pool_size );
m_pool_state = ThreadsExec::Active ;
s_threads_pid[ m_pool_rank ] = pthread_self();
// Inform spawning process that the threads_exec entry has been set.
s_threads_process.m_pool_state = ThreadsExec::Active ;
}
else {
// Inform spawning process that the threads_exec entry could not be set.
s_threads_process.m_pool_state = ThreadsExec::Terminating ;
}
}
else {
// Enables 'parallel_for' to execute on an uninitialized Threads device
m_pool_rank = 0 ;
m_pool_size = 1 ;
m_pool_state = ThreadsExec::Inactive ;
s_threads_pid[ m_pool_rank ] = pthread_self();
}
}
ThreadsExec::~ThreadsExec()
{
const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
m_pool_base = 0 ;
m_scratch.clear();
m_scratch_reduce_end = 0 ;
m_scratch_thread_end = 0 ;
m_numa_rank = 0 ;
m_numa_core_rank = 0 ;
m_pool_rank = 0 ;
m_pool_size = 0 ;
m_pool_fan_size = 0 ;
m_pool_state = ThreadsExec::Terminating ;
if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) {
ThreadsExec * const nil = 0 ;
atomic_compare_exchange( s_threads_exec + entry , this , nil );
s_threads_process.m_pool_state = ThreadsExec::Terminating ;
}
}
int ThreadsExec::get_thread_count()
{
return s_thread_pool_size[0] ;
}
ThreadsExec * ThreadsExec::get_thread( const int init_thread_rank )
{
ThreadsExec * const th =
init_thread_rank < s_thread_pool_size[0]
? s_threads_exec[ s_thread_pool_size[0] - ( init_thread_rank + 1 ) ] : 0 ;
if ( 0 == th || th->m_pool_rank != init_thread_rank ) {
std::ostringstream msg ;
msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : "
<< "thread " << init_thread_rank << " of " << s_thread_pool_size[0] ;
if ( 0 == th ) {
msg << " does not exist" ;
}
else {
msg << " has wrong thread_rank " << th->m_pool_rank ;
}
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
return th ;
}
//----------------------------------------------------------------------------
void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
{
ThreadsExec::global_lock();
ThreadsExec::global_unlock();
const int n = exec.m_pool_fan_size ;
const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
for ( int i = 0 ; i < n ; ++i ) {
Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
exec.m_pool_state = ThreadsExec::Inactive ;
}
}
}
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void ThreadsExec::verify_is_process( const std::string & name , const bool initialized )
{
if ( ! is_process() ) {
std::string msg( name );
msg.append( " FAILED : Called by a worker thread, can only be called by the master process." );
Kokkos::Impl::throw_runtime_exception( msg );
}
if ( initialized && 0 == s_thread_pool_size[0] ) {
std::string msg( name );
msg.append( " FAILED : Threads not initialized." );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
int ThreadsExec::in_parallel()
{
// A thread function is in execution and
// the function argument is not the special threads process argument and
// the master process is a worker, or the caller is not the master process.
return s_current_function &&
( & s_threads_process != s_current_function_arg ) &&
( s_threads_process.m_pool_base || ! is_process() );
}
// Wait for root thread to become inactive
void ThreadsExec::fence()
{
if ( s_thread_pool_size[0] ) {
// Wait for the root thread to complete:
Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
}
s_current_function = 0 ;
s_current_function_arg = 0 ;
}
/** \brief Begin execution of the asynchronous functor */
void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const void * arg )
{
verify_is_process("ThreadsExec::start" , true );
if ( s_current_function || s_current_function_arg ) {
Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) );
}
s_current_function = func ;
s_current_function_arg = arg ;
// Activate threads:
for ( int i = s_thread_pool_size[0] ; 0 < i-- ; ) {
s_threads_exec[i]->m_pool_state = ThreadsExec::Active ;
}
if ( s_threads_process.m_pool_size ) {
// Master process is the root thread, run it:
(*func)( s_threads_process , arg );
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
}
}
//----------------------------------------------------------------------------
bool ThreadsExec::sleep()
{
verify_is_process("ThreadsExec::sleep", true );
if ( & execute_sleep == s_current_function ) return false ;
fence();
ThreadsExec::global_lock();
s_current_function = & execute_sleep ;
// Activate threads:
for ( unsigned i = s_thread_pool_size[0] ; 0 < i ; ) {
s_threads_exec[--i]->m_pool_state = ThreadsExec::Active ;
}
return true ;
}
bool ThreadsExec::wake()
{
verify_is_process("ThreadsExec::wake", true );
if ( & execute_sleep != s_current_function ) return false ;
ThreadsExec::global_unlock();
if ( s_threads_process.m_pool_base ) {
execute_sleep( s_threads_process , 0 );
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
}
fence();
return true ;
}
//----------------------------------------------------------------------------
void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
{
s_current_function = func ;
s_current_function_arg = & s_threads_process ;
const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
ThreadsExec & th = * s_threads_exec[ --i ];
th.m_pool_state = ThreadsExec::Active ;
wait_yield( th.m_pool_state , ThreadsExec::Active );
}
if ( s_threads_process.m_pool_base ) {
s_threads_process.m_pool_state = ThreadsExec::Active ;
(*func)( s_threads_process , 0 );
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
}
s_current_function_arg = 0 ;
s_current_function = 0 ;
}
//----------------------------------------------------------------------------
void * ThreadsExec::root_reduce_scratch()
{
return s_threads_process.reduce_memory();
}
void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
{
exec.m_scratch.clear();
exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
if ( s_threads_process.m_scratch_thread_end ) {
exec.m_scratch =
HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end );
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() );
unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
// touch on this thread
while ( ptr < end ) *ptr++ = 0 ;
}
}
void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
{
enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
fence();
const size_t old_reduce_size = s_threads_process.m_scratch_reduce_end ;
const size_t old_thread_size = s_threads_process.m_scratch_thread_end - s_threads_process.m_scratch_reduce_end ;
reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
// Increase size or deallocate completely.
if ( ( old_reduce_size < reduce_size ) ||
( old_thread_size < thread_size ) ||
( ( reduce_size == 0 && thread_size == 0 ) &&
( old_reduce_size != 0 || old_thread_size != 0 ) ) ) {
verify_is_process( "ThreadsExec::resize_scratch" , true );
s_threads_process.m_scratch_reduce_end = reduce_size ;
s_threads_process.m_scratch_thread_end = reduce_size + thread_size ;
execute_serial( & execute_resize_scratch );
s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
}
return s_threads_process.m_scratch.alloc_ptr() ;
}
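// Usage sketch (illustrative): request 256 bytes of reduction scratch and
// 1024 bytes of per-thread scratch; both sizes are rounded up to
// Kokkos::Impl::MEMORY_ALIGNMENT before any reallocation:
//
//   void * scratch = ThreadsExec::resize_scratch( 256 , 1024 );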
//----------------------------------------------------------------------------
void ThreadsExec::print_configuration( std::ostream & s , const bool detail )
{
verify_is_process("ThreadsExec::print_configuration",false);
fence();
const unsigned numa_count = Kokkos::hwloc::get_available_numa_count();
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
// Forestall compiler warnings for unused variables.
(void) numa_count;
(void) cores_per_numa;
(void) threads_per_core;
s << "Kokkos::Threads" ;
#if defined( KOKKOS_HAVE_PTHREAD )
s << " KOKKOS_HAVE_PTHREAD" ;
#endif
#if defined( KOKKOS_HAVE_HWLOC )
s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ;
#endif
if ( s_thread_pool_size[0] ) {
s << " threads[" << s_thread_pool_size[0] << "]"
<< " threads_per_numa[" << s_thread_pool_size[1] << "]"
<< " threads_per_core[" << s_thread_pool_size[2] << "]"
;
if ( 0 == s_threads_process.m_pool_base ) { s << " Asynchronous" ; }
s << " ReduceScratch[" << s_current_reduce_size << "]"
<< " SharedScratch[" << s_current_shared_size << "]" ;
s << std::endl ;
if ( detail ) {
for ( int i = 0 ; i < s_thread_pool_size[0] ; ++i ) {
ThreadsExec * const th = s_threads_exec[i] ;
if ( th ) {
const int rank_rev = th->m_pool_size - ( th->m_pool_rank + 1 );
s << " Thread[ " << th->m_pool_rank << " : "
<< th->m_numa_rank << "." << th->m_numa_core_rank << " ]" ;
s << " Fan{" ;
for ( int j = 0 ; j < th->m_pool_fan_size ; ++j ) {
ThreadsExec * const thfan = th->m_pool_base[rank_rev+(1<<j)] ;
s << " [ " << thfan->m_pool_rank << " : "
<< thfan->m_numa_rank << "." << thfan->m_numa_core_rank << " ]" ;
}
s << " }" ;
if ( th == & s_threads_process ) {
s << " is_process" ;
}
}
s << std::endl ;
}
}
}
else {
s << " not initialized" << std::endl ;
}
}
//----------------------------------------------------------------------------
int ThreadsExec::is_initialized()
{ return 0 != s_threads_exec[0] ; }
void ThreadsExec::initialize( unsigned thread_count ,
unsigned use_numa_count ,
unsigned use_cores_per_numa ,
bool allow_asynchronous_threadpool )
{
static const Sentinel sentinel ;
const bool is_initialized = 0 != s_thread_pool_size[0] ;
unsigned thread_spawn_failed = 0 ;
for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++)
s_threads_exec[i] = NULL;
if ( ! is_initialized ) {
// If thread_count, use_numa_count, or use_cores_per_numa are zero
// then they will be given default values based upon hwloc detection
// and allowed asynchronous execution.
const bool hwloc_avail = hwloc::available();
if ( thread_count == 0 ) {
thread_count = hwloc_avail
? Kokkos::hwloc::get_available_numa_count() *
Kokkos::hwloc::get_available_cores_per_numa() *
Kokkos::hwloc::get_available_threads_per_core()
: 1 ;
}
const unsigned thread_spawn_begin =
hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
allow_asynchronous_threadpool ,
thread_count ,
use_numa_count ,
use_cores_per_numa ,
s_threads_coord );
const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;
if ( thread_spawn_begin ) {
// Synchronous with s_threads_coord[0] as the process core
// Claim entry #0 for binding the process core.
s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
}
s_thread_pool_size[0] = thread_count ;
s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;
s_current_function = & execute_function_noop ; // Initialization work function
for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
// If hwloc available then spawned thread will
// choose its own entry in 's_threads_coord'
// otherwise specify the entry.
s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_avail ? ~0u : ith );
// Spawn thread executing the 'driver()' function.
// Wait until spawned thread has attempted to initialize.
// If spawning and initialization are successful then
// an entry in 's_threads_exec' will be assigned.
if ( ThreadsExec::spawn() ) {
wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
}
if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
}
// Wait for all spawned threads to deactivate before zeroing the function.
for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
// Try to protect against cache coherency failure by casting to volatile.
ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
if ( th ) {
wait_yield( th->m_pool_state , ThreadsExec::Active );
}
else {
++thread_spawn_failed ;
}
}
s_current_function = 0 ;
s_current_function_arg = 0 ;
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
if ( ! thread_spawn_failed ) {
// Bind process to the core on which it was located before spawning occurred
Kokkos::hwloc::bind_this_thread( proc_coord );
if ( thread_spawn_begin ) { // Include process in pool.
const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();
s_threads_exec[0] = & s_threads_process ;
s_threads_process.m_numa_rank = coord.first ;
s_threads_process.m_numa_core_rank = coord.second ;
s_threads_process.m_pool_base = s_threads_exec ;
s_threads_process.m_pool_rank = thread_count - 1 ; // Reversed for scan-compatible reductions
s_threads_process.m_pool_size = thread_count ;
s_threads_process.m_pool_fan_size = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
}
else {
s_threads_process.m_pool_base = 0 ;
s_threads_process.m_pool_rank = 0 ;
s_threads_process.m_pool_size = 0 ;
s_threads_process.m_pool_fan_size = 0 ;
}
// Initial allocations:
ThreadsExec::resize_scratch( 1024 , 1024 );
}
else {
s_thread_pool_size[0] = 0 ;
s_thread_pool_size[1] = 0 ;
s_thread_pool_size[2] = 0 ;
}
}
if ( is_initialized || thread_spawn_failed ) {
std::ostringstream msg ;
msg << "Kokkos::Threads::initialize ERROR" ;
if ( is_initialized ) {
msg << " : already initialized" ;
}
if ( thread_spawn_failed ) {
msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
}
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
// Initialize the lock array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
}
//----------------------------------------------------------------------------
void ThreadsExec::finalize()
{
verify_is_process("ThreadsExec::finalize",false);
fence();
resize_scratch(0,0);
const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
for ( unsigned i = s_thread_pool_size[0] ; begin < i-- ; ) {
if ( s_threads_exec[i] ) {
s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating ;
wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
}
s_threads_pid[i] = 0 ;
}
if ( s_threads_process.m_pool_base ) {
( & s_threads_process )->~ThreadsExec();
s_threads_exec[0] = 0 ;
}
Kokkos::hwloc::unbind_this_thread();
s_thread_pool_size[0] = 0 ;
s_thread_pool_size[1] = 0 ;
s_thread_pool_size[2] = 0 ;
// Reset master thread to run solo.
s_threads_process.m_numa_rank = 0 ;
s_threads_process.m_numa_core_rank = 0 ;
s_threads_process.m_pool_base = 0 ;
s_threads_process.m_pool_rank = 0 ;
s_threads_process.m_pool_size = 1 ;
s_threads_process.m_pool_fan_size = 0 ;
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
}
//----------------------------------------------------------------------------
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
Threads & Threads::instance(int)
{
static Threads t ;
return t ;
}
int Threads::thread_pool_size( int depth )
{
return Impl::s_thread_pool_size[depth];
}
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
int Threads::thread_pool_rank()
{
const pthread_t pid = pthread_self();
int i = 0;
while ( ( i < Impl::s_thread_pool_size[0] ) && ( pid != Impl::s_threads_pid[i] ) ) { ++i ; }
return i ;
}
#endif
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD ) */

@@ -0,0 +1,465 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_THREADSEXEC_HPP
#define KOKKOS_THREADSEXEC_HPP
#include <stdio.h>
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Kokkos_Atomic.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class ThreadsExec {
public:
// Fan array has log_2(NT) reduction threads plus 2 scan threads
// Currently limited to 16k threads.
enum { MAX_FAN_COUNT = 16 };
enum { MAX_THREAD_COUNT = 1 << ( MAX_FAN_COUNT - 2 ) };
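// Worked check: MAX_THREAD_COUNT == 1 << ( 16 - 2 ) == 16384 , the "16k" above.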
enum { VECTOR_LENGTH = 8 };
/** \brief States of a worker thread */
enum { Terminating ///< Termination in progress
, Inactive ///< Exists, waiting for work
, Active ///< Exists, performing work
, Rendezvous ///< Exists, waiting in a barrier or reduce
, ScanCompleted
, ScanAvailable
, ReductionAvailable
};
private:
friend class Kokkos::Threads ;
// Fan-in operations' root is the highest ranking thread
// to place the 'scan' reduction intermediate values on
// the threads that need them.
// For a simple reduction the thread location is arbitrary.
ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
Impl::AllocationTracker m_scratch ;
int m_scratch_reduce_end ;
int m_scratch_thread_end ;
int m_numa_rank ;
int m_numa_core_rank ;
int m_pool_rank ;
int m_pool_size ;
int m_pool_fan_size ;
int volatile m_pool_state ; ///< State for global synchronizations
static void global_lock();
static void global_unlock();
static bool spawn();
static void execute_resize_scratch( ThreadsExec & , const void * );
static void execute_sleep( ThreadsExec & , const void * );
ThreadsExec( const ThreadsExec & );
ThreadsExec & operator = ( const ThreadsExec & );
static void execute_serial( void (*)( ThreadsExec & , const void * ) );
public:
KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size ; }
KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank ; }
KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank ; }
KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank ; }
static int get_thread_count();
static ThreadsExec * get_thread( const int init_thread_rank );
inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); }
KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; }
KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; }
KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; }
static void driver(void);
~ThreadsExec();
ThreadsExec();
static void * resize_scratch( size_t reduce_size , size_t thread_size );
static void * root_reduce_scratch();
static bool is_process();
static void verify_is_process( const std::string & , const bool initialized );
static int is_initialized();
static void initialize( unsigned thread_count ,
unsigned use_numa_count ,
unsigned use_cores_per_numa ,
bool allow_asynchronous_threadpool );
static void finalize();
/* Given a requested team size, return valid team size */
static unsigned team_size_valid( unsigned );
static void print_configuration( std::ostream & , const bool detail = false );
//------------------------------------
static void wait_yield( volatile int & , const int );
//------------------------------------
// All-thread functions:
inline
int all_reduce( const int value )
{
// Make sure there is enough scratch space:
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
*((volatile int*) reduce_memory()) = value ;
memory_fence();
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
int accum = 0 ;
for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
accum += *((volatile int *) get_thread( rank )->reduce_memory());
}
for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
*((volatile int *) get_thread( rank )->reduce_memory()) = accum ;
}
memory_fence();
for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
get_thread( rank )->m_pool_state = ThreadsExec::Active ;
}
}
return *((volatile int*) reduce_memory());
}
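// Usage sketch (illustrative): every pool thread contributes 1, so each
// receives the active pool size back:
//
//   const int nthreads = exec.all_reduce( 1 );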
//------------------------------------
// All-thread functions:
template< class FunctorType , class ArgTag >
inline
void fan_in_reduce( const FunctorType & f ) const
{
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ;
typedef Kokkos::Impl::FunctorFinal< FunctorType , ArgTag > Final ;
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , reduce_memory() , fan.reduce_memory() );
}
if ( ! rev_rank ) {
Final::final( f , reduce_memory() );
}
}
inline
void fan_in() const
{
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
}
}
template< class FunctorType , class ArgTag >
inline
void scan_large( const FunctorType & f )
{
// Sequence of states:
// 0) Active : entry and exit state
// 1) ReductionAvailable : reduction value available
// 2) ScanAvailable : inclusive scan value available
// 3) Rendezvous : All threads inclusive scan value are available
// 4) ScanCompleted : exclusive scan value copied
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > Init ;
typedef typename Traits::value_type scalar_type ;
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
const unsigned count = Traits::value_count( f );
scalar_type * const work_value = (scalar_type *) reduce_memory();
//--------------------------------
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: Active -> ReductionAvailable (or ScanAvailable)
Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , work_value , fan.reduce_memory() );
}
// Copy reduction value to scan value before releasing from this phase.
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i] ; }
if ( rev_rank ) {
// Set: Active -> ReductionAvailable
m_pool_state = ThreadsExec::ReductionAvailable ;
// Wait for contributing threads' scan value to be available.
if ( ( 1 << m_pool_fan_size ) < ( m_pool_rank + 1 ) ) {
ThreadsExec & th = *m_pool_base[ rev_rank + ( 1 << m_pool_fan_size ) ] ;
// Wait: Active -> ReductionAvailable
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable );
Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
}
// This thread has completed inclusive scan
// Set: ReductionAvailable -> ScanAvailable
m_pool_state = ThreadsExec::ScanAvailable ;
// Wait for all threads to complete inclusive scan
// Wait: ScanAvailable -> Rendezvous
Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable );
}
//--------------------------------
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable );
// Set: ScanAvailable -> Rendezvous
fan.m_pool_state = ThreadsExec::Rendezvous ;
}
// All threads have completed the inclusive scan.
// All non-root threads are in the Rendezvous state.
// Threads are free to overwrite their reduction value.
//--------------------------------
if ( ( rev_rank + 1 ) < m_pool_size ) {
// Exclusive scan: copy the previous thread's inclusive scan value
ThreadsExec & th = *m_pool_base[ rev_rank + 1 ] ; // Not the root thread
const scalar_type * const src_value = ((scalar_type *)th.reduce_memory()) + count ;
for ( unsigned j = 0 ; j < count ; ++j ) { work_value[j] = src_value[j]; }
}
else {
(void) Init::init( f , work_value );
}
//--------------------------------
// Wait for all threads to copy previous thread's inclusive scan value
// Wait for all threads: Rendezvous -> ScanCompleted
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
}
if ( rev_rank ) {
// Set: ScanAvailable -> ScanCompleted
m_pool_state = ThreadsExec::ScanCompleted ;
// Wait: ScanCompleted -> Active
Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted );
}
// Set: ScanCompleted -> Active
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
}
}
template< class FunctorType , class ArgTag >
inline
void scan_small( const FunctorType & f )
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > Init ;
typedef typename Traits::value_type scalar_type ;
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
const unsigned count = Traits::value_count( f );
scalar_type * const work_value = (scalar_type *) reduce_memory();
//--------------------------------
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the thread-scan before releasing threads
scalar_type * ptr_prev = 0 ;
for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
scalar_type * const ptr = (scalar_type *) get_thread( rank )->reduce_memory();
if ( rank ) {
for ( unsigned i = 0 ; i < count ; ++i ) { ptr[i] = ptr_prev[ i + count ]; }
Join::join( f , ptr + count , ptr );
}
else {
(void) Init::init( f , ptr );
}
ptr_prev = ptr ;
}
}
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
}
}
//------------------------------------
/** \brief Wait for previous asynchronous functor to
* complete and release the Threads device.
* Acquire the Threads device and start this functor.
*/
static void start( void (*)( ThreadsExec & , const void * ) , const void * );
static int in_parallel();
static void fence();
static bool sleep();
static bool wake();
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
inline int Threads::in_parallel()
{ return Impl::ThreadsExec::in_parallel(); }
inline int Threads::is_initialized()
{ return Impl::ThreadsExec::is_initialized(); }
inline void Threads::initialize(
unsigned threads_count ,
unsigned use_numa_count ,
unsigned use_cores_per_numa ,
bool allow_asynchronous_threadpool )
{
Impl::ThreadsExec::initialize( threads_count , use_numa_count , use_cores_per_numa , allow_asynchronous_threadpool );
}
inline void Threads::finalize()
{
Impl::ThreadsExec::finalize();
}
inline void Threads::print_configuration( std::ostream & s , const bool detail )
{
Impl::ThreadsExec::print_configuration( s , detail );
}
inline bool Threads::sleep()
{ return Impl::ThreadsExec::sleep() ; }
inline bool Threads::wake()
{ return Impl::ThreadsExec::wake() ; }
inline void Threads::fence()
{ Impl::ThreadsExec::fence() ; }
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_THREADSEXEC_HPP */

@@ -0,0 +1,254 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_PTHREAD )
/* Standard 'C' Linux libraries */
#include <pthread.h>
#include <sched.h>
#include <errno.h>
/* Standard C++ libraries */
#include <cstdlib>
#include <string>
#include <iostream>
#include <stdexcept>
#include <Kokkos_Threads.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
pthread_mutex_t host_internal_pthread_mutex = PTHREAD_MUTEX_INITIALIZER ;
// Pthreads compatible driver.
// Recovery from an exception would require constant intra-thread health
// verification, which would negatively impact runtime. As such, simply
// abort the process.
void * internal_pthread_driver( void * )
{
try {
ThreadsExec::driver();
}
catch( const std::exception & x ) {
std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl ;
std::cerr.flush();
std::abort();
}
catch( ... ) {
std::cerr << "Exception thrown from worker thread" << std::endl ;
std::cerr.flush();
std::abort();
}
return NULL ;
}
} // namespace
//----------------------------------------------------------------------------
// Spawn a thread
bool ThreadsExec::spawn()
{
bool result = false ;
pthread_attr_t attr ;
// All three attribute calls must succeed ( each returns 0 on success ):
if ( 0 == pthread_attr_init( & attr ) &&
     0 == pthread_attr_setscope( & attr, PTHREAD_SCOPE_SYSTEM ) &&
     0 == pthread_attr_setdetachstate( & attr, PTHREAD_CREATE_DETACHED ) ) {
pthread_t pt ;
result = 0 == pthread_create( & pt, & attr, internal_pthread_driver, 0 );
}
pthread_attr_destroy( & attr );
return result ;
}
//----------------------------------------------------------------------------
bool ThreadsExec::is_process()
{
static const pthread_t master_pid = pthread_self();
return pthread_equal( master_pid , pthread_self() );
}
void ThreadsExec::global_lock()
{
pthread_mutex_lock( & host_internal_pthread_mutex );
}
void ThreadsExec::global_unlock()
{
pthread_mutex_unlock( & host_internal_pthread_mutex );
}
//----------------------------------------------------------------------------
void ThreadsExec::wait_yield( volatile int & flag , const int value )
{
while ( value == flag ) { sched_yield(); }
}
} // namespace Impl
} // namespace Kokkos
/* end #if defined( KOKKOS_HAVE_PTHREAD ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#elif defined( KOKKOS_HAVE_WINTHREAD )
/* Windows libraries */
#include <windows.h>
#include <process.h>
/* Standard C++ libraries */
#include <cstdlib>
#include <string>
#include <iostream>
#include <stdexcept>
#include <Kokkos_Threads.hpp>
//----------------------------------------------------------------------------
// Driver for each created pthread
namespace Kokkos {
namespace Impl {
namespace {
unsigned WINAPI internal_winthread_driver( void * arg )
{
ThreadsExec::driver();
return 0 ;
}
class ThreadLockWindows {
private:
CRITICAL_SECTION m_handle ;
~ThreadLockWindows()
{ DeleteCriticalSection( & m_handle ); }
ThreadLockWindows()
{ InitializeCriticalSection( & m_handle ); }
ThreadLockWindows( const ThreadLockWindows & );
ThreadLockWindows & operator = ( const ThreadLockWindows & );
public:
static ThreadLockWindows & singleton();
void lock()
{ EnterCriticalSection( & m_handle ); }
void unlock()
{ LeaveCriticalSection( & m_handle ); }
};
ThreadLockWindows & ThreadLockWindows::singleton()
{ static ThreadLockWindows self ; return self ; }
} // namespace <>
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// Spawn this thread
bool ThreadsExec::spawn()
{
unsigned Win32ThreadID = 0 ;
HANDLE handle = (HANDLE)
  _beginthreadex( 0 , 0 , internal_winthread_driver , 0 , 0 , & Win32ThreadID );
// '_beginthreadex' returns zero on failure; report success:
return 0 != handle ;
}
bool ThreadsExec::is_process() { return true ; }
void ThreadsExec::global_lock()
{ ThreadLockWindows::singleton().lock(); }
void ThreadsExec::global_unlock()
{ ThreadLockWindows::singleton().unlock(); }
void ThreadsExec::wait_yield( volatile int & flag , const int value )
{
  while ( value == flag ) { Sleep(0); }
}
} // namespace Impl
} // namespace Kokkos
#endif /* end #elif defined( KOKKOS_HAVE_WINTHREAD ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

@@ -0,0 +1,730 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_THREADSTEAM_HPP
#define KOKKOS_THREADSTEAM_HPP
#include <stdio.h>
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <Kokkos_Atomic.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
template< class > struct ThreadsExecAdapter ;
//----------------------------------------------------------------------------
class ThreadsExecTeamMember {
private:
enum { TEAM_REDUCE_SIZE = 512 };
typedef Kokkos::Threads execution_space ;
typedef execution_space::scratch_memory_space space ;
ThreadsExec * const m_exec ;
ThreadsExec * const * m_team_base ; ///< Base for team fan-in
space m_team_shared ;
int m_team_shared_size ;
int m_team_size ;
int m_team_rank ;
int m_team_rank_rev ;
int m_league_size ;
int m_league_end ;
int m_league_rank ;
inline
void set_team_shared()
{ new( & m_team_shared ) space( ((char *) (*m_team_base)->scratch_memory()) + TEAM_REDUCE_SIZE , m_team_shared_size ); }
public:
// Fan-in and wait until the matching fan-out is called.
// The root thread which does not wait will return true.
// All other threads will return false during the fan-out.
KOKKOS_INLINE_FUNCTION bool team_fan_in() const
{
int n , j ;
// Wait for fan-in threads
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
Impl::spinwait( m_team_base[j]->state() , ThreadsExec::Active );
}
// If not root then wait for release
if ( m_team_rank_rev ) {
m_exec->state() = ThreadsExec::Rendezvous ;
Impl::spinwait( m_exec->state() , ThreadsExec::Rendezvous );
}
return ! m_team_rank_rev ;
}
KOKKOS_INLINE_FUNCTION void team_fan_out() const
{
int n , j ;
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
m_team_base[j]->state() = ThreadsExec::Active ;
}
}
public:
KOKKOS_INLINE_FUNCTION static int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_shmem() const
{ return m_team_shared ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const
{
team_fan_in();
team_fan_out();
}
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast(ValueType& value, const int& thread_id) const
{
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ }
#else
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
, ValueType , void >::type type ;
if ( m_team_base ) {
type * const local_value = ((type*) m_team_base[0]->scratch_memory());
if(team_rank() == thread_id) *local_value = value;
memory_fence();
team_barrier();
value = *local_value;
}
#endif
}
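// Illustrative sketch, not part of the original source: broadcasting a
// value computed by one team member to the rest of the team. The functor
// body and compute_winner() are hypothetical.
//
//   KOKKOS_INLINE_FUNCTION
//   void operator()( const ThreadsExecTeamMember & member ) const
//     {
//       int chosen = 0 ;
//       if ( member.team_rank() == 0 ) chosen = compute_winner();
//       member.team_broadcast( chosen , 0 ); // every rank now holds rank 0's value
//     }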
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(Type) < TEAM_REDUCE_SIZE , Type , void >::type type ;
if ( 0 == m_exec ) return value ;
*((volatile type*) m_exec->scratch_memory() ) = value ;
memory_fence();
type & accum = *((type *) m_team_base[0]->scratch_memory() );
if ( team_fan_in() ) {
for ( int i = 1 ; i < m_team_size ; ++i ) {
accum += *((type *) m_team_base[i]->scratch_memory() );
}
memory_fence();
}
team_fan_out();
return accum ;
}
#endif
#ifdef KOKKOS_HAVE_CXX11
template< class ValueType, class JoinOp >
KOKKOS_INLINE_FUNCTION ValueType
team_reduce( const ValueType & value
, const JoinOp & op_in ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return ValueType(); }
#else
{
typedef ValueType value_type;
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
#endif
#else // KOKKOS_HAVE_CXX11
template< class JoinOp >
KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return typename JoinOp::value_type(); }
#else
{
typedef typename JoinOp::value_type value_type;
#endif
#endif // KOKKOS_HAVE_CXX11
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
, value_type , void >::type type ;
if ( 0 == m_exec ) return value ;
type * const local_value = ((type*) m_exec->scratch_memory());
// Set this thread's contribution
*local_value = value ;
// Fence to make sure the base team member has access:
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
type * const team_value = ((type*) m_team_base[0]->scratch_memory());
// Join to the team value:
for ( int i = 1 ; i < m_team_size ; ++i ) {
op.join( *team_value , *((type*) m_team_base[i]->scratch_memory()) );
}
// Team base thread may "lap" member threads so copy out to their local value.
for ( int i = 1 ; i < m_team_size ; ++i ) {
*((type*) m_team_base[i]->scratch_memory()) = *team_value ;
}
// Fence to make sure all team members have access
memory_fence();
}
team_fan_out();
// Value was changed by the team base
return *((type volatile const *) local_value);
}
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename ArgType >
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return ArgType(); }
#else
{
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
if ( 0 == m_exec ) return type(0);
volatile type * const work_value = ((type*) m_exec->scratch_memory());
*work_value = value ;
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
// m_team_base[0] == highest ranking team member
// m_team_base[ m_team_size - 1 ] == lowest ranking team member
//
// 1) copy from lower to higher rank, initialize lowest rank to zero
// 2) prefix sum from lowest to highest rank, skipping lowest rank
type accum = 0 ;
if ( global_accum ) {
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_team_base[i]->scratch_memory());
accum += val ;
}
accum = atomic_fetch_add( global_accum , accum );
}
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_team_base[i]->scratch_memory());
const type offset = accum ;
accum += val ;
val = offset ;
}
memory_fence();
}
team_fan_out();
return *work_value ;
}
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename ArgType >
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value ) const
{ return this-> template team_scan<ArgType>( value , 0 ); }
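// Illustrative sketch, not part of the original source: using the exclusive
// team_scan to turn per-thread counts into output offsets. The 'counts'
// view is hypothetical.
//
//   KOKKOS_INLINE_FUNCTION
//   void operator()( const ThreadsExecTeamMember & member ) const
//     {
//       const int my_count  = counts( member.team_rank() );
//       const int my_offset = member.team_scan( my_count );
//       // my_offset == sum of counts of lower-ranked members; the highest
//       // rank recovers the team total as my_offset + my_count.
//     }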
//----------------------------------------
// Private for the driver
template< class Arg0 , class Arg1 >
ThreadsExecTeamMember( Impl::ThreadsExec * exec
, const TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > & team
, const int shared_size )
: m_exec( exec )
, m_team_base(0)
, m_team_shared(0,0)
, m_team_shared_size( shared_size )
, m_team_size(0)
, m_team_rank(0)
, m_team_rank_rev(0)
, m_league_size(0)
, m_league_end(0)
, m_league_rank(0)
{
if ( team.league_size() ) {
// Execution is using device-team interface:
const int pool_rank_rev = m_exec->pool_size() - ( m_exec->pool_rank() + 1 );
const int team_rank_rev = pool_rank_rev % team.team_alloc();
// The team may be using fewer threads than its allocation (a multiple of
// threads per core); the surplus threads idle.
if ( team_rank_rev < team.team_size() ) {
const size_t pool_league_size = m_exec->pool_size() / team.team_alloc() ;
const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ;
const size_t pool_league_rank = pool_league_size - ( pool_league_rank_rev + 1 );
m_team_base = m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev ;
m_team_size = team.team_size() ;
m_team_rank = team.team_size() - ( team_rank_rev + 1 );
m_team_rank_rev = team_rank_rev ;
m_league_size = team.league_size();
m_league_rank = ( team.league_size() * pool_league_rank ) / pool_league_size ;
m_league_end = ( team.league_size() * (pool_league_rank+1) ) / pool_league_size ;
set_team_shared();
}
}
}
ThreadsExecTeamMember()
: m_exec(0)
, m_team_base(0)
, m_team_shared(0,0)
, m_team_shared_size(0)
, m_team_size(1)
, m_team_rank(0)
, m_team_rank_rev(0)
, m_league_size(1)
, m_league_end(0)
, m_league_rank(0)
{}
inline
ThreadsExec & threads_exec_team_base() const { return m_team_base ? **m_team_base : *m_exec ; }
bool valid() const
{ return m_league_rank < m_league_end ; }
void next()
{
if ( ++m_league_rank < m_league_end ) {
team_barrier();
set_team_shared();
}
}
void set_league_shmem( const int arg_league_rank
, const int arg_league_size
, const int arg_shmem_size
)
{
m_league_rank = arg_league_rank ;
m_league_size = arg_league_size ;
m_team_shared_size = arg_shmem_size ;
set_team_shared();
}
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< class Arg0 , class Arg1 >
class TeamPolicy< Arg0 , Arg1 , Kokkos::Threads >
{
private:
int m_league_size ;
int m_team_size ;
int m_team_alloc ;
inline
void init( const int league_size_request
, const int team_size_request )
{
const int pool_size = execution_space::thread_pool_size(0);
const int team_max = execution_space::thread_pool_size(1);
const int team_grain = execution_space::thread_pool_size(2);
m_league_size = league_size_request ;
m_team_size = team_size_request < team_max ?
team_size_request : team_max ;
// Round team size up to a multiple of 'team_grain'
const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
const int team_count = pool_size / team_size_grain ;
// Constraint : pool_size = m_team_alloc * team_count
m_team_alloc = pool_size / team_count ;
}
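// Worked example with illustrative numbers, not from the original source:
// pool_size = 16, team_max = 8, team_grain = 4, team_size_request = 6.
// Then m_team_size = 6, team_size_grain = 4 * ((6+4-1)/4) = 8,
// team_count = 16 / 8 = 2, and m_team_alloc = 16 / 2 = 8: each team of
// 6 working threads is allocated 8 pool threads, so 2 threads per team idle.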
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicy execution_policy ;
typedef Kokkos::Threads execution_space ;
typedef typename
Impl::if_c< ! Impl::is_same< Kokkos::Threads , Arg0 >::value , Arg0 , Arg1 >::type
work_tag ;
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return execution_space::thread_pool_size(1); }
template< class FunctorType >
static int team_size_recommended( const FunctorType & )
{ return execution_space::thread_pool_size(2); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType &, const int& )
{ return execution_space::thread_pool_size(2); }
//----------------------------------------
inline int team_size() const { return m_team_size ; }
inline int team_alloc() const { return m_team_alloc ; }
inline int league_size() const { return m_league_size ; }
/** \brief Specify league size, request team size */
TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 )
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
{ init(league_size_request,team_size_request); (void) vector_length_request; }
TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
{ init(league_size_request,team_size_request); (void) vector_length_request; }
typedef Impl::ThreadsExecTeamMember member_type ;
friend class Impl::ThreadsExecTeamMember ;
};
} /* namespace Kokkos */
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>
TeamThreadRange(const Impl::ThreadsExecTeamMember& thread, const iType& count)
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>
TeamThreadRange( const Impl::ThreadsExecTeamMember& thread
, const iType & begin
, const iType & end
)
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>(thread,begin,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >
ThreadVectorRange(const Impl::ThreadsExecTeamMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember> PerTeam(const Impl::ThreadsExecTeamMember& thread) {
return Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember> PerThread(const Impl::ThreadsExecTeamMember& thread) {
return Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
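// Illustrative sketch, not part of the original source: distributing rows
// of a hypothetical view 'A' across the threads of a team; 'member' is the
// team handle inside a team functor (C++11 lambda).
//
//   Kokkos::parallel_for( Kokkos::TeamThreadRange( member , nrow ) ,
//     [&]( const int row ) { A( row ) = 2 * A( row ); } );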
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
#if defined( KOKKOS_HAVE_CXX11 )
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
}
#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */
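// Illustrative sketch, not part of the original source: a team-level max
// reduction through the JoinType overload above. The 'values' view is
// hypothetical; the init_result of 0 is the neutral element for max over
// non-negative entries.
//
//   int team_max = 0 ;
//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( const int i , int & val ) { if ( values(i) > val ) val = values(i); } ,
//     [&]( int & dst , const int & src ) { if ( src > dst ) dst = src ; } ,
//     team_max );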
} //namespace Kokkos
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = result;
}
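// Illustrative sketch, not part of the original source: a per-thread sum
// over a vector range. On this backend the "vector lanes" of a thread run
// serially, so the reduce is a simple (possibly ivdep-vectorized) loop.
// The 'x' view is hypothetical.
//
//   double thread_sum = 0 ;
//   Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member , m ) ,
//     [&]( const int j , double & val ) { val += x(j); } , thread_sum );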
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,scan_val,true);
}
}
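// Illustrative sketch, not part of the original source: an exclusive prefix
// sum over a thread's vector range. The 'in' and 'out' views are
// hypothetical; the operator adds its contribution whether or not 'final'
// is set, as required above.
//
//   Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member , m ) ,
//     [&]( const int j , int & partial , const bool final )
//       {
//         if ( final ) out(j) = partial ; // exclusive prefix value
//         partial += in(j);
//       } );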
} // namespace Kokkos
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) {
if(single_struct.team_member.team_rank()==0) lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
if(single_struct.team_member.team_rank()==0) {
lambda(val);
}
single_struct.team_member.team_broadcast(val,0);
}
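// Illustrative sketch, not part of the original source: rank 0 computes a
// value once and the broadcasting overload above hands it to every team
// member. pick_pivot() is hypothetical.
//
//   int pivot = 0 ;
//   Kokkos::single( Kokkos::PerTeam( member ) ,
//     [&]( int & val ) { val = pick_pivot(); } , pivot );
//   // all team members now hold the same 'pivot'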
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_THREADSTEAM_HPP */

View File

@ -0,0 +1,427 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_THREADS_PARALLEL_HPP
#define KOKKOS_THREADS_PARALLEL_HPP
#include <vector>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
const FunctorType m_func ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( i );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i );
}
}
static void execute( ThreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
driver( self.m_func , typename Policy::WorkRange( self.m_policy , exec.pool_rank() , exec.pool_size() ) );
exec.fan_in();
}
public:
ParallelFor( const FunctorType & functor
, const Policy & policy )
: m_func( functor )
, m_policy( policy )
{
ThreadsExec::start( & ParallelFor::execute , this );
ThreadsExec::fence();
}
};
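// Note, not part of the original source: this specialization is what a
// user-level call such as
//
//   Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Threads >( 0 , N ) , functor );
//
// instantiates. The constructor broadcasts 'execute' to the pool via
// ThreadsExec::start, each pool thread iterates its own WorkRange slice,
// and ThreadsExec::fence() blocks until the fan-in completes.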
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > >
{
private:
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > Policy ;
const FunctorType m_func ;
const Policy m_policy ;
const int m_shared ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_func( member ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_func( TagType() , member ); }
static void execute( ThreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
typename Policy::member_type member( & exec , self.m_policy , self.m_shared );
for ( ; member.valid() ; member.next() ) {
self.ParallelFor::template driver< typename Policy::work_tag >( member );
}
exec.fan_in();
}
public:
ParallelFor( const FunctorType & functor
, const Policy & policy )
: m_func( functor )
, m_policy( policy )
, m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
{
ThreadsExec::resize_scratch( 0 , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelFor::execute , this );
ThreadsExec::fence();
}
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
typedef typename Policy::work_tag work_tag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( i , update );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update );
}
}
static void execute( ThreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
driver( self.m_func
, ValueInit::init( self.m_func , exec.reduce_memory() )
, typename Policy::WorkRange( self.m_policy , exec.pool_rank() , exec.pool_size() )
);
exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func );
}
public:
template< class HostViewType >
ParallelReduce( const FunctorType & functor ,
const Policy & policy ,
const HostViewType & result_view )
: m_func( functor )
, m_policy( policy )
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , 0 );
ThreadsExec::start( & ParallelReduce::execute , this );
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
ThreadsExec::fence();
if ( result_view.ptr_on_device() ) {
const unsigned n = ValueTraits::value_count( m_func );
for ( unsigned i = 0 ; i < n ; ++i ) { result_view.ptr_on_device()[i] = data[i]; }
}
}
};
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > >
{
private:
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > Policy ;
typedef typename Policy::work_tag work_tag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_policy ;
const int m_shared ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_func( member , update ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_func( TagType() , member , update ); }
static void execute( ThreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
// Initialize thread-local value
reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
typename Policy::member_type member( & exec , self.m_policy , self.m_shared );
for ( ; member.valid() ; member.next() ) {
self.ParallelReduce::template driver< work_tag >( member , update );
}
exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func );
}
public:
ParallelReduce( const FunctorType & functor
, const Policy & policy )
: m_func( functor )
, m_policy( policy )
, m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelReduce::execute , this );
ThreadsExec::fence();
}
template< class ViewType >
ParallelReduce( const FunctorType & functor
, const Policy & policy
, const ViewType & result )
: m_func( functor )
, m_policy( policy )
, m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelReduce::execute , this );
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
ThreadsExec::fence();
const unsigned n = ValueTraits::value_count( m_func );
for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
}
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
typedef typename Policy::work_tag work_tag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const bool final
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( i , update , final );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const bool final
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update , final );
}
}
static void execute( ThreadsExec & exec , const void * arg )
{
const ParallelScan & self = * ((const ParallelScan *) arg );
const typename Policy::WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
driver( self.m_func , update , false , range );
// exec.template scan_large<FunctorType,work_tag>( self.m_func );
exec.template scan_small<FunctorType,work_tag>( self.m_func );
driver( self.m_func , update , true , range );
exec.fan_in();
}
public:
ParallelScan( const FunctorType & functor , const Policy & policy )
: m_func( functor )
, m_policy( policy )
{
ThreadsExec::resize_scratch( 2 * ValueTraits::value_size( m_func ) , 0 );
ThreadsExec::start( & ParallelScan::execute , this );
ThreadsExec::fence();
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */

View File

@ -0,0 +1,599 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#include <stdio.h>
#include <iostream>
#include <sstream>
#include <Threads/Kokkos_Threads_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_PTHREAD )
namespace Kokkos {
namespace Experimental {
namespace Impl {
typedef TaskMember< Kokkos::Threads , void , void > Task ;
namespace {
int volatile s_count_serial = 0 ;
int volatile s_count_team = 0 ;
Task * volatile s_ready_team = 0 ;
Task * volatile s_ready_serial = 0 ;
Task * const s_lock = reinterpret_cast<Task*>( ~((unsigned long)0) );
Task * const s_denied = reinterpret_cast<Task*>( ~((unsigned long)0) - 1 );
} /* namespace */
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
namespace Kokkos {
namespace Experimental {
TaskPolicy< Kokkos::Threads >::TaskPolicy
( const unsigned arg_default_dependence_capacity
, const unsigned arg_team_size
)
: m_default_dependence_capacity( arg_default_dependence_capacity )
, m_team_size( arg_team_size )
{
const int threads_total = Threads::thread_pool_size(0);
const int threads_per_numa = Threads::thread_pool_size(1);
const int threads_per_core = Threads::thread_pool_size(2);
if ( 0 == arg_team_size ) {
// If a team task then claim for execution until count is zero
// Issue: team collectives cannot assume which pool members are in the team.
// Issue: team must only span a single NUMA region.
// If more than one thread per core then map cores to work team,
// else map numa to work team.
if ( 1 < threads_per_core ) m_team_size = threads_per_core ;
else if ( 1 < threads_per_numa ) m_team_size = threads_per_numa ;
else m_team_size = 1 ;
}
// Verify a valid team size
const bool valid_team_size =
( 0 < m_team_size && m_team_size <= threads_total ) &&
(
( 1 == m_team_size ) ||
( threads_per_core == m_team_size ) ||
( threads_per_numa == m_team_size )
);
if ( ! valid_team_size ) {
std::ostringstream msg ;
msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Threads > ERROR"
<< " invalid team_size(" << m_team_size << ")"
<< " threads_per_core(" << threads_per_core << ")"
<< " threads_per_numa(" << threads_per_numa << ")"
<< " threads_total(" << threads_total << ")"
;
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
}
TaskPolicy< Kokkos::Threads >::member_type &
TaskPolicy< Kokkos::Threads >::member_single()
{
static member_type s ;
return s ;
}
void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Threads > & policy )
{
typedef Kokkos::Impl::ThreadsExecTeamMember member_type ;
enum { BASE_SHMEM = 1024 };
void * const arg = reinterpret_cast<void*>( long( policy.m_team_size ) );
Kokkos::Impl::ThreadsExec::resize_scratch( 0 , member_type::team_reduce_size() + BASE_SHMEM );
Kokkos::Impl::ThreadsExec::start( & Impl::Task::execute_ready_tasks_driver , arg );
Kokkos::Impl::ThreadsExec::fence();
}
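// Illustrative sketch, not part of the original source: building a policy
// for this backend and draining its ready queues. Task creation and
// spawning go through the generic Kokkos_TaskPolicy.hpp interface, which
// is assumed here rather than shown in this file.
//
//   Kokkos::Experimental::TaskPolicy< Kokkos::Threads >
//     policy( 4 /* default dependence capacity */ , 0 /* team size: auto */ );
//   // ... create and spawn tasks through the generic interface ...
//   Kokkos::Experimental::wait( policy ); // returns when both ready queues are empty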
} /* namespace Experimental */
} /* namespace Kokkos */
namespace Kokkos {
namespace Experimental {
namespace Impl {
//----------------------------------------------------------------------------
void Task::throw_error_verify_type()
{
Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::verify_type ERROR");
}
void Task::deallocate( void * ptr )
{
free( ptr );
}
void * Task::allocate( const unsigned n )
{
void * const ptr = malloc(n);
return ptr ;
}
Task::~TaskMember()
{
}
//----------------------------------------------------------------------------
void Task::reschedule()
{
// Reschedule transitions from executing back to waiting.
const int old_state = atomic_compare_exchange( & m_state , int(TASK_STATE_EXECUTING) , int(TASK_STATE_WAITING) );
if ( old_state != int(TASK_STATE_EXECUTING) ) {
fprintf( stderr
, "reschedule ERROR task[%lx] state(%d)\n"
, (unsigned long) this
, old_state
);
fflush(stderr);
}
}
void Task::schedule()
{
//----------------------------------------
// State is either constructing or already waiting.
// If constructing then transition to waiting.
{
const int old_state = atomic_compare_exchange( & m_state , int(TASK_STATE_CONSTRUCTING) , int(TASK_STATE_WAITING) );
Task * const waitTask = *((Task * volatile const *) & m_wait );
Task * const next = *((Task * volatile const *) & m_next );
if ( s_denied == waitTask || 0 != next ||
( old_state != int(TASK_STATE_CONSTRUCTING) &&
old_state != int(TASK_STATE_WAITING) ) ) {
fprintf(stderr,"Task::schedule task(0x%lx) STATE ERROR: state(%d) wait(0x%lx) next(0x%lx)\n"
, (unsigned long) this
, old_state
, (unsigned long) waitTask
, (unsigned long) next );
fflush(stderr);
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::Task spawn or respawn state error");
}
}
//----------------------------------------
// Insert this task into another dependence that is not complete
// Push on to the wait queue, fails if ( s_denied == m_dep[i]->m_wait )
bool insert_in_ready_queue = true ;
for ( int i = 0 ; i < m_dep_size && insert_in_ready_queue ; ) {
Task * const task_dep = m_dep[i] ;
Task * const head_value_old = *((Task * volatile *) & task_dep->m_wait );
if ( s_denied == head_value_old ) {
// Wait queue is closed, try again with the next queue
++i ;
}
else {
// Wait queue is open and not locked.
// If CAS succeeds then have acquired the lock.
// Have exclusive access to this task.
// Assign m_next assuming a successful insertion into the queue.
// Fence the memory assignment before attempting the CAS.
*((Task * volatile *) & m_next ) = head_value_old ;
memory_fence();
// Attempt to insert this task into the queue
Task * const wait_queue_head = atomic_compare_exchange( & task_dep->m_wait , head_value_old , this );
if ( head_value_old == wait_queue_head ) {
insert_in_ready_queue = false ;
}
}
}
//----------------------------------------
// All dependences are complete, insert into the ready list
if ( insert_in_ready_queue ) {
// Increment the count of ready tasks.
// Count is decremented when task is complete.
Task * volatile * queue = 0 ;
if ( m_serial ) {
atomic_increment( & s_count_serial );
queue = & s_ready_serial ;
}
else {
atomic_increment( & s_count_team );
queue = & s_ready_team ;
}
while ( insert_in_ready_queue ) {
Task * const head_value_old = *queue ;
if ( s_lock != head_value_old ) {
// The head of the ready queue is only accessed via CAS; s_lock marks it busy.
// Have exclusive access to this task: point m_next at the observed head,
// assuming a successful insert.
// Fence the assignment before attempting the insert.
*((Task * volatile *) & m_next ) = head_value_old ;
memory_fence();
Task * const ready_queue_head = atomic_compare_exchange( queue , head_value_old , this );
if ( head_value_old == ready_queue_head ) {
// Successful insert
insert_in_ready_queue = false ; // done
}
}
}
}
}
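// Distilled sketch, not part of the original source, of the lock-free push
// protocol used above: read the head, publish it through m_next, then one
// compare-and-swap installs this task as the new head; on failure re-read
// and retry. (The ready queues additionally skip the s_lock sentinel while
// a pop holds the queue.)
//
//   Task * head = *queue ;
//   do {
//     m_next = head ;
//     memory_fence();
//     head = atomic_compare_exchange( queue , m_next , this );
//   } while ( head != m_next );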
//----------------------------------------------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void Task::assign( Task ** const lhs_ptr , Task * rhs )
{
// Increment rhs reference count.
if ( rhs ) { atomic_increment( & rhs->m_ref_count ); }
// Assign the pointer and retrieve the previous value.
Task * const old_lhs = atomic_exchange( lhs_ptr , rhs );
if ( old_lhs ) {
// Decrement former lhs reference count.
// If reference count is zero task must be complete, then delete task.
// Task is ready for deletion when wait == s_denied
int const count = atomic_fetch_add( & (old_lhs->m_ref_count) , -1 ) - 1 ;
// if 'count != 0' then 'old_lhs' may be deallocated before dereferencing
Task * const wait = count == 0 ? *((Task * const volatile *) & old_lhs->m_wait ) : (Task*) 0 ;
if ( count < 0 || ( count == 0 && wait != s_denied ) ) {
static const char msg_error_header[] = "Kokkos::Impl::TaskManager<Kokkos::Threads>::assign ERROR deleting" ;
fprintf( stderr , "%s task(0x%lx) m_ref_count(%d) , m_wait(0x%ld)\n"
, msg_error_header
, (unsigned long) old_lhs
, count
, (unsigned long) wait );
fflush(stderr);
Kokkos::Impl::throw_runtime_exception( msg_error_header );
}
if ( count == 0 ) {
// When 'count == 0' this thread has exclusive access to 'old_lhs'
const Task::function_dealloc_type d = old_lhs->m_dealloc ;
(*d)( old_lhs );
}
}
}
#endif
//----------------------------------------------------------------------------
Task * Task::get_dependence( int i ) const
{
Task * const t = m_dep[i] ;
if ( Kokkos::Experimental::TASK_STATE_EXECUTING != m_state || i < 0 || m_dep_size <= i || 0 == t ) {
fprintf( stderr
, "TaskMember< Threads >::get_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) dep[%d] = %lx }\n"
, (unsigned long) this
, m_state
, m_dep_size
, i
, (unsigned long) t
);
fflush( stderr );
Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::get_dependence ERROR");
}
return t ;
}
//----------------------------------------------------------------------------
void Task::add_dependence( Task * before )
{
if ( before != 0 ) {
int const state = *((volatile const int *) & m_state );
// Can add dependence during construction or during execution
if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == state ||
Kokkos::Experimental::TASK_STATE_EXECUTING == state ) &&
m_dep_size < m_dep_capacity ) {
++m_dep_size ;
assign( m_dep + (m_dep_size-1) , before );
memory_fence();
}
else {
fprintf( stderr
, "TaskMember< Threads >::add_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) m_dep_capacity(%d) }\n"
, (unsigned long) this
, m_state
, m_dep_size
, m_dep_capacity
);
fflush( stderr );
Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::add_dependence ERROR");
}
}
}
//----------------------------------------------------------------------------
void Task::clear_dependence()
{
for ( int i = m_dep_size - 1 ; 0 <= i ; --i ) {
assign( m_dep + i , 0 );
}
*((volatile int *) & m_dep_size ) = 0 ;
memory_fence();
}
//----------------------------------------------------------------------------
Task * Task::pop_ready_task( Task * volatile * const queue )
{
Task * const task_old = *queue ;
if ( s_lock != task_old && 0 != task_old ) {
Task * const task = atomic_compare_exchange( queue , task_old , s_lock );
if ( task_old == task ) {
// May have acquired the lock and task.
// One or more other threads may have acquired this same task and lock
// due to a respawning ABA race condition.
// Can only be sure of acquire with a successful state transition from waiting to executing
const int old_state = atomic_compare_exchange( & task->m_state, int(TASK_STATE_WAITING), int(TASK_STATE_EXECUTING) );
if ( old_state == int(TASK_STATE_WAITING) ) {
// Transitioned this task from waiting to executing
// Update the queue to the next entry and release the lock
Task * const next_old = *((Task * volatile *) & task->m_next );
Task * const s = atomic_compare_exchange( queue , s_lock , next_old );
if ( s != s_lock ) {
fprintf(stderr,"Task::pop_ready_task( 0x%lx ) UNLOCK ERROR\n", (unsigned long) queue );
fflush(stderr);
}
*((Task * volatile *) & task->m_next ) = 0 ;
return task ;
}
else {
fprintf(stderr,"Task::pop_ready_task( 0x%lx ) task(0x%lx) state(%d) ERROR\n"
, (unsigned long) queue
, (unsigned long) task
, old_state );
fflush(stderr);
}
}
}
return (Task *) 0 ;
}
void Task::complete_executed_task( Task * task , volatile int * const queue_count )
{
// State is either executing or if respawned then waiting,
// try to transition from executing to complete.
// Reads the current value.
const int state_old =
atomic_compare_exchange( & task->m_state
, int(Kokkos::Experimental::TASK_STATE_EXECUTING)
, int(Kokkos::Experimental::TASK_STATE_COMPLETE) );
if ( Kokkos::Experimental::TASK_STATE_WAITING == state_old ) {
task->schedule(); /* Task requested a respawn so reschedule it */
}
else if ( Kokkos::Experimental::TASK_STATE_EXECUTING != state_old ) {
fprintf( stderr
, "TaskMember< Threads >::execute_serial completion ERROR : task[%lx]{ state_old(%d) dep_size(%d) }\n"
, (unsigned long) & task
, state_old
, task->m_dep_size
);
fflush( stderr );
}
else {
// Clear dependences of this task before locking wait queue
task->clear_dependence();
// Stop other tasks from adding themselves to this task's wait queue.
// The wait queue is updated concurrently so guard with an atomic.
// Setting the wait queue to denied denotes delete-ability of the task by any thread.
// Therefore, once 'denied' the task pointer must be treated as invalid.
Task * wait_queue = *((Task * volatile *) & task->m_wait );
Task * wait_queue_old = 0 ;
do {
wait_queue_old = wait_queue ;
wait_queue = atomic_compare_exchange( & task->m_wait , wait_queue_old , s_denied );
} while ( wait_queue_old != wait_queue );
task = 0 ;
// Pop waiting tasks and schedule them
while ( wait_queue ) {
Task * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ;
x->schedule();
}
}
atomic_decrement( queue_count );
}
//----------------------------------------------------------------------------
void Task::execute_ready_tasks_driver( Kokkos::Impl::ThreadsExec & exec , const void * arg )
{
typedef Kokkos::Impl::ThreadsExecTeamMember member_type ;
// Whole pool is calling this function
// Create the thread team member with shared memory for the given task.
const int team_size = reinterpret_cast<long>( arg );
member_type member( & exec , TeamPolicy< Kokkos::Threads >( 1 , team_size ) , 0 );
Kokkos::Impl::ThreadsExec & exec_team_base = member.threads_exec_team_base();
Task * volatile * const task_team_ptr = reinterpret_cast<Task**>( exec_team_base.reduce_memory() );
if ( member.team_fan_in() ) {
*task_team_ptr = 0 ;
Kokkos::memory_fence();
}
member.team_fan_out();
long int iteration_count = 0 ;
// Each team must iterate this loop synchronously to ensure team-execution of a team task
while ( 0 < s_count_serial || 0 < s_count_team ) {
if ( member.team_rank() == 0 ) {
// Only one team member attempts to pop a team task
*task_team_ptr = pop_ready_task( & s_ready_team );
}
// Query if team acquired a team task
Task * const task_team = *task_team_ptr ;
if ( task_team ) {
// Set shared memory
member.set_league_shmem( 0 , 1 , task_team->m_shmem_size );
(*task_team->m_team)( task_team , member );
// Do not proceed until all members have completed the task,
// the task has been completed or rescheduled, and
// the team task pointer has been cleared.
if ( member.team_fan_in() ) {
complete_executed_task( task_team , & s_count_team );
*task_team_ptr = 0 ;
Kokkos::memory_fence();
}
member.team_fan_out();
}
else {
Task * const task_serial = pop_ready_task( & s_ready_serial );
if ( task_serial ) {
if ( task_serial->m_serial ) (*task_serial->m_serial)( task_serial );
complete_executed_task( task_serial , & s_count_serial );
}
}
++iteration_count ;
}
exec.fan_in();
}
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */

View File

@ -0,0 +1,584 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#ifndef KOKKOS_THREADS_TASKPOLICY_HPP
#define KOKKOS_THREADS_TASKPOLICY_HPP
#include <Kokkos_Threads.hpp>
#include <Kokkos_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_PTHREAD )
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief Base class for all Kokkos::Threads tasks */
template<>
class TaskMember< Kokkos::Threads , void , void > {
public:
typedef void (* function_dealloc_type)( TaskMember * );
typedef TaskMember * (* function_verify_type) ( TaskMember * );
typedef void (* function_single_type) ( TaskMember * );
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::ThreadsExecTeamMember & );
private:
// Needed to disambiguate references to base class variables
// without triggering a false-positive on Intel compiler warning #955.
typedef TaskMember< Kokkos::Threads , void , void > SelfType ;
function_dealloc_type m_dealloc ; ///< Deallocation
function_verify_type m_verify ; ///< Result type verification
function_team_type m_team ; ///< Apply function
function_single_type m_serial ; ///< Apply function
TaskMember ** m_dep ; ///< Dependences
TaskMember * m_wait ; ///< Linked list of tasks waiting on this task
TaskMember * m_next ; ///< Linked list of tasks waiting on a different task
int m_dep_capacity ; ///< Capacity of dependences
int m_dep_size ; ///< Actual count of dependences
int m_shmem_size ;
int m_ref_count ; ///< Reference count
int m_state ; ///< State of the task
// 7 pointers + 5 integers
#if defined( KOKKOS_HAVE_CXX11 )
TaskMember( const TaskMember & ) = delete ;
TaskMember & operator = ( const TaskMember & ) = delete ;
#else
TaskMember( const TaskMember & );
TaskMember & operator = ( const TaskMember & );
#endif
static void * allocate( const unsigned arg_size );
static void deallocate( void * );
template< class DerivedTaskType >
static
void deallocate( TaskMember * t )
{
DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t);
ptr->~DerivedTaskType();
deallocate( (void*) ptr );
}
static TaskMember * pop_ready_task( TaskMember * volatile * const queue );
static void complete_executed_task( TaskMember * , volatile int * const );
static void throw_error_verify_type();
protected:
TaskMember()
: m_dealloc(0)
, m_verify(0)
, m_team(0)
, m_serial(0)
, m_dep(0)
, m_wait(0)
, m_next(0)
, m_dep_capacity(0)
, m_dep_size(0)
, m_shmem_size(0)
, m_ref_count(0)
, m_state(0)
{}
public:
static void execute_ready_tasks_driver( Kokkos::Impl::ThreadsExec & , const void * );
~TaskMember();
template< typename ResultType >
KOKKOS_FUNCTION static
TaskMember * verify_type( TaskMember * t )
{
enum { check_type = ! Kokkos::Impl::is_same< ResultType , void >::value };
if ( check_type && t != 0 ) {
// Verify that t->m_verify is this function
const function_verify_type self = & TaskMember::template verify_type< ResultType > ;
if ( t->m_verify != self ) {
t = 0 ;
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
throw_error_verify_type();
#endif
}
}
return t ;
}
//----------------------------------------
/* Inheritance Requirements on task types:
*
* class DerivedTaskType
* : public TaskMember< Threads , DerivedType::value_type , FunctorType >
* { ... };
*
* class TaskMember< Threads , DerivedType::value_type , FunctorType >
* : public TaskMember< Threads , DerivedType::value_type , void >
* , public Functor
* { ... };
*
* If value_type != void
* class TaskMember< Threads , value_type , void >
* : public TaskMember< Threads , void , void >
*
* Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
*
*/
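// Illustrative sketch (not part of this header) of the layering described
// above, assuming a hypothetical functor 'MyFunctor' with value_type double:
//
//   struct MyFunctor {
//     typedef double value_type ;
//     void apply( double & result ) const { result = 42.0 ; }
//   };
//
// TaskMember< Threads , double , MyFunctor > then derives from
// TaskMember< Threads , double , void > (which holds m_result) and from
// MyFunctor itself, so a single allocation carries the scheduling links, the
// result storage, the functor state, and the trailing
// TaskMember*[ dependence_capacity ] array.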
//----------------------------------------
template< class DerivedTaskType , class Tag >
KOKKOS_FUNCTION static
void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< typename DerivedTaskType::result_type , void >::value
, TaskMember * >::type t )
{
typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename DerivedTaskType::result_type result_type ;
DerivedTaskType & self = * static_cast< DerivedTaskType * >(t);
Kokkos::Impl::FunctorApply< functor_type , Tag , result_type & >
::apply( (functor_type &) self , & self.m_result );
}
template< class DerivedTaskType , class Tag >
KOKKOS_FUNCTION static
void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< typename DerivedTaskType::result_type , void >::value
, TaskMember * >::type t )
{
typedef typename DerivedTaskType::functor_type functor_type ;
DerivedTaskType & self = * static_cast< DerivedTaskType * >(t);
Kokkos::Impl::FunctorApply< functor_type , Tag , void >::apply( (functor_type &) self );
}
//----------------------------------------
template< class DerivedTaskType , class Tag >
KOKKOS_FUNCTION static
void apply_team( typename Kokkos::Impl::enable_if<(
Kokkos::Impl::is_same<Tag,void>::value
&&
Kokkos::Impl::is_same<typename DerivedTaskType::result_type,void>::value
), TaskMember * >::type t
, Kokkos::Impl::ThreadsExecTeamMember & member
)
{
DerivedTaskType & self = * static_cast< DerivedTaskType * >(t);
self.DerivedTaskType::functor_type::apply( member );
}
/** \brief Apply a team task whose functor assigns a non-void result */
template< class DerivedTaskType , class Tag >
KOKKOS_FUNCTION static
void apply_team( typename Kokkos::Impl::enable_if<(
Kokkos::Impl::is_same<Tag,void>::value
&&
! Kokkos::Impl::is_same<typename DerivedTaskType::result_type,void>::value
), TaskMember * >::type t
, Kokkos::Impl::ThreadsExecTeamMember & member
)
{
DerivedTaskType & self = * static_cast< DerivedTaskType * >(t);
self.DerivedTaskType::functor_type::apply( member , self.m_result );
}
//----------------------------------------
/** \brief Allocate and construct a task */
template< class DerivedTaskType , class Tag >
static
TaskMember * create( const typename DerivedTaskType::functor_type & arg_functor
, const function_team_type arg_apply_team
, const function_single_type arg_apply_single
, const unsigned arg_team_shmem
, const unsigned arg_dependence_capacity
)
{
enum { padding_size = sizeof(DerivedTaskType) % sizeof(TaskMember*)
? sizeof(TaskMember*) - sizeof(DerivedTaskType) % sizeof(TaskMember*) : 0 };
enum { derived_size = sizeof(DerivedTaskType) + padding_size };
DerivedTaskType * const task =
new( allocate( derived_size + sizeof(TaskMember*) * arg_dependence_capacity ) )
DerivedTaskType( arg_functor );
task->SelfType::m_dealloc = & TaskMember::template deallocate< DerivedTaskType > ;
task->SelfType::m_verify = & TaskMember::template verify_type< typename DerivedTaskType::value_type > ;
task->SelfType::m_team = arg_apply_team ;
task->SelfType::m_serial = arg_apply_single ;
task->SelfType::m_dep = (TaskMember**)( ((unsigned char *)task) + derived_size );
task->SelfType::m_dep_capacity = arg_dependence_capacity ;
task->SelfType::m_shmem_size = arg_team_shmem ;
task->SelfType::m_state = TASK_STATE_CONSTRUCTING ;
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) task->SelfType::m_dep[i] = 0 ;
return static_cast< TaskMember * >( task );
}
void reschedule();
void schedule();
//----------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
static
void assign( TaskMember ** const lhs , TaskMember * const rhs );
#else
KOKKOS_INLINE_FUNCTION static
void assign( TaskMember ** const lhs , TaskMember * const rhs ) {}
#endif
TaskMember * get_dependence( int i ) const ;
KOKKOS_INLINE_FUNCTION
int get_dependence() const
{ return m_dep_size ; }
void clear_dependence();
void add_dependence( TaskMember * before );
//----------------------------------------
typedef FutureValueTypeIsVoidError get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const { return get_result_type() ; }
KOKKOS_INLINE_FUNCTION
Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }
};
/** \brief A Future< Kokkos::Threads , ResultType > will cast
* from TaskMember< Kokkos::Threads , void , void >
* to TaskMember< Kokkos::Threads , ResultType , void >
* to query the result.
*/
template< class ResultType >
class TaskMember< Kokkos::Threads , ResultType , void >
: public TaskMember< Kokkos::Threads , void , void >
{
public:
typedef ResultType result_type ;
result_type m_result ;
typedef const result_type & get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const { return m_result ; }
inline
TaskMember() : TaskMember< Kokkos::Threads , void , void >(), m_result() {}
#if defined( KOKKOS_HAVE_CXX11 )
TaskMember( const TaskMember & ) = delete ;
TaskMember & operator = ( const TaskMember & ) = delete ;
#else
private:
TaskMember( const TaskMember & );
TaskMember & operator = ( const TaskMember & );
#endif
};
/** \brief Callback functions will cast
* from TaskMember< Kokkos::Threads , void , void >
* to TaskMember< Kokkos::Threads , ResultType , FunctorType >
* to execute work functions.
*/
template< class ResultType , class FunctorType >
class TaskMember< Kokkos::Threads , ResultType , FunctorType >
: public TaskMember< Kokkos::Threads , ResultType , void >
, public FunctorType
{
public:
typedef ResultType result_type ;
typedef FunctorType functor_type ;
inline
TaskMember( const functor_type & arg_functor )
: TaskMember< Kokkos::Threads , ResultType , void >()
, functor_type( arg_functor )
{}
};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
void wait( TaskPolicy< Kokkos::Threads > & );
template<>
class TaskPolicy< Kokkos::Threads >
{
public:
typedef Kokkos::Threads execution_space ;
typedef TaskPolicy execution_policy ;
typedef Kokkos::Impl::ThreadsExecTeamMember member_type ;
private:
typedef Impl::TaskMember< Kokkos::Threads , void , void > task_root_type ;
int m_default_dependence_capacity ;
int m_team_size ; ///< Fixed size of a task-team
template< class FunctorType >
static inline
const task_root_type * get_task_root( const FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
}
template< class FunctorType >
static inline
task_root_type * get_task_root( FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< task_root_type * >( static_cast< task_type * >(f) );
}
public:
// Valid team sizes are 1,
// Threads::pool_size(1) == threads per numa, or
// Threads::pool_size(2) == threads per core
TaskPolicy( const unsigned arg_default_dependence_capacity = 4
, const unsigned arg_team_size = 0 /* default from thread pool topology */
);
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy & rhs )
: m_default_dependence_capacity( rhs.m_default_dependence_capacity )
, m_team_size( rhs.m_team_size )
{}
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy & rhs
, const unsigned arg_default_dependence_capacity )
: m_default_dependence_capacity( arg_default_dependence_capacity )
, m_team_size( rhs.m_team_size )
{}
TaskPolicy & operator = ( const TaskPolicy &rhs ) {
m_default_dependence_capacity = rhs.m_default_dependence_capacity;
m_team_size = rhs.m_team_size;
return *this;
}
// Create serial-thread task
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create( const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create< task_type , void >
( functor
, task_root_type::function_team_type(0)
, & task_root_type::template apply_single< task_type , void >
, 0
, ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
)
#endif
);
}
// Create thread-team task
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create_team( const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create< task_type , void >
( functor
, & task_root_type::template apply_team< task_type , void >
, task_root_type::function_single_type(0)
, Kokkos::Impl::FunctorTeamShmemSize< FunctorType >::value( functor , m_team_size )
, ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
)
#endif
);
}
template< class A1 , class A2 , class A3 , class A4 >
KOKKOS_INLINE_FUNCTION
void add_dependence( const Future<A1,A2> & after
, const Future<A3,A4> & before
, typename Kokkos::Impl::enable_if
< Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
&&
Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
after.m_task->add_dependence( before.m_task );
#endif
}
template< class FunctorType , class A3 , class A4 >
KOKKOS_INLINE_FUNCTION
void add_dependence( FunctorType * task_functor
, const Future<A3,A4> & before
, typename Kokkos::Impl::enable_if
< Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->add_dependence( before.m_task ); }
#else
{}
#endif
template< class ValueType >
const Future< ValueType , execution_space > &
spawn( const Future< ValueType , execution_space > & f ) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
f.m_task->schedule();
#endif
return f ;
}
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void respawn( FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->reschedule(); }
#else
{}
#endif
//----------------------------------------
// Functions for an executing task functor to query dependences,
// set new dependences, and respawn itself.
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< void , execution_space >
get_dependence( const FunctorType * task_functor , int i ) const
{
return Future<void,execution_space>(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
get_task_root(task_functor)->get_dependence(i)
#endif
);
}
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
int get_dependence( const FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return get_task_root(task_functor)->get_dependence(); }
#else
{ return 0 ; }
#endif
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void clear_dependence( FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->clear_dependence(); }
#else
{}
#endif
//----------------------------------------
static member_type & member_single();
friend void wait( TaskPolicy< Kokkos::Threads > & );
};
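// A minimal usage sketch (illustrative only; 'MyTask' is a hypothetical
// functor providing the nested value_type and apply() members this policy
// expects):
//
//   TaskPolicy< Kokkos::Threads > policy ;
//   Future< double , Kokkos::Threads > f =
//     policy.spawn( policy.create( MyTask() ) );
//   Kokkos::Experimental::wait( policy );  // drain the ready queues
//   const double answer = f.get();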
} /* namespace Experimental */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_THREADS_TASKPOLICY_HPP */
@ -0,0 +1,275 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
namespace Kokkos {
namespace Experimental {
namespace Impl {
bool
SharedAllocationRecord< void , void >::
is_sane( SharedAllocationRecord< void , void > * arg_record )
{
constexpr static SharedAllocationRecord * zero = 0 ;
SharedAllocationRecord * const root = arg_record ? arg_record->m_root : 0 ;
bool ok = root != 0 && root->m_count == 0 ;
if ( ok ) {
SharedAllocationRecord * root_next = 0 ;
// Lock the list:
while ( ( root_next = Kokkos::atomic_exchange( & root->m_next , zero ) ) == 0 );
for ( SharedAllocationRecord * rec = root_next ; ok && rec != root ; rec = rec->m_next ) {
const bool ok_non_null = rec && rec->m_prev && ( rec == root || rec->m_next );
const bool ok_root = ok_non_null && rec->m_root == root ;
const bool ok_prev_next = ok_non_null && ( rec->m_prev != root ? rec->m_prev->m_next == rec : root_next == rec );
const bool ok_next_prev = ok_non_null && rec->m_next->m_prev == rec ;
const bool ok_count = ok_non_null && 0 <= rec->m_count ;
ok = ok_root && ok_prev_next && ok_next_prev && ok_count ;
if ( ! ok ) {
fprintf(stderr,"Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12lx){ m_count(%d) m_root(0x%.12lx) m_next(0x%.12lx) m_prev(0x%.12lx) m_next->m_prev(0x%.12lx) m_prev->m_next(0x%.12lx) }\n"
, reinterpret_cast< unsigned long >( rec )
, rec->m_count
, reinterpret_cast< unsigned long >( rec->m_root )
, reinterpret_cast< unsigned long >( rec->m_next )
, reinterpret_cast< unsigned long >( rec->m_prev )
, reinterpret_cast< unsigned long >( rec->m_next->m_prev )
, reinterpret_cast< unsigned long >( rec->m_prev != rec->m_root ? rec->m_prev->m_next : root_next )
);
}
}
if ( zero != Kokkos::atomic_exchange( & root->m_next , root_next ) ) {
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane unlocking");
}
}
return ok ;
}
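// The list-locking idiom used above and throughout this file, as a standalone
// sketch (illustrative): the record set is "locked" by atomically swapping the
// root's next pointer to zero, so any competing insert/erase spins until the
// saved pointer is swapped back in.
//
//   SharedAllocationRecord * const null_rec = 0 ;
//   SharedAllocationRecord * saved = 0 ;
//   while ( ( saved = Kokkos::atomic_exchange( & root->m_next , null_rec ) ) == 0 ); // lock
//   /* ... traverse or mutate the list ... */
//   Kokkos::atomic_exchange( & root->m_next , saved ); // unlock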
SharedAllocationRecord<void,void> *
SharedAllocationRecord<void,void>::find( SharedAllocationRecord<void,void> * const arg_root , void * const arg_data_ptr )
{
constexpr static SharedAllocationRecord * zero = 0 ;
SharedAllocationRecord * root_next = 0 ;
// Lock the list:
while ( ( root_next = Kokkos::atomic_exchange( & arg_root->m_next , zero ) ) == 0 );
// Iterate searching for the record with this data pointer
SharedAllocationRecord * r = root_next ;
while ( ( r != arg_root ) && ( r->data() != arg_data_ptr ) ) { r = r->m_next ; }
if ( r == arg_root ) { r = 0 ; }
if ( zero != Kokkos::atomic_exchange( & arg_root->m_next , root_next ) ) {
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed locking/unlocking");
}
return r ;
}
/**\brief Construct and insert into 'arg_root' tracking set.
* use_count is zero.
*/
SharedAllocationRecord< void , void >::
SharedAllocationRecord( SharedAllocationRecord<void,void> * arg_root
, SharedAllocationHeader * arg_alloc_ptr
, size_t arg_alloc_size
, SharedAllocationRecord< void , void >::function_type arg_dealloc
)
: m_alloc_ptr( arg_alloc_ptr )
, m_alloc_size( arg_alloc_size )
, m_dealloc( arg_dealloc )
, m_root( arg_root )
, m_prev( 0 )
, m_next( 0 )
, m_count( 0 )
{
constexpr static SharedAllocationRecord * zero = 0 ;
// Insert into the root double-linked list for tracking
//
// before: arg_root->m_next == next ; next->m_prev == arg_root
// after: arg_root->m_next == this ; this->m_prev == arg_root ;
// this->m_next == next ; next->m_prev == this
m_prev = m_root ;
// Read root->m_next and lock by setting to zero
while ( ( m_next = Kokkos::atomic_exchange( & m_root->m_next , zero ) ) == 0 );
m_next->m_prev = this ;
if ( zero != Kokkos::atomic_exchange( & m_root->m_next , this ) ) {
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed locking/unlocking");
}
}
void
SharedAllocationRecord< void , void >::
increment( SharedAllocationRecord< void , void > * arg_record )
{
const int old_count = Kokkos::atomic_fetch_add( & arg_record->m_count , 1 );
if ( old_count < 0 ) { // Error
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed increment");
}
}
SharedAllocationRecord< void , void > *
SharedAllocationRecord< void , void >::
decrement( SharedAllocationRecord< void , void > * arg_record )
{
constexpr static SharedAllocationRecord * zero = 0 ;
const int old_count = Kokkos::atomic_fetch_add( & arg_record->m_count , -1 );
if ( old_count == 1 ) {
// before: arg_record->m_prev->m_next == arg_record &&
// arg_record->m_next->m_prev == arg_record
//
// after: arg_record->m_prev->m_next == arg_record->m_next &&
// arg_record->m_next->m_prev == arg_record->m_prev
SharedAllocationRecord * root_next = 0 ;
// Lock the list:
while ( ( root_next = Kokkos::atomic_exchange( & arg_record->m_root->m_next , zero ) ) == 0 );
arg_record->m_next->m_prev = arg_record->m_prev ;
if ( root_next != arg_record ) {
arg_record->m_prev->m_next = arg_record->m_next ;
}
else {
// before: arg_record->m_root == arg_record->m_prev
// after: arg_record->m_root == arg_record->m_next
root_next = arg_record->m_next ;
}
// Unlock the list:
if ( zero != Kokkos::atomic_exchange( & arg_record->m_root->m_next , root_next ) ) {
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed decrement unlocking");
}
arg_record->m_next = 0 ;
arg_record->m_prev = 0 ;
function_type d = arg_record->m_dealloc ;
(*d)( arg_record );
arg_record = 0 ;
}
else if ( old_count < 1 ) { // Error
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed decrement count");
}
return arg_record ;
}
void
SharedAllocationRecord< void , void >::
print_host_accessible_records( std::ostream & s
, const char * const space_name
, const SharedAllocationRecord * const root
, const bool detail )
{
const SharedAllocationRecord< void , void > * r = root ;
char buffer[256] ;
if ( detail ) {
do {
snprintf( buffer , 256 , "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"
, space_name
, reinterpret_cast<unsigned long>( r )
, reinterpret_cast<unsigned long>( r->m_prev )
, reinterpret_cast<unsigned long>( r->m_next )
, reinterpret_cast<unsigned long>( r->m_alloc_ptr )
, r->m_alloc_size
, r->m_count
, reinterpret_cast<unsigned long>( r->m_dealloc )
, r->m_alloc_ptr->m_label
);
s << buffer ;
r = r->m_next ;
} while ( r != root );
}
else {
do {
if ( r->m_alloc_ptr ) {
snprintf( buffer , 256 , "%s [ 0x%.12lx + %ld ] %s\n"
, space_name
, reinterpret_cast< unsigned long >( r->data() )
, r->size()
, r->m_alloc_ptr->m_label
);
}
else {
snprintf( buffer , 256 , "%s [ 0 + 0 ]\n" , space_name );
}
s << buffer ;
r = r->m_next ;
} while ( r != root );
}
}
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
@ -0,0 +1,287 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
namespace Kokkos {
namespace Experimental {
namespace Impl {
template< class MemorySpace = void , class DestroyFunctor = void >
class SharedAllocationRecord ;
class SharedAllocationHeader {
private:
typedef SharedAllocationRecord<void,void> Record ;
static constexpr unsigned maximum_label_length = ( 1u << 7 /* 128 */ ) - sizeof(Record*);
template< class , class > friend class SharedAllocationRecord ;
Record * m_record ;
char m_label[ maximum_label_length ];
public:
/* Given user memory, get a pointer to the header */
KOKKOS_INLINE_FUNCTION static
const SharedAllocationHeader * get_header( void * alloc_ptr )
{ return reinterpret_cast<SharedAllocationHeader*>( reinterpret_cast<char*>(alloc_ptr) - sizeof(SharedAllocationHeader) ); }
};
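// Layout sketch: an allocation is [ SharedAllocationHeader | user data ], so a
// pointer handed to the user maps back to its header by stepping over the
// 128-byte header (illustrative; 'user_ptr' is assumed to come from a record):
//
//   const SharedAllocationHeader * h = SharedAllocationHeader::get_header( user_ptr );
//   // the header stores the label and a pointer back to the tracking record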
template<>
class SharedAllocationRecord< void , void > {
protected:
static_assert( sizeof(SharedAllocationHeader) == ( 1u << 7 /* 128 */ ) , "sizeof(SharedAllocationHeader) != 128" );
template< class , class > friend class SharedAllocationRecord ;
typedef void (* function_type )( SharedAllocationRecord<void,void> * );
SharedAllocationHeader * const m_alloc_ptr ;
size_t const m_alloc_size ;
function_type const m_dealloc ;
SharedAllocationRecord * const m_root ;
SharedAllocationRecord * m_prev ;
SharedAllocationRecord * m_next ;
int m_count ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
/**\brief Construct and insert into 'arg_root' tracking set.
* use_count is zero.
*/
SharedAllocationRecord( SharedAllocationRecord * arg_root
, SharedAllocationHeader * arg_alloc_ptr
, size_t arg_alloc_size
, function_type arg_dealloc
);
public:
~SharedAllocationRecord() = default ;
constexpr SharedAllocationRecord()
: m_alloc_ptr( 0 )
, m_alloc_size( 0 )
, m_dealloc( 0 )
, m_root( this )
, m_prev( this )
, m_next( this )
, m_count( 0 )
{}
static constexpr unsigned maximum_label_length = SharedAllocationHeader::maximum_label_length ;
KOKKOS_INLINE_FUNCTION
const SharedAllocationHeader * head() const { return m_alloc_ptr ; }
/* User's memory begins at the end of the header */
KOKKOS_INLINE_FUNCTION
void * data() const { return reinterpret_cast<void*>( m_alloc_ptr + 1 ); }
/* Size of the user's memory, excluding the header */
constexpr size_t size() const { return m_alloc_size - sizeof(SharedAllocationHeader) ; }
/* Not 'constexpr' because 'm_count' is modified concurrently */
int use_count() const { return m_count ; }
/* Increment use count */
static void increment( SharedAllocationRecord * );
/* Decrement use count. If 1->0 then remove from the tracking list and invoke m_dealloc */
static SharedAllocationRecord * decrement( SharedAllocationRecord * );
/* Given a root record and data pointer find the record */
static SharedAllocationRecord * find( SharedAllocationRecord * const , void * const );
/* Sanity check for the whole set of records to which the input record belongs.
* Locks the set's insert/erase operations until the sanity check is complete.
*/
static bool is_sane( SharedAllocationRecord * );
/* Print host-accessible records */
static void print_host_accessible_records( std::ostream &
, const char * const space_name
, const SharedAllocationRecord * const root
, const bool detail );
};
/*
* Memory space specialization of SharedAllocationRecord< Space , void > requires :
*
* SharedAllocationRecord< Space , void > : public SharedAllocationRecord< void , void >
* {
* // delete allocated user memory via static_cast to this type.
* static void deallocate( const SharedAllocationRecord<void,void> * );
* Space m_space ;
* }
*/
template< class MemorySpace , class DestroyFunctor >
class SharedAllocationRecord : public SharedAllocationRecord< MemorySpace , void >
{
private:
static void deallocate( SharedAllocationRecord<void,void> * record_ptr )
{ delete static_cast<SharedAllocationRecord<MemorySpace,DestroyFunctor>*>(record_ptr); }
SharedAllocationRecord( const MemorySpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc
)
/* Allocate user memory as [ SharedAllocationHeader , user_memory ] */
: SharedAllocationRecord< MemorySpace , void >( arg_space , arg_label , arg_alloc , & deallocate )
, m_destroy()
{}
~SharedAllocationRecord() { m_destroy.destroy_shared_allocation(); }
public:
DestroyFunctor m_destroy ;
// Allocate with a zero use count. Incrementing the use count from zero to one
// inserts the record into the tracking list. Decrementing the count from one to zero
// removes from the tracking list and deallocates.
KOKKOS_INLINE_FUNCTION static
SharedAllocationRecord * allocate( const MemorySpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc );
#else
return (SharedAllocationRecord *) 0 ;
#endif
}
};
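// Sketch of the DestroyFunctor hook (illustrative; 'SomeSpace' and 'Finalize'
// are hypothetical): the functor is default-constructed with the record,
// populated before first use, and its destroy_shared_allocation() runs when
// the use count returns to zero.
//
//   struct Finalize {
//     void destroy_shared_allocation() { /* flush or finalize the user data */ }
//   };
//   SharedAllocationRecord< SomeSpace , Finalize > * rec =
//     SharedAllocationRecord< SomeSpace , Finalize >::allocate( space , "label" , bytes );
//   rec->m_destroy = Finalize();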
union SharedAllocationTracker {
private:
typedef SharedAllocationRecord<void,void> Record ;
enum : unsigned long {
DO_NOT_DEREF_FLAG = 0x01ul
};
// The allocation record resides in Host memory space
Record * m_record ;
unsigned long m_record_bits;
// Set the DO_NOT_DEREF flag while preserving the pointer bits so that
// increment/decrement will skip this record.
KOKKOS_INLINE_FUNCTION
static Record * disable( Record * rec )
{ return reinterpret_cast<Record*>( reinterpret_cast<unsigned long>( rec ) | DO_NOT_DEREF_FLAG ); }
KOKKOS_INLINE_FUNCTION
void increment() const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::increment( m_record );
#endif
}
KOKKOS_INLINE_FUNCTION
void decrement() const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::decrement( m_record );
#endif
}
public:
KOKKOS_INLINE_FUNCTION
constexpr SharedAllocationTracker() : m_record_bits( DO_NOT_DEREF_FLAG ) {}
template< class MemorySpace >
constexpr
SharedAllocationRecord< MemorySpace , void > & get_record() const
{ return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); }
template< class MemorySpace >
std::string get_label() const
{ return static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record )->get_label(); }
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker( Record * arg_record )
: m_record( arg_record ) { increment(); }
KOKKOS_INLINE_FUNCTION
~SharedAllocationTracker() { decrement(); }
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker( const SharedAllocationTracker & rhs )
: m_record( rhs.m_record ) { increment(); }
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker( SharedAllocationTracker && rhs )
: m_record( rhs.m_record ) { rhs.m_record_bits = DO_NOT_DEREF_FLAG ; }
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker & operator = ( const SharedAllocationTracker & rhs )
{
decrement();
m_record = rhs.m_record ;
increment();
return *this ;
}
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker & operator = ( SharedAllocationTracker && rhs )
{
decrement(); // release the record currently held, matching copy assignment
m_record = rhs.m_record ;
rhs.m_record_bits = DO_NOT_DEREF_FLAG ;
return *this ;
}
};
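// Reference-counting semantics sketch (illustrative; 'some_record' is a
// hypothetical Record*): copy construction and copy assignment adjust the use
// count, move operations transfer it, and a tracker whose bits carry
// DO_NOT_DEREF_FLAG never touches the record.
//
//   SharedAllocationTracker a( some_record );    // use count +1
//   SharedAllocationTracker b( a );              // copy: +1
//   SharedAllocationTracker c( std::move(b) );   // move: count unchanged, b disarmed
//   // destruction of 'a' and 'c' each decrement; 'b' does nothing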
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
@ -0,0 +1,416 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_EXPERIMENTAL_IMPL_VIEW_ALLOC_PROP_HPP
#define KOKKOS_EXPERIMENTAL_IMPL_VIEW_ALLOC_PROP_HPP
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
struct WithoutInitializing_t {};
struct AllowPadding_t {};
template< class ... Parameters >
struct ViewAllocProp ;
template<>
struct ViewAllocProp<> {
struct NullSpace {};
typedef std::false_type allow_padding_t ;
typedef std::true_type initialize_t ;
typedef NullSpace memory_space ;
typedef NullSpace execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp()
: label()
, memory()
, execution()
, allow_padding()
, initialize()
{}
ViewAllocProp( const std::string & arg_label )
: label( arg_label )
, memory()
, execution()
, allow_padding()
, initialize()
{}
};
template< class ... Parameters >
struct ViewAllocProp< const char * , Parameters ... >
{
typedef ViewAllocProp< Parameters ... > base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef typename base_prop_type::memory_space memory_space ;
typedef typename base_prop_type::execution_space execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const char * const arg_label , Parameters ... arg_param )
: label( arg_label )
, memory( base_prop_type( arg_param ... ).memory )
, execution( base_prop_type( arg_param ... ).execution )
, allow_padding()
, initialize()
{}
};
template< class ... Parameters >
struct ViewAllocProp< std::string , Parameters ... >
{
typedef ViewAllocProp< Parameters ... > base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef typename base_prop_type::memory_space memory_space ;
typedef typename base_prop_type::execution_space execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const std::string & arg_label , Parameters ... arg_param )
: label( arg_label )
, memory( base_prop_type( arg_param ... ).memory )
, execution( base_prop_type( arg_param ... ).execution )
, allow_padding()
, initialize()
{}
};
template< class ... Parameters >
struct ViewAllocProp< WithoutInitializing_t , Parameters ... >
{
typedef ViewAllocProp< Parameters ... > base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef std::false_type initialize_t ;
typedef typename base_prop_type::memory_space memory_space ;
typedef typename base_prop_type::execution_space execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const WithoutInitializing_t & , Parameters ... arg_param )
: label( base_prop_type( arg_param ... ).label )
, memory( base_prop_type( arg_param ... ).memory )
, execution( base_prop_type( arg_param ... ).execution )
, allow_padding()
, initialize()
{}
};
template< class ... Parameters >
struct ViewAllocProp< AllowPadding_t , Parameters ... >
{
typedef ViewAllocProp< Parameters ... > base_prop_type ;
typedef std::true_type allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef typename base_prop_type::memory_space memory_space ;
typedef typename base_prop_type::execution_space execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const AllowPadding_t & , Parameters ... arg_param )
: label( base_prop_type( arg_param ... ).label )
, memory( base_prop_type( arg_param ... ).memory )
, execution( base_prop_type( arg_param ... ).execution )
, allow_padding()
, initialize()
{}
};
template< class Space , class ... Parameters >
struct ViewAllocProp< Space , Parameters ... >
{
enum { is_exec = Kokkos::Impl::is_execution_space< Space >::value };
enum { is_mem = Kokkos::Impl::is_memory_space< Space >::value };
static_assert( is_exec || is_mem , "View allocation given unknown parameter" );
typedef ViewAllocProp< Parameters ... > base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef typename std::conditional< is_mem , Space , typename base_prop_type::memory_space >::type memory_space ;
typedef typename std::conditional< is_exec , Space , typename base_prop_type::execution_space >::type execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
// Templated so that 'base_prop_type( args ... ).execution'
// is not used unless arg_space == memory_space.
template< class ... Args >
ViewAllocProp( const memory_space & arg_space , Args ... args )
: label( base_prop_type( args ... ).label )
, memory( arg_space )
, execution( base_prop_type( args ... ).execution )
, allow_padding()
, initialize()
{}
// Templated so that 'base_prop_type( args ... ).memory'
// is not used unless arg_space == execution_space.
template< class ... Args >
ViewAllocProp( const execution_space & arg_space , Args ... args )
: label( base_prop_type( args ... ).label )
, memory( base_prop_type( args ... ).memory )
, execution( arg_space )
, allow_padding()
, initialize()
{}
};
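// Composition sketch (illustrative): each specialization peels one parameter
// off the front of the pack and forwards the rest, so properties may be given
// in any order after the label.
//
//   ViewAllocProp< std::string , WithoutInitializing_t , Kokkos::HostSpace >
//     prop( std::string("A") , WithoutInitializing_t() , Kokkos::HostSpace() );
//   // prop.label == "A" ; prop.initialize is std::false_type ;
//   // prop.memory is a Kokkos::HostSpace instance.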
template< class ExecSpace , class MemSpace >
struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace > , std::string >
{
typedef ViewAllocProp<> base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef MemSpace memory_space ;
typedef ExecSpace execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const std::string & arg_label )
: label( arg_label )
, memory()
, execution()
, allow_padding()
, initialize()
{}
};
template< class ExecSpace , class MemSpace , unsigned N >
struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace > , char[N] >
{
typedef ViewAllocProp<> base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef MemSpace memory_space ;
typedef ExecSpace execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const char * const arg_label )
: label( arg_label )
, memory()
, execution()
, allow_padding()
, initialize()
{}
};
// Deprecated in favor of view_alloc( Kokkos::WithoutInitializing )
template< class ExecSpace , class MemSpace >
struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace >
, Kokkos::ViewAllocateWithoutInitializing
>
{
typedef ViewAllocProp<> base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef std::false_type initialize_t ;
typedef MemSpace memory_space ;
typedef ExecSpace execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const Kokkos::ViewAllocateWithoutInitializing & arg )
: label( arg.label )
, memory()
, execution()
, allow_padding()
, initialize()
{}
};
template< class ExecSpace , class MemSpace , class ... Parameters >
struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace >
, ViewAllocProp< Parameters ... >
>
{
typedef ViewAllocProp< Parameters ... > base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef MemSpace memory_space ;
typedef
typename std::conditional
< Kokkos::Impl::is_execution_space< typename base_prop_type::execution_space >::value
, typename base_prop_type::execution_space
, ExecSpace
>::type execution_space ;
static_assert( std::is_same< typename base_prop_type::memory_space , ViewAllocProp<>::NullSpace >::value ||
std::is_same< typename base_prop_type::memory_space , memory_space >::value
, "View allocation given incompatible memory space" );
static_assert( Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename execution_space::memory_space
, memory_space >::value
, "View allocation given incompatible execution space" );
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
// If the input properties have a memory or execution space then copy construct those spaces
// otherwise default construct those spaces.
template< class P >
ViewAllocProp( const P & arg_prop
, typename std::enable_if
< std::is_same< P , base_prop_type >::value &&
Kokkos::Impl::is_memory_space< typename P::memory_space >::value &&
Kokkos::Impl::is_execution_space< typename P::execution_space >::value
>::type * = 0 )
: label( arg_prop.label )
, memory( arg_prop.memory )
, execution( arg_prop.execution )
, allow_padding()
, initialize()
{}
template< class P >
ViewAllocProp( const P & arg_prop
, typename std::enable_if
< std::is_same< P , base_prop_type >::value &&
Kokkos::Impl::is_memory_space< typename P::memory_space >::value &&
! Kokkos::Impl::is_execution_space< typename P::execution_space >::value
>::type * = 0 )
: label( arg_prop.label )
, memory( arg_prop.memory )
, execution()
, allow_padding()
, initialize()
{}
template< class P >
ViewAllocProp( const P & arg_prop
, typename std::enable_if
< std::is_same< P , base_prop_type >::value &&
! Kokkos::Impl::is_memory_space< typename P::memory_space >::value &&
Kokkos::Impl::is_execution_space< typename P::execution_space >::value
>::type * = 0 )
: label( arg_prop.label )
, memory()
, execution( arg_prop.execution )
, allow_padding()
, initialize()
{}
template< class P >
ViewAllocProp( const P & arg_prop
, typename std::enable_if
< std::is_same< P , base_prop_type >::value &&
! Kokkos::Impl::is_memory_space< typename P::memory_space >::value &&
! Kokkos::Impl::is_execution_space< typename P::execution_space >::value
>::type * = 0 )
: label( arg_prop.label )
, memory()
, execution()
, allow_padding()
, initialize()
{}
};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif

File diff suppressed because it is too large
@ -0,0 +1,844 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Singleton.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Error.hpp>
#include <string>
#include <vector>
#include <sstream>
#include <algorithm>
#include <utility>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <iomanip>
/* Enable clean up of memory leaks */
#define CLEAN_UP_MEMORY_LEAKS 0
namespace Kokkos { namespace Impl {
namespace {
//-----------------------------------------------------------------------------
// AllocationRecord
//-----------------------------------------------------------------------------
//
// Used to track details about an allocation and provide a ref count
// sizeof(AllocationRecord) == 128
struct AllocationRecord
{
enum {
OFFSET = sizeof(AllocatorBase*) // allocator
+ sizeof(void*) // alloc_ptr
+ sizeof(uint64_t) // alloc_size
+ sizeof(AllocatorAttributeBase*) // attribute
+ sizeof(uint32_t) // node_index
+ sizeof(uint32_t) // ref_count
, LABEL_LENGTH = 128 - OFFSET
};
AllocatorBase * const allocator;
void * const alloc_ptr;
const uint64_t alloc_size;
AllocatorAttributeBase * const attribute;
const int32_t node_index;
volatile uint32_t ref_count;
const char label[LABEL_LENGTH];
AllocationRecord( AllocatorBase * const arg_allocator
, void * arg_alloc_ptr
, uint64_t arg_alloc_size
, int32_t arg_node_index
, const std::string & arg_label
)
: allocator(arg_allocator)
, alloc_ptr(arg_alloc_ptr)
, alloc_size(arg_alloc_size)
, attribute(NULL)
, node_index(arg_node_index)
, ref_count(1)
, label() // zero fill
{
const size_t length = static_cast<size_t>(LABEL_LENGTH-1u) < arg_label.size() ? static_cast<size_t>(LABEL_LENGTH-1u) : arg_label.size();
strncpy( const_cast<char *>(label), arg_label.c_str(), length );
}
~AllocationRecord()
{
if (attribute) {
delete attribute;
}
}
uint32_t increment_ref_count()
{
uint32_t old_value = atomic_fetch_add( &ref_count, static_cast<uint32_t>(1) );
return old_value + 1u;
}
uint32_t decrement_ref_count()
{
uint32_t old_value = atomic_fetch_sub( &ref_count, static_cast<uint32_t>(1) );
return old_value - 1u;
}
void print( std::ostream & oss ) const
{
oss << "{ " << allocator->name()
<< " } : \"" << label
<< "\" ref_count(" << ref_count
<< ") memory[ " << alloc_ptr
<< " + " << alloc_size
<< " ]" ;
}
bool set_attribute( AllocatorAttributeBase * attr )
{
bool result = false;
if (attribute == NULL) {
result = NULL == atomic_compare_exchange( const_cast<AllocatorAttributeBase **>(&attribute)
, reinterpret_cast<AllocatorAttributeBase *>(NULL)
, attr );
}
return result;
}
// disallow copy and assignment
AllocationRecord( const AllocationRecord & );
AllocationRecord & operator=(const AllocationRecord &);
};
template <int NumBlocks>
struct Bitset
{
enum { blocks = NumBlocks };
enum { size = blocks * 64 };
enum { block_mask = 63u };
enum { block_shift = 6 };
// used to find free bits in a bitset
static int count_trailing_zeros(uint64_t x)
{
#if defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG ) || defined( KOKKOS_COMPILER_APPLECC )
return x ? __builtin_ctzll(x) : 64;
#elif defined( KOKKOS_COMPILER_INTEL )
enum { shift = 32 };
enum { mask = (static_cast<uint64_t>(1) << shift) - 1u };
return (x & mask) ? _bit_scan_forward(static_cast<int>(x & mask)) :
(x >> shift) ? shift + _bit_scan_forward(static_cast<int>(x >> shift)) :
64 ;
#elif defined( KOKKOS_COMPILER_IBM )
return x ? __cnttz8(x) : 64;
#else
int i = 0;
for (; ((x & (static_cast<uint64_t>(1) << i)) == 0u) && i < 64; ++i ) {}
return i;
#endif
}
Bitset()
: m_bits()
{
for (int i=0; i < blocks; ++i) {
m_bits[i] = 0u;
}
}
bool set( int i )
{
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return !( atomic_fetch_or( m_bits + (i >> block_shift), bit ) & bit );
}
bool reset( int i )
{
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return atomic_fetch_and( m_bits + (i >> block_shift), ~bit ) & bit;
}
bool test( int i )
{
const uint64_t block = m_bits[ i >> block_shift ];
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return block & bit;
}
int find_first_unset() const
{
for (int i=0; i < blocks; ++i) {
const uint64_t block = m_bits[i];
int b = count_trailing_zeros( ~block );
if ( b < 64 ) {
return (i << block_shift) + b;
}
}
return size;
}
volatile uint64_t m_bits[blocks];
};
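// Usage sketch (illustrative): concurrently claim and release slots.
//
//   Bitset<15> bits ;                  // 15 * 64 = 960 bits
//   int i = bits.find_first_unset() ;  // candidate slot
//   if ( i < bits.size && bits.set(i) ) { /* slot i claimed by this thread */ }
//   bits.reset(i) ;                    // release slot i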
//-----------------------------------------------------------------------------
// AllocationRecordPool -- singleton class
//
// global_alloc_rec_pool is the ONLY instance of this class
//
//-----------------------------------------------------------------------------
// Record AllocationRecords in a lock-free circular list.
// Each node in the list has a buffer with space for 959 ((15*64)-1) records
// managed by a bitset. Atomics are used to set and reset bits in the bit set.
// The head of the list is atomically updated to the last node found with
// unused space.
//
// Cost to create an allocation record: amortized O(1), worst case O(num nodes)
// Cost to destroy an allocation record: O(1)
//
// Singleton allocations are pushed onto a lock-free stack that is destroyed
// after the circular list of allocation records.
struct AllocationRecordPool
{
enum { BITSET_BLOCKS = 15 };
typedef Bitset<BITSET_BLOCKS> bitset_type;
enum { BUFFER_SIZE = (bitset_type::size - 1) * sizeof(AllocationRecord) };
struct AllocationNode
{
AllocationNode()
: next()
, bitset()
, buffer()
{
// set the first bit to used
bitset.set(0);
}
void * get_buffer( int32_t node_index )
{
return buffer + (node_index-1) * sizeof(AllocationRecord);
}
// return 0 if no space is available in the node
int32_t get_node_index()
{
int32_t node_index = 0;
do {
node_index = bitset.find_first_unset();
// successfully claimed a bit
if ( node_index != bitset.size && bitset.set(node_index) )
{
return node_index;
}
} while ( node_index != bitset.size );
return 0;
}
void clear_node_index( int32_t node_index )
{
bitset.reset(node_index);
}
AllocationNode * next;
bitset_type bitset;
char buffer[BUFFER_SIZE];
};
struct SingletonNode
{
void * buffer;
SingletonNode * next;
Impl::singleton_destroy_function_type destroy;
SingletonNode( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
: buffer(NULL)
, next(NULL)
, destroy(destroy_func)
{
if (size) {
buffer = malloc(size);
create_func(buffer);
}
}
~SingletonNode()
{
if (buffer) {
try {
destroy(buffer);
} catch(...) {}
free(buffer);
}
}
};
AllocationRecordPool()
: head( new AllocationNode() )
, singleton_head(NULL)
{
// setup ring
head->next = head;
}
~AllocationRecordPool()
{
// delete allocation records
{
AllocationNode * start = head;
AllocationNode * curr = start;
std::vector< std::string > string_vec;
do {
AllocationNode * next = curr->next;
#if defined( KOKKOS_DEBUG_PRINT_ALLOCATION_BITSET )
// print node bitset
for (int i=0; i < bitset_type::blocks; ++i ) {
std::cout << std::hex << std::showbase << curr->bitset.m_bits[i] << " ";
}
std::cout << std::endl;
#endif
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
std::ostringstream oss;
alloc_rec->print( oss );
string_vec.push_back( oss.str() );
#if CLEAN_UP_MEMORY_LEAKS
/* Cleaning up memory leaks prevents memory error detection tools
* from reporting the original source of allocation, which can
* impede debugging with such tools.
*/
try {
destroy(alloc_rec);
}
catch(...) {}
#endif
}
}
curr->next = NULL;
delete curr;
curr = next;
} while ( curr != start );
if ( !string_vec.empty() ) {
std::sort( string_vec.begin(), string_vec.end() );
std::ostringstream oss;
oss << "Error: Allocation pool destroyed with the following memory leak(s):\n";
for (size_t i=0; i< string_vec.size(); ++i)
{
oss << " " << string_vec[i] << std::endl;
}
std::cerr << oss.str() << std::endl;
}
}
// delete singletons
{
SingletonNode * curr = singleton_head;
while (curr) {
SingletonNode * next = curr->next;
delete curr;
curr = next;
}
}
}
AllocationRecord * create( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label
)
{
AllocationNode * start = volatile_load(&head);
AllocationNode * curr = start;
int32_t node_index = curr->get_node_index();
if (node_index == 0) {
curr = volatile_load(&curr->next);
}
while (node_index == 0 && curr != start)
{
node_index = curr->get_node_index();
if (node_index == 0) {
curr = volatile_load(&curr->next);
}
}
// Need to allocate and insert a new node
if (node_index == 0 && curr == start)
{
AllocationNode * new_node = new AllocationNode();
node_index = new_node->get_node_index();
AllocationNode * next = NULL;
do {
next = volatile_load(&curr->next);
new_node->next = next;
memory_fence();
} while ( next != atomic_compare_exchange( &(curr->next), next, new_node ) );
curr = new_node;
}
void * buffer = curr->get_buffer(node_index);
// try to set head to curr
if ( start != curr )
{
atomic_compare_exchange( & head, start, curr );
}
return new (buffer) AllocationRecord( arg_allocator
, arg_alloc_ptr
, arg_alloc_size
, node_index
, arg_label
);
}
void destroy( AllocationRecord * alloc_rec )
{
if (alloc_rec) {
const int32_t node_index = alloc_rec->node_index;
AllocationNode * node = get_node( alloc_rec );
// deallocate memory
alloc_rec->allocator->deallocate( alloc_rec->alloc_ptr, alloc_rec->alloc_size );
// call destructor
alloc_rec->~AllocationRecord();
// wait for writes to complete
memory_fence();
// clear node index
node->clear_node_index( node_index );
}
}
void * create_singleton( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
{
SingletonNode * node = new SingletonNode( size, create_func, destroy_func );
SingletonNode * next;
// insert new node at the head of the list
do {
next = volatile_load(&singleton_head);
node->next = next;
} while ( next != atomic_compare_exchange( &singleton_head, next, node ) );
return node->buffer;
}
void print_memory( std::ostream & out ) const
{
AllocationNode * start = head;
AllocationNode * curr = start;
std::vector< std::string > string_vec;
do {
AllocationNode * next = curr->next;
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
std::ostringstream oss;
alloc_rec->print( oss );
string_vec.push_back( oss.str() );
}
}
curr = next;
} while ( curr != start );
if ( !string_vec.empty() ) {
std::sort( string_vec.begin(), string_vec.end() );
std::ostringstream oss;
oss << "Tracked Memory:" << std::endl;
for (size_t i=0; i< string_vec.size(); ++i)
{
oss << " " << string_vec[i] << std::endl;
}
out << oss.str() << std::endl;
}
else {
out << "No Tracked Memory" << std::endl;
}
}
// find an AllocationRecord such that
// alloc_ptr <= ptr < alloc_ptr + alloc_size
// otherwise return NULL
AllocationRecord * find( void const * ptr, AllocatorBase const * allocator ) const
{
AllocationNode * start = head;
AllocationNode * curr = start;
char const * const char_ptr = reinterpret_cast<const char *>(ptr);
do {
AllocationNode * next = curr->next;
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
char const * const alloc_ptr = reinterpret_cast<char const *>(alloc_rec->alloc_ptr);
if ( (allocator == alloc_rec->allocator)
&& (alloc_ptr <= char_ptr)
&& (char_ptr < (alloc_ptr + alloc_rec->alloc_size)) )
{
return alloc_rec;
}
}
}
curr = next;
} while ( curr != start );
return NULL;
}
private:
AllocationNode * get_node( AllocationRecord * alloc_rec )
{
return reinterpret_cast<AllocationNode *>( alloc_rec - alloc_rec->node_index);
}
AllocationNode * head;
SingletonNode * singleton_head;
};
// create the global pool for allocation records
AllocationRecordPool global_alloc_rec_pool;
// convert a uintptr_t to an AllocationRecord pointer
inline
AllocationRecord * to_alloc_rec( uintptr_t alloc_rec )
{
return reinterpret_cast<AllocationRecord *>( alloc_rec & ~static_cast<uintptr_t>(1) );
}
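// Illustrative sketch (not part of the library): the least significant bit of
// an AllocationTracker's m_alloc_rec doubles as the "reference counting" flag,
// and to_alloc_rec() masks it off to recover the record pointer:
//
//   uintptr_t raw = reinterpret_cast<uintptr_t>(rec) | uintptr_t(1); // counted
//   AllocationRecord * back = to_alloc_rec( raw );  // low bit cleared
//   bool counting = raw & uintptr_t(1);             // true
//
// This works because AllocationRecord has alignment greater than 1, so the
// low bit of a valid record pointer is always zero.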
} // unnamed namespace
//-----------------------------------------------------------------------------
// Allocation Tracker methods
//-----------------------------------------------------------------------------
// Create a reference counted AllocationTracker
void AllocationTracker::initalize( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label
)
{
if ( arg_allocator && arg_alloc_ptr && arg_alloc_size) {
// create record
AllocationRecord * alloc_rec = global_alloc_rec_pool.create( arg_allocator
, arg_alloc_ptr
, arg_alloc_size
, arg_label
);
m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
}
}
void AllocationTracker::reallocate( size_t size ) const
{
AllocationRecord * rec = to_alloc_rec( m_alloc_rec );
void * the_alloc_ptr = rec->allocator->reallocate( rec->alloc_ptr, rec->alloc_size, size );
if ( NULL != the_alloc_ptr )
{
*const_cast<void **>(&rec->alloc_ptr) = the_alloc_ptr;
*const_cast<uint64_t *>(&rec->alloc_size) = size;
}
else {
Impl::throw_runtime_exception( "Error: unable to reallocate allocation tracker");
}
}
void AllocationTracker::increment_ref_count() const
{
to_alloc_rec( m_alloc_rec )->increment_ref_count();
}
void AllocationTracker::decrement_ref_count() const
{
AllocationRecord * alloc_rec = to_alloc_rec( m_alloc_rec );
uint32_t the_ref_count = alloc_rec->decrement_ref_count();
if (the_ref_count == 0u) {
try {
global_alloc_rec_pool.destroy( alloc_rec );
}
catch(...) {}
}
}
namespace {
struct NullAllocator { static const char * name() { return "Null Allocator"; } };
}
AllocatorBase * AllocationTracker::allocator() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->allocator;
}
return Allocator<NullAllocator>::singleton();
}
void * AllocationTracker::alloc_ptr() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->alloc_ptr;
}
return NULL;
}
size_t AllocationTracker::alloc_size() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->alloc_size;
}
return 0u;
}
size_t AllocationTracker::ref_count() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->ref_count;
}
return 0u;
}
char const * AllocationTracker::label() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->label;
}
return "[Empty Allocation Tracker]";
}
void AllocationTracker::print( std::ostream & oss) const
{
if (m_alloc_rec & REF_COUNT_MASK) {
to_alloc_rec(m_alloc_rec)->print(oss);
}
else {
oss << label();
}
}
bool AllocationTracker::set_attribute( AllocatorAttributeBase * attr ) const
{
bool result = false;
if (m_alloc_rec & REF_COUNT_MASK) {
result = to_alloc_rec(m_alloc_rec)->set_attribute(attr);
}
return result;
}
AllocatorAttributeBase * AllocationTracker::attribute() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->attribute;
}
return NULL;
}
void AllocationTracker::print_tracked_memory( std::ostream & out )
{
global_alloc_rec_pool.print_memory( out );
}
AllocationTracker AllocationTracker::find( void const * ptr, AllocatorBase const * arg_allocator )
{
AllocationRecord * alloc_rec = global_alloc_rec_pool.find(ptr, arg_allocator);
AllocationTracker tracker;
if ( alloc_rec != NULL )
{
if ( tracking_enabled() ) {
alloc_rec->increment_ref_count();
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
}
else {
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec);
}
}
return tracker ;
}
//-----------------------------------------------------------------------------
// static AllocationTracker
//-----------------------------------------------------------------------------
#if defined( KOKKOS_USE_DECENTRALIZED_HOST )
namespace {
// TODO : Detect compiler support for thread local variables
#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
bool g_thread_local_tracking_enabled = true;
#pragma omp threadprivate(g_thread_local_tracking_enabled)
#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
__thread bool g_thread_local_tracking_enabled = true;
#elif defined( KOKKOS_HAVE_OPENMP )
bool g_thread_local_tracking_enabled = true;
#pragma omp threadprivate(g_thread_local_tracking_enabled)
#elif defined( KOKKOS_HAVE_PTHREAD )
__thread bool g_thread_local_tracking_enabled = true;
#elif defined( KOKKOS_HAVE_SERIAL )
bool g_thread_local_tracking_enabled = true;
#endif
} // unnamed namespace
void AllocationTracker::disable_tracking()
{
g_thread_local_tracking_enabled = false;
}
void AllocationTracker::enable_tracking()
{
g_thread_local_tracking_enabled = true;
}
bool AllocationTracker::tracking_enabled()
{
return g_thread_local_tracking_enabled;
}
#else
namespace {
enum TrackingEnum { TRACKING_ENABLED, TRACKING_DISABLED };
volatile TrackingEnum g_tracking_enabled = TRACKING_ENABLED;
}
void AllocationTracker::disable_tracking()
{
if ( TRACKING_ENABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_ENABLED, TRACKING_DISABLED ) ) {
Impl::throw_runtime_exception("Error: Tracking already disabled");
}
}
void AllocationTracker::enable_tracking()
{
if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_DISABLED, TRACKING_ENABLED ) ) {
Impl::throw_runtime_exception("Error: Tracking already enabled");
}
}
bool AllocationTracker::tracking_enabled()
{
return g_tracking_enabled == TRACKING_ENABLED;
}
#endif
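// Note (a sketch of the two modes, not upstream commentary): with
// KOKKOS_USE_DECENTRALIZED_HOST each host thread owns its own tracking flag
// (OpenMP threadprivate or __thread), so disable/enable only affect the
// calling thread. Otherwise a single volatile flag is toggled through
// atomic_compare_exchange, which also detects mismatched disable/disable or
// enable/enable calls and throws.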
//-----------------------------------------------------------------------------
// create singleton free function
//-----------------------------------------------------------------------------
void * create_singleton( size_t size
, Impl::singleton_create_function_type create_func
, Impl::singleton_destroy_function_type destroy_func )
{
return global_alloc_rec_pool.create_singleton( size, create_func, destroy_func );
}
}} // namespace Kokkos::Impl
#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */

View File

@@ -0,0 +1,586 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ALLOCATION_TRACKER_HPP
#define KOKKOS_ALLOCATION_TRACKER_HPP
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h>
#include <cstdlib>
#include <string>
#include <iosfwd>
namespace Kokkos { namespace Impl {
//-----------------------------------------------------------------------------
// Create Singleton objects
//-----------------------------------------------------------------------------
typedef void * (*singleton_create_function_type)(void * buffer);
typedef void (*singleton_destroy_function_type)(void *);
void * create_singleton( size_t size
, singleton_create_function_type create_func
, singleton_destroy_function_type destroy_func
);
/// class Singleton
///
/// Default constructs a singleton of the given type. This class is used to
/// circumvent order-of-construction issues. Singleton objects are destroyed
/// after all other allocations, in the reverse order of their creation.
template <typename Type>
class Singleton
{
public:
/// Get a pointer to the Singleton. Default construct the singleton if it does not already exist
static Type * get()
{
static Type * singleton = NULL;
if (singleton == NULL) {
Impl::singleton_create_function_type create_func = &create;
Impl::singleton_destroy_function_type destroy_func = &destroy;
singleton = reinterpret_cast<Type*>( Impl::create_singleton( sizeof(Type), create_func, destroy_func ) );
}
return singleton;
}
private:
/// Call the Type destructor
static void destroy(void * ptr)
{
reinterpret_cast<Type*>(ptr)->~Type();
}
/// placement new the Type in buffer
static void * create(void * buffer)
{
return new (buffer) Type();
}
};
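// Usage sketch (MyCache is a hypothetical default-constructible type, not a
// Kokkos class):
//
//   MyCache * cache = Kokkos::Impl::Singleton<MyCache>::get();
//
// The first call placement-news a MyCache into pool-managed storage and
// registers the destroy callback; subsequent calls return the same pointer.
// The pool runs the registered destroy callbacks after all tracked
// allocations are released, in reverse order of creation.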
//-----------------------------------------------------------------------------
// AllocatorBase
//-----------------------------------------------------------------------------
/// class AllocatorBase
///
/// Abstract base class for all Allocators.
/// Allocators should be singleton objects; use Singleton<Allocator>::get()
/// to create them and avoid order-of-destruction issues.
class AllocatorBase
{
public:
/// name of the allocator
/// used to report memory leaks
virtual const char * name() const = 0;
/// Allocate a buffer of size number of bytes
virtual void* allocate(size_t size) const = 0;
/// Deallocate a buffer of size number of bytes
/// The pointer must have been allocated by a call to the corresponding allocate
virtual void deallocate(void * ptr, size_t size) const = 0;
/// Changes the size of the memory block pointed to by ptr.
/// Ptr must have been allocated with the corresponding allocate call
/// The function may move the memory block to a new location
/// (whose address is returned by the function).
///
/// The content of the memory block is preserved up to the lesser of the new and
/// old sizes, even if the block is moved to a new location. If the new size is larger,
/// the value of the newly allocated portion is indeterminate.
///
/// If ptr is a null pointer, the function behaves like allocate, allocating
/// a new block of new_size bytes and returning a pointer to its beginning.
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const = 0;
/// can a texture object be bound to the allocated memory
virtual bool support_texture_binding() const = 0;
/// virtual destructor
virtual ~AllocatorBase() {}
};
/// class AllocatorAttributeBase
class AllocatorAttributeBase
{
public:
virtual ~AllocatorAttributeBase() {}
};
//-----------------------------------------------------------------------------
// Allocator< StaticAllocator > : public AllocatorBase
//-----------------------------------------------------------------------------
// HasStaticName
template<typename T>
class HasStaticName
{
typedef const char * (*static_method)();
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::name>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticName<T>::value, const char *>::type
allocator_name()
{
return T::name();
}
template <typename T>
inline
typename enable_if<!HasStaticName<T>::value, const char *>::type
allocator_name()
{
return "Unnamed Allocator";
}
// HasStaticAllocate
template<typename T>
class HasStaticAllocate
{
typedef void * (*static_method)(size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::allocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticAllocate<T>::value, void *>::type
allocator_allocate(size_t size)
{
return T::allocate(size);
}
template <typename T>
inline
typename enable_if<!HasStaticAllocate<T>::value, void *>::type
allocator_allocate(size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot allocate memory!") );
return NULL;
}
// HasStaticDeallocate
template<typename T>
class HasStaticDeallocate
{
typedef void (*static_method)(void *, size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::deallocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticDeallocate<T>::value, void>::type
allocator_deallocate(void * ptr, size_t size)
{
T::deallocate(ptr,size);
}
template <typename T>
inline
typename enable_if<!HasStaticDeallocate<T>::value, void>::type
allocator_deallocate(void *, size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot deallocate memory!") );
}
// HasStaticReallocate
template<typename T>
class HasStaticReallocate
{
typedef void * (*static_method)(void *, size_t, size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::reallocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticReallocate<T>::value, void *>::type
allocator_reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
return T::reallocate(old_ptr, old_size, new_size);
}
template <typename T>
inline
typename enable_if<!HasStaticReallocate<T>::value, void *>::type
allocator_reallocate(void *, size_t, size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot reallocate memory!") );
return NULL;
}
// HasStaticSupportTextureBinding
template<typename T>
class HasStaticSupportTextureBinding
{
typedef bool (*static_method)();
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::support_texture_binding>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticSupportTextureBinding<T>::value, bool>::type
allocator_support_texture_binding()
{
return T::support_texture_binding();
}
template <typename T>
inline
typename enable_if<!HasStaticSupportTextureBinding<T>::value, bool>::type
allocator_support_texture_binding()
{
return false;
}
template <typename T>
class Allocator : public AllocatorBase
{
public:
virtual const char * name() const
{
return allocator_name<T>();
}
virtual void* allocate(size_t size) const
{
return allocator_allocate<T>(size);
}
virtual void deallocate(void * ptr, size_t size) const
{
allocator_deallocate<T>(ptr,size);
}
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const
{
return allocator_reallocate<T>(old_ptr, old_size, new_size);
}
virtual bool support_texture_binding() const
{
return allocator_support_texture_binding<T>();
}
static AllocatorBase * singleton()
{
return Singleton< Allocator<T> >::get();
}
};
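// Illustrative sketch of a StaticAllocator that Allocator<T> can wrap.
// MallocAllocator is a hypothetical example, not part of Kokkos:
//
//   struct MallocAllocator {
//     static const char * name() { return "Malloc Allocator"; }
//     static void * allocate( size_t size ) { return std::malloc( size ); }
//     static void deallocate( void * ptr, size_t ) { std::free( ptr ); }
//     static void * reallocate( void * ptr, size_t, size_t new_size )
//     { return std::realloc( ptr, new_size ); }
//   };
//
//   AllocatorBase * base = Allocator<MallocAllocator>::singleton();
//
// Any static method omitted is detected by the HasStatic* traits above and
// routed to a throwing stub (or to 'false' for support_texture_binding).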
//-----------------------------------------------------------------------------
// AllocationTracker
//-----------------------------------------------------------------------------
// forward declaration for friend classes
struct CopyWithoutTracking;
struct MallocHelper;
/// class AllocationTracker
/// Will call deallocate from the AllocatorBase when the reference count reaches 0.
/// Reference counting is disabled when the host is in parallel.
class AllocationTracker
{
// use the least significant bit of the AllocationRecord pointer to indicate if the
// AllocationTracker should reference count
enum {
REF_COUNT_BIT = static_cast<uintptr_t>(1)
, REF_COUNT_MASK = ~static_cast<uintptr_t>(1)
};
public:
/// Find an AllocationTracker such that
/// alloc_ptr <= ptr < alloc_ptr + alloc_size
/// O(n) where n is the number of tracked allocations.
template <typename StaticAllocator>
static AllocationTracker find( void const * ptr )
{
return find( ptr, Allocator<StaticAllocator>::singleton() );
}
/// Pretty print all the currently tracked memory
static void print_tracked_memory( std::ostream & out );
/// Default constructor
KOKKOS_INLINE_FUNCTION
AllocationTracker()
: m_alloc_rec(0)
{}
/// Create an AllocationTracker
///
/// Start reference counting the alloc_ptr.
/// When the reference count reaches 0 the allocator's deallocate method
/// will be called with the given size. The alloc_ptr should have been
/// allocated with the allocator's allocate method.
///
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
/// do nothing
template <typename StaticAllocator>
AllocationTracker( StaticAllocator const &
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label = std::string("") )
: m_alloc_rec(0)
{
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
}
/// Create an AllocationTracker
///
/// Start reference counting the alloc_ptr.
/// When the reference count reaches 0 the allocator's deallocate method
/// will be called with the given size. The alloc_ptr should have been
/// allocated with the allocator's allocate method.
///
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
/// do nothing
template <typename StaticAllocator>
AllocationTracker( StaticAllocator const &
, size_t arg_alloc_size
, const std::string & arg_label = std::string("")
)
: m_alloc_rec(0)
{
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
void * arg_alloc_ptr = arg_allocator->allocate( arg_alloc_size );
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
}
/// Copy an AllocationTracker
KOKKOS_INLINE_FUNCTION
AllocationTracker( const AllocationTracker & rhs )
: m_alloc_rec( rhs.m_alloc_rec)
{
#if !defined( __CUDA_ARCH__ )
if ( rhs.ref_counting() && tracking_enabled() ) {
increment_ref_count();
}
else {
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
}
#else
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
#endif
}
/// Copy-assign an AllocationTracker
/// Decrement the reference count of the current tracker if necessary
KOKKOS_INLINE_FUNCTION
AllocationTracker & operator=( const AllocationTracker & rhs )
{
if (this != &rhs) {
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
m_alloc_rec = rhs.m_alloc_rec;
if ( rhs.ref_counting() && tracking_enabled() ) {
increment_ref_count();
}
else {
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
}
#else
m_alloc_rec = rhs.m_alloc_rec & REF_COUNT_MASK;
#endif
}
return * this;
}
/// Destructor
/// Decrement the reference count if necessary
KOKKOS_INLINE_FUNCTION
~AllocationTracker()
{
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
#endif
}
/// Is the tracker valid?
KOKKOS_INLINE_FUNCTION
bool is_valid() const
{
return (m_alloc_rec & REF_COUNT_MASK);
}
/// clear the tracker
KOKKOS_INLINE_FUNCTION
void clear()
{
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
#endif
m_alloc_rec = 0;
}
/// is this tracker currently counting allocations?
KOKKOS_INLINE_FUNCTION
bool ref_counting() const
{
return (m_alloc_rec & REF_COUNT_BIT);
}
AllocatorBase * allocator() const;
/// pointer to the allocated memory
void * alloc_ptr() const;
/// size in bytes of the allocated memory
size_t alloc_size() const;
/// the current reference count
size_t ref_count() const;
/// the label given to the allocation
char const * label() const;
/// pretty print all the tracker's information to the std::ostream
void print( std::ostream & oss) const;
/// set an attribute ptr on the allocation record
/// the arg_attribute pointer will be deleted when the record is destroyed
/// the attribute ptr can only be set once
bool set_attribute( AllocatorAttributeBase * arg_attribute) const;
/// get the attribute ptr from the allocation record
AllocatorAttributeBase * attribute() const;
/// reallocate the memory tracked by this allocation
/// NOT thread-safe
void reallocate( size_t size ) const;
private:
static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator );
void initalize( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, std::string const & label );
void increment_ref_count() const;
void decrement_ref_count() const;
static void disable_tracking();
static void enable_tracking();
static bool tracking_enabled();
friend struct Impl::CopyWithoutTracking;
friend struct Impl::MallocHelper;
uintptr_t m_alloc_rec;
};
/// Make a copy of the functor with reference counting disabled
struct CopyWithoutTracking
{
template <typename Functor>
static Functor apply( const Functor & f )
{
AllocationTracker::disable_tracking();
Functor func(f);
AllocationTracker::enable_tracking();
return func;
}
};
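// Usage sketch (MyFunctor is a hypothetical copy-constructible type):
//
//   MyFunctor f = ...;
//   MyFunctor g = Kokkos::Impl::CopyWithoutTracking::apply( f );
//
// The copy constructor runs between disable_tracking() and enable_tracking(),
// so AllocationTracker members inside the functor are copied without touching
// reference counts, as they would be when copied inside a parallel region.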
}} // namespace Kokkos::Impl
#endif //KOKKOS_ALLOCATION_TRACKER_HPP

View File

@@ -0,0 +1,260 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ANALYZESHAPE_HPP
#define KOKKOS_ANALYZESHAPE_HPP
#include <impl/Kokkos_Shape.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
/** \brief Analyze the array shape defined by a Kokkos::View data type.
*
* It is presumed that the data type can be mapped down to a multidimensional
* array of an intrinsic scalar numerical type (double, float, int, ... ).
* The 'value_type' of an array may be an embedded aggregate type such
* as a fixed length array 'Array<T,N>'.
* In this case the 'array_intrinsic_type' represents the
* underlying array of intrinsic scalar numerical type.
*
* The embedded aggregate type must have an AnalyzeShape specialization
* to map it down to a shape and intrinsic scalar numerical type.
*/
template< class T >
struct AnalyzeShape : public Shape< sizeof(T) , 0 >
{
typedef void specialize ;
typedef Shape< sizeof(T), 0 > shape ;
typedef T array_intrinsic_type ;
typedef T value_type ;
typedef T type ;
typedef const T const_array_intrinsic_type ;
typedef const T const_value_type ;
typedef const T const_type ;
typedef T non_const_array_intrinsic_type ;
typedef T non_const_value_type ;
typedef T non_const_type ;
};
template<>
struct AnalyzeShape<void> : public Shape< 0 , 0 >
{
typedef void specialize ;
typedef Shape< 0 , 0 > shape ;
typedef void array_intrinsic_type ;
typedef void value_type ;
typedef void type ;
typedef const void const_array_intrinsic_type ;
typedef const void const_value_type ;
typedef const void const_type ;
typedef void non_const_array_intrinsic_type ;
typedef void non_const_value_type ;
typedef void non_const_type ;
};
template< class T >
struct AnalyzeShape< const T > : public AnalyzeShape<T>::shape
{
private:
typedef AnalyzeShape<T> nested ;
public:
typedef typename nested::specialize specialize ;
typedef typename nested::shape shape ;
typedef typename nested::const_array_intrinsic_type array_intrinsic_type ;
typedef typename nested::const_value_type value_type ;
typedef typename nested::const_type type ;
typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type ;
typedef typename nested::const_value_type const_value_type ;
typedef typename nested::const_type const_type ;
typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type ;
typedef typename nested::non_const_value_type non_const_value_type ;
typedef typename nested::non_const_type non_const_type ;
};
template< class T >
struct AnalyzeShape< T * >
: public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type
{
private:
typedef AnalyzeShape<T> nested ;
public:
typedef typename nested::specialize specialize ;
typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
typedef typename nested::array_intrinsic_type * array_intrinsic_type ;
typedef typename nested::value_type value_type ;
typedef typename nested::type * type ;
typedef typename nested::const_array_intrinsic_type * const_array_intrinsic_type ;
typedef typename nested::const_value_type const_value_type ;
typedef typename nested::const_type * const_type ;
typedef typename nested::non_const_array_intrinsic_type * non_const_array_intrinsic_type ;
typedef typename nested::non_const_value_type non_const_value_type ;
typedef typename nested::non_const_type * non_const_type ;
};
template< class T >
struct AnalyzeShape< T[] >
: public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type
{
private:
typedef AnalyzeShape<T> nested ;
public:
typedef typename nested::specialize specialize ;
typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
typedef typename nested::array_intrinsic_type array_intrinsic_type [] ;
typedef typename nested::value_type value_type ;
typedef typename nested::type type [] ;
typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type [] ;
typedef typename nested::const_value_type const_value_type ;
typedef typename nested::const_type const_type [] ;
typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type [] ;
typedef typename nested::non_const_value_type non_const_value_type ;
typedef typename nested::non_const_type non_const_type [] ;
};
template< class T >
struct AnalyzeShape< const T[] >
: public ShapeInsert< typename AnalyzeShape< const T >::shape , 0 >::type
{
private:
typedef AnalyzeShape< const T > nested ;
public:
typedef typename nested::specialize specialize ;
typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
typedef typename nested::array_intrinsic_type array_intrinsic_type [] ;
typedef typename nested::value_type value_type ;
typedef typename nested::type type [] ;
typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type [] ;
typedef typename nested::const_value_type const_value_type ;
typedef typename nested::const_type const_type [] ;
typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type [] ;
typedef typename nested::non_const_value_type non_const_value_type ;
typedef typename nested::non_const_type non_const_type [] ;
};
template< class T , unsigned N >
struct AnalyzeShape< T[N] >
: public ShapeInsert< typename AnalyzeShape<T>::shape , N >::type
{
private:
typedef AnalyzeShape<T> nested ;
public:
typedef typename nested::specialize specialize ;
typedef typename ShapeInsert< typename nested::shape , N >::type shape ;
typedef typename nested::array_intrinsic_type array_intrinsic_type [N] ;
typedef typename nested::value_type value_type ;
typedef typename nested::type type [N] ;
typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type [N] ;
typedef typename nested::const_value_type const_value_type ;
typedef typename nested::const_type const_type [N] ;
typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type [N] ;
typedef typename nested::non_const_value_type non_const_value_type ;
typedef typename nested::non_const_type non_const_type [N] ;
};
template< class T , unsigned N >
struct AnalyzeShape< const T[N] >
: public ShapeInsert< typename AnalyzeShape< const T >::shape , N >::type
{
private:
typedef AnalyzeShape< const T > nested ;
public:
typedef typename nested::specialize specialize ;
typedef typename ShapeInsert< typename nested::shape , N >::type shape ;
typedef typename nested::array_intrinsic_type array_intrinsic_type [N] ;
typedef typename nested::value_type value_type ;
typedef typename nested::type type [N] ;
typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type [N] ;
typedef typename nested::const_value_type const_value_type ;
typedef typename nested::const_type const_type [N] ;
typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type [N] ;
typedef typename nested::non_const_value_type non_const_value_type ;
typedef typename nested::non_const_type non_const_type [N] ;
};
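// Worked example (a sketch of how the recursion composes): for the View data
// type 'double *[3]' the specializations peel the array-of-3 and the pointer,
// giving a rank-2 shape over the intrinsic scalar type double:
//
//   typedef AnalyzeShape< double *[3] > analysis ;
//   // analysis::value_type        is double
//   // analysis::const_value_type  is const double
//   // analysis::non_const_type    is double *[3]
//   // analysis::const_type        is const double *[3]
//   // analysis::shape             has rank 2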
} // namespace Impl
} // namespace Kokkos
#endif /* #ifndef KOKKOS_ANALYZESHAPE_HPP */

View File

@@ -0,0 +1,214 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_ASSEMBLY_X86_HPP )
#define KOKKOS_ATOMIC_ASSEMBLY_X86_HPP
namespace Kokkos {
#ifdef KOKKOS_ENABLE_ASM
#ifndef __CUDA_ARCH__
template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<char>(volatile char* a) {
__asm__ __volatile__(
"lock incb %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<short>(volatile short* a) {
__asm__ __volatile__(
"lock incw %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<int>(volatile int* a) {
__asm__ __volatile__(
"lock incl %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<long long int>(volatile long long int* a) {
__asm__ __volatile__(
"lock incq %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<char>(volatile char* a) {
__asm__ __volatile__(
"lock decb %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<short>(volatile short* a) {
__asm__ __volatile__(
"lock decw %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<int>(volatile int* a) {
__asm__ __volatile__(
"lock decl %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<long long int>(volatile long long int* a) {
__asm__ __volatile__(
"lock decq %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
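// Note (descriptive sketch): the "lock" prefix makes each increment/decrement
// a bus-locked read-modify-write; the b/w/l/q suffixes select the operand
// width (8/16/32/64 bit). The "memory" clobber keeps the compiler from
// reordering other memory accesses across the asm statement.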
#endif
#endif
namespace Impl {
struct cas128_t
{
uint64_t lower;
uint64_t upper;
KOKKOS_INLINE_FUNCTION
cas128_t () {
lower = 0;
upper = 0;
}
KOKKOS_INLINE_FUNCTION
cas128_t (const cas128_t& a) {
lower = a.lower;
upper = a.upper;
}
KOKKOS_INLINE_FUNCTION
cas128_t (volatile cas128_t* a) {
lower = a->lower;
upper = a->upper;
}
KOKKOS_INLINE_FUNCTION
bool operator != (const cas128_t& a) const {
return (lower != a.lower) || (upper != a.upper);
}
KOKKOS_INLINE_FUNCTION
void operator = (const cas128_t& a) {
lower = a.lower;
upper = a.upper;
}
KOKKOS_INLINE_FUNCTION
void operator = (const cas128_t& a) volatile {
lower = a.lower;
upper = a.upper;
}
}
__attribute__ (( __aligned__( 16 ) ));
inline cas128_t cas128( volatile cas128_t * ptr, cas128_t cmp, cas128_t swap )
{
#ifdef KOKKOS_ENABLE_ASM
bool swapped = false;
__asm__ __volatile__
(
"lock cmpxchg16b %1\n\t"
"setz %0"
: "=q" ( swapped )
, "+m" ( *ptr )
, "+d" ( cmp.upper )
, "+a" ( cmp.lower )
: "c" ( swap.upper )
, "b" ( swap.lower )
, "q" ( swapped )
);
return cmp;
#else
cas128_t tmp(ptr);
if(tmp != cmp) {
return tmp;
} else {
*ptr = swap;
return swap;
}
#endif
}
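// Note (descriptive sketch): cmpxchg16b compares RDX:RAX against the 16-byte
// memory operand and, on a match, stores RCX:RBX into it, which is what the
// register constraints above encode; the operand must be 16-byte aligned,
// hence the __aligned__(16) attribute on cas128_t. The non-ASM fallback is a
// plain read-compare-write and is NOT atomic; it only serves builds without
// KOKKOS_ENABLE_ASM.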
}
}
#endif

View File

@@ -0,0 +1,259 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
namespace Kokkos {
//----------------------------------------------------------------------------
// Cuda native CAS supports int, unsigned int, and unsigned long long int (non-standard type).
// Must cast-away 'volatile' for the CAS call.
#if defined( KOKKOS_ATOMICS_USE_CUDA )
__inline__ __device__
int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
{ return atomicCAS((int*)dest,compare,val); }
__inline__ __device__
unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val)
{ return atomicCAS((unsigned int*)dest,compare,val); }
__inline__ __device__
unsigned long long int atomic_compare_exchange( volatile unsigned long long int * const dest ,
const unsigned long long int compare ,
const unsigned long long int val )
{ return atomicCAS((unsigned long long int*)dest,compare,val); }
template < typename T >
__inline__ __device__
T atomic_compare_exchange( volatile T * const dest , const T & compare ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
{
const int tmp = atomicCAS( (int*) dest , *((int*)&compare) , *((int*)&val) );
return *((T*)&tmp);
}
template < typename T >
__inline__ __device__
T atomic_compare_exchange( volatile T * const dest , const T & compare ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
{
typedef unsigned long long int type ;
const type tmp = atomicCAS( (type*) dest , *((type*)&compare) , *((type*)&val) );
return *((T*)&tmp);
}
template < typename T >
__inline__ __device__
T atomic_compare_exchange( volatile T * const dest , const T & compare ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
, const T >::type& val )
{
T return_val;
// This is a way to (hopefully) avoid deadlock within a warp
bool done = false;
while (! done ) {
if( Impl::lock_address_cuda_space( (void*) dest ) ) {
return_val = *dest;
if( return_val == compare )
*dest = val;
Impl::unlock_address_cuda_space( (void*) dest );
done = true;
}
}
return return_val;
}
//----------------------------------------------------------------------------
// GCC native CAS supports int, long, unsigned int, unsigned long.
// Intel native CAS supports int and long with the same interface as GCC.
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
KOKKOS_INLINE_FUNCTION
int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
{ return __sync_val_compare_and_swap(dest,compare,val); }
KOKKOS_INLINE_FUNCTION
long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
{ return __sync_val_compare_and_swap(dest,compare,val); }
#if defined( KOKKOS_ATOMICS_USE_GCC )
// GCC supports unsigned
KOKKOS_INLINE_FUNCTION
unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val )
{ return __sync_val_compare_and_swap(dest,compare,val); }
KOKKOS_INLINE_FUNCTION
unsigned long atomic_compare_exchange( volatile unsigned long * const dest ,
const unsigned long compare ,
const unsigned long val )
{ return __sync_val_compare_and_swap(dest,compare,val); }
#endif
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange( volatile T * const dest, const T & compare,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
{
#ifdef KOKKOS_HAVE_CXX11
union U {
int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} tmp ;
#else
union U {
int i ;
T t ;
} tmp ;
#endif
tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) );
return tmp.t ;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange( volatile T * const dest, const T & compare,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(long) , const T & >::type val )
{
#ifdef KOKKOS_HAVE_CXX11
union U {
long i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} tmp ;
#else
union U {
long i ;
T t ;
} tmp ;
#endif
tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) );
return tmp.t ;
}
#ifdef KOKKOS_ENABLE_ASM
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange( volatile T * const dest, const T & compare,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) != sizeof(long) &&
sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val )
{
union U {
Impl::cas128_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} tmp ;
tmp.i = Impl::cas128( (Impl::cas128_t*) dest , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) );
return tmp.t ;
}
#endif
template < typename T >
inline
T atomic_compare_exchange( volatile T * const dest , const T compare ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM)
&& ( sizeof(T) != 16 )
#endif
, const T >::type& val )
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
if( return_val == compare ) {
const T tmp = *dest = val;
#ifndef KOKKOS_COMPILER_CLANG
(void) tmp;
#endif
}
Impl::unlock_address_host_space( (void*) dest );
return return_val;
}
//----------------------------------------------------------------------------
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
template< typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange( volatile T * const dest, const T compare, const T val )
{
T retval;
#pragma omp critical
{
retval = dest[0];
if ( retval == compare )
dest[0] = val;
}
return retval;
}
#endif
template <typename T>
KOKKOS_INLINE_FUNCTION
bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
{
return compare == atomic_compare_exchange(dest, compare, val);
}
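// Usage sketch (hypothetical helper, not part of this header): the strong CAS
// above is the building block for read-modify-write loops, e.g. a fetch-max:
//
//   KOKKOS_INLINE_FUNCTION
//   int atomic_fetch_max_sketch( volatile int * const dest , const int val )
//   {
//     int old = *dest ;
//     while ( old < val ) {
//       const int seen = Kokkos::atomic_compare_exchange( dest , old , val );
//       if ( seen == old ) break ; // installed val
//       old = seen ;               // lost the race; retry against new value
//     }
//     return old ;
//   }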
//----------------------------------------------------------------------------
} // namespace Kokkos
#endif

View File

@@ -0,0 +1,340 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
#define KOKKOS_ATOMIC_EXCHANGE_HPP
namespace Kokkos {
//----------------------------------------------------------------------------
#if defined( KOKKOS_ATOMICS_USE_CUDA )
__inline__ __device__
int atomic_exchange( volatile int * const dest , const int val )
{
// return __iAtomicExch( (int*) dest , val );
return atomicExch( (int*) dest , val );
}
__inline__ __device__
unsigned int atomic_exchange( volatile unsigned int * const dest , const unsigned int val )
{
// return __uAtomicExch( (unsigned int*) dest , val );
return atomicExch( (unsigned int*) dest , val );
}
__inline__ __device__
unsigned long long int atomic_exchange( volatile unsigned long long int * const dest , const unsigned long long int val )
{
// return __ullAtomicExch( (unsigned long long*) dest , val );
return atomicExch( (unsigned long long*) dest , val );
}
/** \brief Atomic exchange for any type with compatible size */
template< typename T >
__inline__ __device__
T atomic_exchange(
volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
{
// int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) );
int tmp = atomicExch( ((int*)dest) , *((int*)&val) );
return *((T*)&tmp);
}
template< typename T >
__inline__ __device__
T atomic_exchange(
volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
{
typedef unsigned long long int type ;
// type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) );
type tmp = atomicExch( ((type*)dest) , *((type*)&val) );
return *((T*)&tmp);
}
template < typename T >
__inline__ __device__
T atomic_exchange( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
, const T >::type& val )
{
T return_val;
// This is a way to (hopefully) avoid deadlock within a warp
bool done = false;
while (! done ) {
if( Impl::lock_address_cuda_space( (void*) dest ) ) {
return_val = *dest;
*dest = val;
Impl::unlock_address_cuda_space( (void*) dest );
done = true;
}
}
return return_val;
}
/** \brief Atomic exchange for any type with compatible size */
template< typename T >
__inline__ __device__
void atomic_assign(
volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
{
// (void) __ullAtomicExch( (int*) dest , *((int*)&val) );
(void) atomicExch( ((int*)dest) , *((int*)&val) );
}
template< typename T >
__inline__ __device__
void atomic_assign(
volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
{
typedef unsigned long long int type ;
// (void) __ullAtomicExch( (type*) dest , *((type*)&val) );
(void) atomicExch( ((type*)dest) , *((type*)&val) );
}
template< typename T >
__inline__ __device__
void atomic_assign(
volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) != sizeof(unsigned long long int)
, const T & >::type val )
{
(void) atomic_exchange(dest,val);
}
//----------------------------------------------------------------------------
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
template< typename T >
KOKKOS_INLINE_FUNCTION
T atomic_exchange( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
, const T & >::type val )
{
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
const type v = *((type*)&val); // Extract to be sure the value doesn't change
type assumed ;
#ifdef KOKKOS_HAVE_CXX11
union U {
T val_T ;
type val_type ;
KOKKOS_INLINE_FUNCTION U() {};
} old ;
#else
union { T val_T ; type val_type ; } old ;
#endif
old.val_T = *dest ;
do {
assumed = old.val_type ;
old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v );
} while ( assumed != old.val_type );
return old.val_T ;
}
#if defined(KOKKOS_ENABLE_ASM)
template< typename T >
KOKKOS_INLINE_FUNCTION
T atomic_exchange( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
, const T & >::type val )
{
union U {
Impl::cas128_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
newval.t = val;
do {
assume.i = oldval.i ;
oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
#endif
//----------------------------------------------------------------------------
template < typename T >
inline
T atomic_exchange( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM)
&& ( sizeof(T) != 16 )
#endif
, const T >::type& val )
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
const T tmp = *dest = val;
#ifndef KOKKOS_COMPILER_CLANG
(void) tmp;
#endif
Impl::unlock_address_host_space( (void*) dest );
return return_val;
}
template< typename T >
KOKKOS_INLINE_FUNCTION
void atomic_assign( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
, const T & >::type val )
{
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
const type v = *((type*)&val); // Extract to be sure the value doesn't change
type assumed ;
#ifdef KOKKOS_HAVE_CXX11
union U {
T val_T ;
type val_type ;
KOKKOS_INLINE_FUNCTION U() {};
} old ;
#else
union { T val_T ; type val_type ; } old ;
#endif
old.val_T = *dest ;
do {
assumed = old.val_type ;
old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v );
} while ( assumed != old.val_type );
}
#ifdef KOKKOS_ENABLE_ASM
template< typename T >
KOKKOS_INLINE_FUNCTION
void atomic_assign( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
, const T & >::type val )
{
union U {
Impl::cas128_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
newval.t = val;
do {
assume.i = oldval.i ;
oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i);
} while ( assume.i != oldval.i );
}
#endif
template < typename T >
inline
void atomic_assign( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM)
&& ( sizeof(T) != 16 )
#endif
, const T >::type& val )
{
while( !Impl::lock_address_host_space( (void*) dest ) );
*dest = val;
Impl::unlock_address_host_space( (void*) dest );
}
//----------------------------------------------------------------------------
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_exchange( volatile T * const dest , const T val )
{
T retval;
//#pragma omp atomic capture
#pragma omp critical
{
retval = dest[0];
dest[0] = val;
}
return retval;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
void atomic_assign( volatile T * const dest , const T val )
{
//#pragma omp atomic
#pragma omp critical
{
dest[0] = val;
}
}
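// Note (descriptive sketch): an unnamed "#pragma omp critical" serializes all
// threads through one process-wide lock, which is correct for arbitrary T but
// much slower than the commented-out "#pragma omp atomic" forms, which are
// limited to scalar types and simple expressions.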
#endif
} // namespace Kokkos
#endif
//----------------------------------------------------------------------------

View File

@@ -0,0 +1,326 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
#define KOKKOS_ATOMIC_FETCH_ADD_HPP
namespace Kokkos {
//----------------------------------------------------------------------------
#if defined( KOKKOS_ATOMICS_USE_CUDA )
// Support for int, unsigned int, unsigned long long int, and float
__inline__ __device__
int atomic_fetch_add( volatile int * const dest , const int val )
{ return atomicAdd((int*)dest,val); }
__inline__ __device__
unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
{ return atomicAdd((unsigned int*)dest,val); }
__inline__ __device__
unsigned long long int atomic_fetch_add( volatile unsigned long long int * const dest ,
const unsigned long long int val )
{ return atomicAdd((unsigned long long int*)dest,val); }
__inline__ __device__
float atomic_fetch_add( volatile float * const dest , const float val )
{ return atomicAdd((float*)dest,val); }
template < typename T >
__inline__ __device__
T atomic_fetch_add( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
{
#ifdef KOKKOS_HAVE_CXX11
union U {
int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
#else
union U {
int i ;
T t ;
} assume , oldval , newval ;
#endif
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template < typename T >
__inline__ __device__
T atomic_fetch_add( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
{
#ifdef KOKKOS_HAVE_CXX11
union U {
unsigned long long int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
#else
union U {
unsigned long long int i ;
T t ;
} assume , oldval , newval ;
#endif
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = atomicCAS( (unsigned long long int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
//----------------------------------------------------------------------------
template < typename T >
__inline__ __device__
T atomic_fetch_add( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
, const T >::type& val )
{
T return_val;
// This is a way to (hopefully) avoid dead lock in a warp
bool done = false;
while (! done ) {
if( Impl::lock_address_cuda_space( (void*) dest ) ) {
return_val = *dest;
*dest = return_val + val;
Impl::unlock_address_cuda_space( (void*) dest );
done = true;
}
}
return return_val;
}
//----------------------------------------------------------------------------
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
KOKKOS_INLINE_FUNCTION
int atomic_fetch_add( volatile int * const dest , const int val )
{ return __sync_fetch_and_add(dest,val); }
KOKKOS_INLINE_FUNCTION
long int atomic_fetch_add( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_add(dest,val); }
#if defined( KOKKOS_ATOMICS_USE_GCC )
KOKKOS_INLINE_FUNCTION
unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_add(dest,val); }
KOKKOS_INLINE_FUNCTION
unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_add(dest,val); }
#endif
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_add( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
{
#ifdef KOKKOS_HAVE_CXX11
union U {
int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
#else
union U {
int i ;
T t ;
} assume , oldval , newval ;
#endif
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = __sync_val_compare_and_swap( (int*) dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_add( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(long) , const T >::type val )
{
#ifdef KOKKOS_HAVE_CXX11
union U {
long i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
#else
union U {
long i ;
T t ;
} assume , oldval , newval ;
#endif
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = __sync_val_compare_and_swap( (long*) dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
#ifdef KOKKOS_ENABLE_ASM
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_add( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) != sizeof(long) &&
sizeof(T) == sizeof(Impl::cas128_t) , const T >::type val )
{
union U {
Impl::cas128_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
#endif
//----------------------------------------------------------------------------
template < typename T >
inline
T atomic_fetch_add( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM)
&& ( sizeof(T) != 16 )
#endif
, const T >::type& val )
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
const T tmp = *dest = return_val + val;
#ifndef KOKKOS_COMPILER_CLANG
(void) tmp;
#endif
Impl::unlock_address_host_space( (void*) dest );
return return_val;
}
//----------------------------------------------------------------------------
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
template< typename T >
T atomic_fetch_add( volatile T * const dest , const T val )
{
T retval;
#pragma omp atomic capture
{
retval = dest[0];
dest[0] += val;
}
return retval;
}
#endif
//----------------------------------------------------------------------------
// Simpler version of atomic_fetch_add without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION
void atomic_add(volatile T * const dest, const T src) {
atomic_fetch_add(dest,src);
}
// Atomic increment
template<typename T>
KOKKOS_INLINE_FUNCTION
void atomic_increment(volatile T* a) {
Kokkos::atomic_fetch_add(a,1);
}
template<typename T>
KOKKOS_INLINE_FUNCTION
void atomic_decrement(volatile T* a) {
Kokkos::atomic_fetch_add(a,-1);
}
}
#endif
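A minimal sketch of how the entry points in this header compose (values illustrative; atomic_fetch_add returns the pre-update value, while the atomic_add, atomic_increment, and atomic_decrement wrappers discard it):
// Hypothetical usage sketch.
int counter = 10;
int before = Kokkos::atomic_fetch_add( &counter , 5 ); // before == 10, counter == 15
Kokkos::atomic_add( &counter , 5 );                    // counter == 20
Kokkos::atomic_increment( &counter );                  // counter == 21
Kokkos::atomic_decrement( &counter );                  // counter == 20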


@ -0,0 +1,125 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
#define KOKKOS_ATOMIC_FETCH_AND_HPP
namespace Kokkos {
//----------------------------------------------------------------------------
#if defined( KOKKOS_ATOMICS_USE_CUDA )
// Support for int, unsigned int, and unsigned long long int
// (the latter requires compute capability 3.5 or higher)
__inline__ __device__
int atomic_fetch_and( volatile int * const dest , const int val )
{ return atomicAnd((int*)dest,val); }
__inline__ __device__
unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
{ return atomicAnd((unsigned int*)dest,val); }
#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
__inline__ __device__
unsigned long long int atomic_fetch_and( volatile unsigned long long int * const dest ,
const unsigned long long int val )
{ return atomicAnd((unsigned long long int*)dest,val); }
#endif
//----------------------------------------------------------------------------
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
KOKKOS_INLINE_FUNCTION
int atomic_fetch_and( volatile int * const dest , const int val )
{ return __sync_fetch_and_and(dest,val); }
KOKKOS_INLINE_FUNCTION
long int atomic_fetch_and( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_and(dest,val); }
#if defined( KOKKOS_ATOMICS_USE_GCC )
KOKKOS_INLINE_FUNCTION
unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_and(dest,val); }
KOKKOS_INLINE_FUNCTION
unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_and(dest,val); }
#endif
//----------------------------------------------------------------------------
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
template< typename T >
T atomic_fetch_and( volatile T * const dest , const T val )
{
T retval;
#pragma omp atomic capture
{
retval = dest[0];
dest[0] &= val;
}
return retval;
}
#endif
//----------------------------------------------------------------------------
// Simpler version of atomic_fetch_and without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION
void atomic_and(volatile T * const dest, const T src) {
(void)atomic_fetch_and(dest,src);
}
}
#endif
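A brief sketch of the usual bit-mask idiom for this header (values illustrative):
// Hypothetical usage sketch: atomically clear the high bits of a shared flag word.
unsigned int flags = 0xFFu;
unsigned int before = Kokkos::atomic_fetch_and( &flags , 0x0Fu ); // before == 0xFF, flags == 0x0F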


@ -0,0 +1,125 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
#define KOKKOS_ATOMIC_FETCH_OR_HPP
namespace Kokkos {
//----------------------------------------------------------------------------
#if defined( KOKKOS_ATOMICS_USE_CUDA )
// Support for int, unsigned int, and unsigned long long int
// (the latter requires compute capability 3.5 or higher)
__inline__ __device__
int atomic_fetch_or( volatile int * const dest , const int val )
{ return atomicOr((int*)dest,val); }
__inline__ __device__
unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
{ return atomicOr((unsigned int*)dest,val); }
#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
__inline__ __device__
unsigned long long int atomic_fetch_or( volatile unsigned long long int * const dest ,
const unsigned long long int val )
{ return atomicOr((unsigned long long int*)dest,val); }
#endif
//----------------------------------------------------------------------------
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
KOKKOS_INLINE_FUNCTION
int atomic_fetch_or( volatile int * const dest , const int val )
{ return __sync_fetch_and_or(dest,val); }
KOKKOS_INLINE_FUNCTION
long int atomic_fetch_or( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_or(dest,val); }
#if defined( KOKKOS_ATOMICS_USE_GCC )
KOKKOS_INLINE_FUNCTION
unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_or(dest,val); }
KOKKOS_INLINE_FUNCTION
unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_or(dest,val); }
#endif
//----------------------------------------------------------------------------
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
template< typename T >
T atomic_fetch_or( volatile T * const dest , const T val )
{
T retval;
#pragma omp atomic capture
{
retval = dest[0];
dest[0] |= val;
}
return retval;
}
#endif
//----------------------------------------------------------------------------
// Simpler version of atomic_fetch_or without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION
void atomic_or(volatile T * const dest, const T src) {
(void)atomic_fetch_or(dest,src);
}
}
#endif
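The companion idiom for this header atomically sets bits rather than clearing them (values illustrative):
// Hypothetical usage sketch: atomically set a bit in a shared flag word.
unsigned int flags = 0x01u;
unsigned int before = Kokkos::atomic_fetch_or( &flags , 0x10u ); // before == 0x01, flags == 0x11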


@ -0,0 +1,233 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
#define KOKKOS_ATOMIC_FETCH_SUB_HPP
namespace Kokkos {
//----------------------------------------------------------------------------
#if defined( KOKKOS_ATOMICS_USE_CUDA )
// Support for int and unsigned int (atomicSub covers only these two types)
__inline__ __device__
int atomic_fetch_sub( volatile int * const dest , const int val )
{ return atomicSub((int*)dest,val); }
__inline__ __device__
unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
{ return atomicSub((unsigned int*)dest,val); }
template < typename T >
__inline__ __device__
T atomic_fetch_sub( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
{
union { int i ; T t ; } oldval , assume , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template < typename T >
__inline__ __device__
T atomic_fetch_sub( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
{
union { unsigned long long int i ; T t ; } oldval , assume , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = atomicCAS( (unsigned long long int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
//----------------------------------------------------------------------------
template < typename T >
__inline__ __device__
T atomic_fetch_sub( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
, const T >::type& val )
{
T return_val;
// This is a way to (hopefully) avoid dead lock in a warp
bool done = false;
while (! done ) {
if( Impl::lock_address_cuda_space( (void*) dest ) ) {
return_val = *dest;
*dest = return_val - val;
Impl::unlock_address_cuda_space( (void*) dest );
done = true;
}
}
return return_val;
}
//----------------------------------------------------------------------------
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
KOKKOS_INLINE_FUNCTION
int atomic_fetch_sub( volatile int * const dest , const int val )
{ return __sync_fetch_and_sub(dest,val); }
KOKKOS_INLINE_FUNCTION
long int atomic_fetch_sub( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_sub(dest,val); }
#if defined( KOKKOS_ATOMICS_USE_GCC )
KOKKOS_INLINE_FUNCTION
unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_sub(dest,val); }
KOKKOS_INLINE_FUNCTION
unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_sub(dest,val); }
#endif
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_sub( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
{
union { int i ; T t ; } assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = __sync_val_compare_and_swap( (int*) dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_sub( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(long) , const T >::type val )
{
union { long i ; T t ; } assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = __sync_val_compare_and_swap( (long*) dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
//----------------------------------------------------------------------------
template < typename T >
inline
T atomic_fetch_sub( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
, const T >::type& val )
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
*dest = return_val - val;
Impl::unlock_address_host_space( (void*) dest );
return return_val;
}
//----------------------------------------------------------------------------
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
template< typename T >
T atomic_fetch_sub( volatile T * const dest , const T val )
{
T retval;
#pragma omp atomic capture
{
retval = dest[0];
dest[0] -= val;
}
return retval;
}
#endif
// Simpler version of atomic_fetch_sub without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION
void atomic_sub(volatile T * const dest, const T src) {
atomic_fetch_sub(dest,src);
}
}
#include<impl/Kokkos_Atomic_Assembly_X86.hpp>
#endif
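A minimal sketch of the subtraction entry points (values illustrative):
// Hypothetical usage sketch.
long remaining = 100;
long before = Kokkos::atomic_fetch_sub( &remaining , 1L ); // before == 100, remaining == 99
Kokkos::atomic_sub( &remaining , 9L );                     // remaining == 90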


@ -0,0 +1,375 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_GENERIC_HPP )
#define KOKKOS_ATOMIC_GENERIC_HPP
#include <Kokkos_Macros.hpp>
// Combination operators to be used in a compare-and-exchange based atomic operation
namespace Kokkos {
namespace Impl {
template<class Scalar1, class Scalar2>
struct AddOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1+val2;
}
};
template<class Scalar1, class Scalar2>
struct SubOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1-val2;
}
};
template<class Scalar1, class Scalar2>
struct MulOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1*val2;
}
};
template<class Scalar1, class Scalar2>
struct DivOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1/val2;
}
};
template<class Scalar1, class Scalar2>
struct ModOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1%val2;
}
};
template<class Scalar1, class Scalar2>
struct AndOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1&val2;
}
};
template<class Scalar1, class Scalar2>
struct OrOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1|val2;
}
};
template<class Scalar1, class Scalar2>
struct XorOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1^val2;
}
};
template<class Scalar1, class Scalar2>
struct LShiftOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1<<val2;
}
};
template<class Scalar1, class Scalar2>
struct RShiftOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1>>val2;
}
};
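// A hedged illustration (MaxOper is hypothetical, not part of this commit):
// any functor exposing the same static apply() signature slots directly into
// the compare-and-exchange loops below, since these operator structs are the
// only customization point of this header.
template<class Scalar1, class Scalar2>
struct MaxOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1 > val2 ? val1 : val2;
}
};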
template < class Oper, typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
{
union { unsigned long long int i ; T t ; } oldval , assume , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = Oper::apply(assume.t, val) ;
oldval.i = ::Kokkos::atomic_compare_exchange( (unsigned long long int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template < class Oper, typename T >
KOKKOS_INLINE_FUNCTION
T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
{
union { unsigned long long int i ; T t ; } oldval , assume , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = Oper::apply(assume.t, val) ;
oldval.i = ::Kokkos::atomic_compare_exchange( (unsigned long long int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return newval.t ;
}
template < class Oper, typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
{
union { int i ; T t ; } oldval , assume , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = Oper::apply(assume.t, val) ;
oldval.i = ::Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template < class Oper, typename T >
KOKKOS_INLINE_FUNCTION
T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int), const T >::type val )
{
union { int i ; T t ; } oldval , assume , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = Oper::apply(assume.t, val) ;
oldval.i = ::Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return newval.t ;
}
template < class Oper, typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
&& ( sizeof(T) != 16 )
#endif
, const T >::type val )
{
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
*dest = Oper::apply(return_val, val);
Impl::unlock_address_host_space( (void*) dest );
return return_val;
#else
// This is a way to (hopefully) avoid dead lock in a warp
T return_val;
bool done = false;
while (! done ) {
if( Impl::lock_address_cuda_space( (void*) dest ) ) {
return_val = *dest;
*dest = Oper::apply(return_val, val);
Impl::unlock_address_cuda_space( (void*) dest );
done = true;
}
}
return return_val;
#endif
}
template < class Oper, typename T >
KOKKOS_INLINE_FUNCTION
T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
&& ( sizeof(T) != 16 )
#endif
, const T >::type& val )
{
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = Oper::apply(*dest, val);
*dest = return_val;
Impl::unlock_address_host_space( (void*) dest );
return return_val;
#else
// This is a way to (hopefully) avoid dead lock in a warp
T return_val;
bool done = false;
while (! done ) {
if( Impl::lock_address_cuda_space( (void*) dest ) ) {
return_val = Oper::apply(*dest, val);
*dest = return_val;
Impl::unlock_address_cuda_space( (void*) dest );
done = true;
}
}
return return_val;
#endif
}
}
}
namespace Kokkos {
// Fetch_Oper atomics: return value before operation
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_mul(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::MulOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_div(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::DivOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_mod(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::ModOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_and(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::AndOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_or(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::OrOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_xor(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::XorOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_lshift(volatile T * const dest, const unsigned int val) {
return Impl::atomic_fetch_oper(Impl::LShiftOper<T,const unsigned int>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {
return Impl::atomic_fetch_oper(Impl::RShiftOper<T,const unsigned int>(),dest,val);
}
// Oper Fetch atomics: return value after operation
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_mul_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::MulOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_div_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::DivOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_mod_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::ModOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_and_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::AndOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_or_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::OrOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_xor_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::XorOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_lshift_fetch(volatile T * const dest, const unsigned int val) {
return Impl::atomic_oper_fetch(Impl::LShiftOper<T,const unsigned int>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_rshift_fetch(volatile T * const dest, const unsigned int val) {
return Impl::atomic_oper_fetch(Impl::RShiftOper<T,const unsigned int>(),dest,val);
}
}
#endif
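Throughout this header the naming convention is consistent: atomic_fetch_* returns the pre-update value, atomic_*_fetch the post-update value. A minimal sketch (values illustrative):
// Hypothetical usage sketch.
double x = 3.0;
double before = Kokkos::atomic_fetch_mul( &x , 2.0 ); // before == 3.0, x == 6.0
double after  = Kokkos::atomic_div_fetch( &x , 3.0 ); // after == 2.0, x == 2.0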


@ -0,0 +1,462 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ATOMIC_VIEW_HPP
#define KOKKOS_ATOMIC_VIEW_HPP
#include <Kokkos_Macros.hpp>
#include <Kokkos_Atomic.hpp>
namespace Kokkos { namespace Impl {
class AllocationTracker;
// The following tag is used to prevent an implicit constructor call when
// trying to assign a literal 0 ( = 0 );
struct AtomicViewConstTag {};
template<class ViewTraits>
class AtomicDataElement {
public:
typedef typename ViewTraits::value_type value_type;
typedef typename ViewTraits::const_value_type const_value_type;
typedef typename ViewTraits::non_const_value_type non_const_value_type;
volatile value_type* const ptr;
KOKKOS_INLINE_FUNCTION
AtomicDataElement(value_type* ptr_, AtomicViewConstTag ):ptr(ptr_){}
KOKKOS_INLINE_FUNCTION
const_value_type operator = (const_value_type& val) const {
*ptr = val;
return val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator = (volatile const_value_type& val) const {
*ptr = val;
return val;
}
KOKKOS_INLINE_FUNCTION
void inc() const {
Kokkos::atomic_increment(ptr);
}
KOKKOS_INLINE_FUNCTION
void dec() const {
Kokkos::atomic_decrement(ptr);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ++ () const {
const_value_type tmp = Kokkos::atomic_fetch_add(ptr,1);
return tmp+1;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator -- () const {
const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-1);
return tmp-1;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ++ (int) const {
return Kokkos::atomic_fetch_add(ptr,1);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator -- (int) const {
return Kokkos::atomic_fetch_add(ptr,-1);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator += (const_value_type& val) const {
const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
return tmp+val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator += (volatile const_value_type& val) const {
const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
return tmp+val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator -= (const_value_type& val) const {
const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val);
return tmp-val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator -= (volatile const_value_type& val) const {
const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val);
return tmp-val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator *= (const_value_type& val) const {
return Kokkos::atomic_mul_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator *= (volatile const_value_type& val) const {
return Kokkos::atomic_mul_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator /= (const_value_type& val) const {
return Kokkos::atomic_div_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator /= (volatile const_value_type& val) const {
return Kokkos::atomic_div_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator %= (const_value_type& val) const {
return Kokkos::atomic_mod_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator %= (volatile const_value_type& val) const {
return Kokkos::atomic_mod_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator &= (const_value_type& val) const {
return Kokkos::atomic_and_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator &= (volatile const_value_type& val) const {
return Kokkos::atomic_and_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ^= (const_value_type& val) const {
return Kokkos::atomic_xor_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ^= (volatile const_value_type& val) const {
return Kokkos::atomic_xor_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator |= (const_value_type& val) const {
return Kokkos::atomic_or_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator |= (volatile const_value_type& val) const {
return Kokkos::atomic_or_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator <<= (const_value_type& val) const {
return Kokkos::atomic_lshift_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator <<= (volatile const_value_type& val) const {
return Kokkos::atomic_lshift_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator >>= (const_value_type& val) const {
return Kokkos::atomic_rshift_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator >>= (volatile const_value_type& val) const {
return Kokkos::atomic_rshift_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator + (const_value_type& val) const {
return *ptr+val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator + (volatile const_value_type& val) const {
return *ptr+val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator - (const_value_type& val) const {
return *ptr-val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator - (volatile const_value_type& val) const {
return *ptr-val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator * (const_value_type& val) const {
return *ptr*val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator * (volatile const_value_type& val) const {
return *ptr*val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator / (const_value_type& val) const {
return *ptr/val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator / (volatile const_value_type& val) const {
return *ptr/val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator % (const_value_type& val) const {
return *ptr%val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator % (volatile const_value_type& val) const {
return *ptr%val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ! () const {
return !*ptr;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator && (const_value_type& val) const {
return *ptr&&val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator && (volatile const_value_type& val) const {
return *ptr&&val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator || (const_value_type& val) const {
return *ptr||val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator || (volatile const_value_type& val) const {
return *ptr||val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator & (const_value_type& val) const {
return *ptr&val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator & (volatile const_value_type& val) const {
return *ptr&val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator | (const_value_type& val) const {
return *ptr|val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator | (volatile const_value_type& val) const {
return *ptr|val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ^ (const_value_type& val) const {
return *ptr^val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ^ (volatile const_value_type& val) const {
return *ptr^val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ~ () const {
return ~*ptr;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator << (const unsigned int& val) const {
return *ptr<<val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator << (volatile const unsigned int& val) const {
return *ptr<<val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator >> (const unsigned int& val) const {
return *ptr>>val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator >> (volatile const unsigned int& val) const {
return *ptr>>val;
}
KOKKOS_INLINE_FUNCTION
bool operator == (const_value_type& val) const {
return *ptr == val;
}
KOKKOS_INLINE_FUNCTION
bool operator == (volatile const_value_type& val) const {
return *ptr == val;
}
KOKKOS_INLINE_FUNCTION
bool operator != (const_value_type& val) const {
return *ptr != val;
}
KOKKOS_INLINE_FUNCTION
bool operator != (volatile const_value_type& val) const {
return *ptr != val;
}
KOKKOS_INLINE_FUNCTION
bool operator >= (const_value_type& val) const {
return *ptr >= val;
}
KOKKOS_INLINE_FUNCTION
bool operator >= (volatile const_value_type& val) const {
return *ptr >= val;
}
KOKKOS_INLINE_FUNCTION
bool operator <= (const_value_type& val) const {
return *ptr <= val;
}
KOKKOS_INLINE_FUNCTION
bool operator <= (volatile const_value_type& val) const {
return *ptr <= val;
}
KOKKOS_INLINE_FUNCTION
bool operator < (const_value_type& val) const {
return *ptr < val;
}
KOKKOS_INLINE_FUNCTION
bool operator < (volatile const_value_type& val) const {
return *ptr < val;
}
KOKKOS_INLINE_FUNCTION
bool operator > (const_value_type& val) const {
return *ptr > val;
}
KOKKOS_INLINE_FUNCTION
bool operator > (volatile const_value_type& val) const {
return *ptr > val;
}
KOKKOS_INLINE_FUNCTION
operator const_value_type () const {
//return Kokkos::atomic_load(ptr);
return *ptr;
}
KOKKOS_INLINE_FUNCTION
operator volatile non_const_value_type () volatile const {
//return Kokkos::atomic_load(ptr);
return *ptr;
}
};
template<class ViewTraits>
class AtomicViewDataHandle {
public:
typename ViewTraits::value_type* ptr;
KOKKOS_INLINE_FUNCTION
AtomicViewDataHandle()
: ptr(NULL)
{}
KOKKOS_INLINE_FUNCTION
AtomicViewDataHandle(typename ViewTraits::value_type* ptr_)
:ptr(ptr_)
{}
template<class iType>
KOKKOS_INLINE_FUNCTION
AtomicDataElement<ViewTraits> operator[] (const iType& i) const {
return AtomicDataElement<ViewTraits>(ptr+i,AtomicViewConstTag());
}
KOKKOS_INLINE_FUNCTION
operator typename ViewTraits::value_type * () const { return ptr ; }
};
template<unsigned Size>
struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars;
template<>
struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<4> {
typedef int type;
};
template<>
struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
typedef int64_t type;
};
// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics.
template<class ViewTraits>
class ViewDataHandle<
ViewTraits ,
typename enable_if<
( ! is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) &&
( ViewTraits::memory_traits::Atomic )
>::type >
{
private:
// typedef typename if_c<(sizeof(typename ViewTraits::const_value_type)==4) ||
// (sizeof(typename ViewTraits::const_value_type)==8),
// int, Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars >::type
// atomic_view_possible;
typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type;
typedef ViewDataHandle self_type;
public:
enum { ReturnTypeIsReference = false };
typedef Impl::AtomicViewDataHandle<ViewTraits> handle_type;
typedef Impl::AtomicDataElement<ViewTraits> return_type;
KOKKOS_INLINE_FUNCTION
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ )
{
return handle_type(arg_data_ptr);
}
};
}} // namespace Kokkos::Impl
#endif
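A hedged sketch of what this data handle enables at the user level (view name and sizes are illustrative; assumes KOKKOS_HAVE_CXX11 so the lambda compiles):
// Hypothetical usage sketch. Every element access goes through
// AtomicDataElement, so the += below is safe under concurrent updates.
Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Atomic> > hist("hist", 100);
Kokkos::parallel_for( 1000 , KOKKOS_LAMBDA( const int i ) {
hist( i % 100 ) += 1;
});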


@ -0,0 +1,211 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ATOMIC_WINDOWS_HPP
#define KOKKOS_ATOMIC_WINDOWS_HPP
#ifdef _WIN32
#define NOMINMAX
#include <Windows.h>
namespace Kokkos {
namespace Impl {
__declspec(align(16))
struct cas128_t
{
LONGLONG lower;
LONGLONG upper;
KOKKOS_INLINE_FUNCTION
bool operator != (const cas128_t& a) const {
return (lower != a.lower) || (upper != a.upper);
}
};
}
#ifdef KOKKOS_HAVE_CXX11
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T * const dest, const T & compare,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(LONG), const T & >::type val)
{
union U {
LONG i;
T t;
KOKKOS_INLINE_FUNCTION U() {};
} tmp;
tmp.i = _InterlockedCompareExchange((LONG*)dest, *((LONG*)&val), *((LONG*)&compare));
return tmp.t;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T * const dest, const T & compare,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(LONGLONG), const T & >::type val)
{
union U {
LONGLONG i;
T t;
KOKKOS_INLINE_FUNCTION U() {};
} tmp;
tmp.i = _InterlockedCompareExchange64((LONGLONG*)dest, *((LONGLONG*)&val), *((LONGLONG*)&compare));
return tmp.t;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T * const dest, const T & compare,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val)
{
union U {
Impl::cas128_t i;
T t;
KOKKOS_INLINE_FUNCTION U() {};
} tmp, newval;
newval.t = val;
tmp.t = compare;
// _InterlockedCompareExchange128 returns a success flag and writes the previous
// contents of dest into its final argument, so tmp holds the old value either way.
_InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper, newval.i.lower, (LONGLONG*)&tmp.i);
return tmp.t;
}
template< typename T >
T atomic_fetch_or(volatile T * const dest, const T val) {
T oldval = *dest;
T assume;
do {
assume = oldval;
T newval = val | oldval;
oldval = atomic_compare_exchange(dest, assume, newval);
} while (assume != oldval);
return oldval;
}
template< typename T >
T atomic_fetch_and(volatile T * const dest, const T val) {
T oldval = *dest;
T assume;
do {
assume = oldval;
T newval = val & oldval;
oldval = atomic_compare_exchange(dest, assume, newval);
} while (assume != oldval);
return oldval;
}
template< typename T >
T atomic_fetch_add(volatile T * const dest, const T val) {
T oldval = *dest;
T assume;
do {
assume = oldval;
T newval = val + oldval;
oldval = atomic_compare_exchange(dest, assume, newval);
} while (assume != oldval);
return oldval;
}
template< typename T >
T atomic_fetch_exchange(volatile T * const dest, const T val) {
T oldval = *dest;
T assume;
do {
assume = oldval;
oldval = atomic_compare_exchange(dest, assume, val);
} while (assume != oldval);
return oldval;
}
template< typename T >
void atomic_or(volatile T * const dest, const T val) {
atomic_fetch_or(dest, val);
}
template< typename T >
void atomic_and(volatile T * const dest, const T val) {
atomic_fetch_and(dest, val);
}
template< typename T >
void atomic_add(volatile T * const dest, const T val) {
atomic_fetch_add(dest, val);
}
template< typename T >
void atomic_exchange(volatile T * const dest, const T val) {
atomic_fetch_exchange(dest, val);
}
template< typename T >
void atomic_assign(volatile T * const dest, const T val) {
atomic_fetch_exchange(dest, val);
}
template< typename T >
void atomic_increment(volatile T * const dest) {
T oldval = *dest;
T assume;
do {
assume = oldval;
T newval = assume + 1;
oldval = atomic_compare_exchange(dest, assume, newval);
} while (assume != oldval);
}
template< typename T >
void atomic_decrement(volatile T * const dest) {
T oldval = *dest;
T assume;
do {
assume = oldval;
T newval = assume - 1;
oldval = atomic_compare_exchange(dest, assume, newval);
} while (assume != oldval);
}
}
#endif
#endif
#endif


@ -0,0 +1,281 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h> // uintptr_t
#include <cstdlib> // for malloc, realloc, and free
#include <cstring> // for memcpy
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
#include <sstream>
namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
void* MallocAllocator::allocate( size_t size )
{
void * ptr = NULL;
if (size) {
ptr = malloc(size);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void MallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
if (ptr) {
free(ptr);
}
}
void * MallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
{
void * ptr = realloc(old_ptr, new_size);
if (new_size > 0u && ptr == NULL) {
throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
}
return ptr;
}
/*--------------------------------------------------------------------------*/
namespace {
void * raw_aligned_allocate( size_t size, size_t alignment )
{
void * ptr = NULL;
if ( size ) {
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
ptr = _mm_malloc( size , alignment );
#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
posix_memalign( & ptr, alignment , size );
#else
// Over-allocate and round up to guarantee proper alignment.
size_t size_padded = size + alignment + sizeof(void *);
void * alloc_ptr = malloc( size_padded );
if (alloc_ptr) {
uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
// offset enough to record the alloc_ptr
address += sizeof(void *);
uintptr_t rem = address % alignment;
uintptr_t offset = rem ? (alignment - rem) : 0u;
address += offset;
ptr = reinterpret_cast<void *>(address);
// record the alloc'd pointer
address -= sizeof(void *);
*reinterpret_cast<void **>(address) = alloc_ptr;
}
#endif
}
return ptr;
}
void raw_aligned_deallocate( void * ptr, size_t /*size*/ )
{
if ( ptr ) {
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
_mm_free( ptr );
#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
free( ptr );
#else
// get the alloc'd pointer
void * alloc_ptr = *(reinterpret_cast<void **>(ptr) -1);
free( alloc_ptr );
#endif
}
}
}
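// Worked example of the fallback path in raw_aligned_allocate (addresses are
// illustrative): for alignment = 16 and sizeof(void*) = 8, malloc might return
// alloc_ptr = 0x1003; address = 0x1003 + 8 = 0x100b; rem = 0x100b % 16 = 11;
// offset = 16 - 11 = 5; ptr = 0x1010, which is 16-byte aligned. The original
// alloc_ptr is stored at ptr - 8 so that raw_aligned_deallocate can recover
// and free() the true allocation.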
void* AlignedAllocator::allocate( size_t size )
{
void * ptr = 0 ;
if ( size ) {
ptr = raw_aligned_allocate(size, MEMORY_ALIGNMENT);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void AlignedAllocator::deallocate( void * ptr, size_t size )
{
raw_aligned_deallocate( ptr, size);
}
void * AlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size < new_size) {
ptr = allocate( new_size );
memcpy(ptr, old_ptr, old_size );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
// mmap flags for private anonymous memory allocation
#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
#elif defined( MAP_ANON) && defined( MAP_PRIVATE )
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
#else
#define NO_MMAP
#endif
// huge page tables
#if !defined( NO_MMAP )
#if defined( MAP_HUGETLB )
#define MMAP_FLAGS_HUGE (MMAP_FLAGS | MAP_HUGETLB )
#elif defined( MMAP_FLAGS )
#define MMAP_FLAGS_HUGE MMAP_FLAGS
#endif
// threshold to use huge pages
#define MMAP_USE_HUGE_PAGES (1u << 27)
#endif
// read write access to private memory
#if !defined( NO_MMAP )
#define MMAP_PROTECTION (PROT_READ | PROT_WRITE)
#endif
void* PageAlignedAllocator::allocate( size_t size )
{
void *ptr = NULL;
if (size) {
#if !defined( NO_MMAP )
if ( size < MMAP_USE_HUGE_PAGES ) {
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS, -1 /*file descriptor*/, 0 /*offset*/);
} else {
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS_HUGE, -1 /*file descriptor*/, 0 /*offset*/);
}
if (ptr == MAP_FAILED) {
ptr = NULL;
}
#else
static const size_t page_size = 4096; // TODO: read in from sysconf( _SC_PAGE_SIZE )
ptr = raw_aligned_allocate( size, page_size);
#endif
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void PageAlignedAllocator::deallocate( void * ptr, size_t size )
{
#if !defined( NO_MMAP )
munmap(ptr, size);
#else
raw_aligned_deallocate(ptr, size);
#endif
}
void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = NULL;
#if defined( NO_MMAP ) || defined( __APPLE__ )
if (old_size != new_size) {
ptr = allocate( new_size );
memcpy(ptr, old_ptr, (old_size < new_size ? old_size : new_size) );
deallocate( old_ptr, old_size );
}
else {
ptr = old_ptr;
}
#else
ptr = mremap( old_ptr, old_size, new_size, MREMAP_MAYMOVE );
if (ptr == MAP_FAILED) {
throw_runtime_exception("Error: Page Aligned Allocator could not reallocate memory");
}
#endif
return ptr;
}
}} // namespace Kokkos::Impl
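The fallback path in raw_aligned_allocate() over-allocates by sizeof(void*) plus the alignment, rounds the address up, and stores the original malloc() pointer in the word just below the returned address so raw_aligned_deallocate() can recover it. A self-contained sketch of that technique (names and sizes are illustrative, not part of the Kokkos API):

#include <stdint.h>
#include <stdlib.h>
#include <assert.h>

// Illustrative re-implementation of the record-the-original-pointer trick.
void * demo_aligned_alloc( size_t size , size_t alignment )
{
  // Over-allocate: payload + saved pointer + worst-case padding.
  void * raw = malloc( size + sizeof(void*) + alignment );
  if ( 0 == raw ) return 0 ;
  uintptr_t address = reinterpret_cast<uintptr_t>(raw) + sizeof(void*);
  address += ( alignment - ( address % alignment ) ) % alignment ; // round up
  void * user = reinterpret_cast<void*>(address);
  reinterpret_cast<void**>(user)[-1] = raw ; // stash the malloc'd pointer
  return user ;
}

void demo_aligned_free( void * user )
{
  if ( user ) { free( reinterpret_cast<void**>(user)[-1] ); }
}

int main()
{
  void * p = demo_aligned_alloc( 100 , 64 );
  assert( 0 == reinterpret_cast<uintptr_t>(p) % 64 );
  demo_aligned_free( p );
  return 0 ;
}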

View File

@ -0,0 +1,118 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_BASIC_ALLOCATORS_HPP
#define KOKKOS_BASIC_ALLOCATORS_HPP
namespace Kokkos { namespace Impl {
/// class UnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
class UnmanagedAllocator
{
public:
static const char * name() { return "Unmanaged Allocator"; }
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
};
/// class MallocAllocator
class MallocAllocator
{
public:
static const char * name()
{
return "Malloc Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t size);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
/// class AlignedAllocator
/// memory aligned to Kokkos::Impl::MEMORY_ALIGNMENT
class AlignedAllocator
{
public:
static const char * name()
{
return "Aligned Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t size);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
/// class PageAlignedAllocator
/// memory aligned to PAGE_SIZE
class PageAlignedAllocator
{
public:
static const char * name()
{
return "Page Aligned Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t size);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}} // namespace Kokkos::Impl
#endif //KOKKOS_BASIC_ALLOCATORS_HPP
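All three concrete allocators above expose the same static allocate/deallocate/reallocate interface, so client code can select one as a template parameter. A hedged usage sketch (the DemoBuffer type is illustrative, and the include path assumes the usual Kokkos core layout):

#include <string.h>
#include <impl/Kokkos_BasicAllocators.hpp>

// Toy buffer parameterized on any allocator with the static interface above.
template< class Allocator >
struct DemoBuffer {
  void * data ;
  size_t size ;
  explicit DemoBuffer( size_t n ) : data( Allocator::allocate(n) ), size(n) {}
  ~DemoBuffer() { Allocator::deallocate( data , size ); }
  void grow( size_t n ) { data = Allocator::reallocate( data , size , n ); size = n ; }
};

int main()
{
  DemoBuffer< Kokkos::Impl::MallocAllocator > buf( 128 );
  memset( buf.data , 0 , buf.size );
  buf.grow( 256 ); // reallocation through the allocator's static interface
  return 0 ;
}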

View File

@ -0,0 +1,447 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <cctype>
#include <cstring>
#include <iostream>
#include <cstdlib>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
bool is_unsigned_int(const char* str)
{
const size_t len = strlen (str);
for (size_t i = 0; i < len; ++i) {
if (! isdigit (str[i])) {
return false;
}
}
return true;
}
void initialize_internal(const InitArguments& args)
{
// Protect declarations, to prevent "unused variable" warnings.
#if defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD )
const int num_threads = args.num_threads;
const int use_numa = args.num_numa;
#endif // defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD )
#if defined( KOKKOS_HAVE_CUDA )
const int use_gpu = args.device_id;
#endif // defined( KOKKOS_HAVE_CUDA )
#if defined( KOKKOS_HAVE_OPENMP )
if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
if(num_threads>0) {
if(use_numa>0) {
Kokkos::OpenMP::initialize(num_threads,use_numa);
}
else {
Kokkos::OpenMP::initialize(num_threads);
}
} else {
Kokkos::OpenMP::initialize();
}
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
}
else {
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
}
#endif
#if defined( KOKKOS_HAVE_PTHREAD )
if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
if(num_threads>0) {
if(use_numa>0) {
Kokkos::Threads::initialize(num_threads,use_numa);
}
else {
Kokkos::Threads::initialize(num_threads);
}
} else {
Kokkos::Threads::initialize();
}
//std::cout << "Kokkos::initialize() fyi: Pthread enabled and initialized" << std::endl ;
}
else {
//std::cout << "Kokkos::initialize() fyi: Pthread enabled but not initialized" << std::endl ;
}
#endif
#if defined( KOKKOS_HAVE_SERIAL )
// Prevent "unused variable" warning for 'args' input struct. If
// Serial::initialize() ever needs to take arguments from the input
// struct, you may remove this line of code.
(void) args;
if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) {
Kokkos::Serial::initialize();
}
#endif
#if defined( KOKKOS_HAVE_CUDA )
if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {
if (use_gpu > -1) {
Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( use_gpu ) );
}
else {
Kokkos::Cuda::initialize();
}
//std::cout << "Kokkos::initialize() fyi: Cuda enabled and initialized" << std::endl ;
}
#endif
#ifdef KOKKOSP_ENABLE_PROFILING
Kokkos::Experimental::initialize();
#endif
}
void finalize_internal( const bool all_spaces = false )
{
#if defined( KOKKOS_HAVE_CUDA )
if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
if(Kokkos::Cuda::is_initialized())
Kokkos::Cuda::finalize();
}
#endif
#if defined( KOKKOS_HAVE_OPENMP )
if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ||
all_spaces ) {
if(Kokkos::OpenMP::is_initialized())
Kokkos::OpenMP::finalize();
}
#endif
#if defined( KOKKOS_HAVE_PTHREAD )
if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ||
all_spaces ) {
if(Kokkos::Threads::is_initialized())
Kokkos::Threads::finalize();
}
#endif
#if defined( KOKKOS_HAVE_SERIAL )
if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ||
all_spaces ) {
if(Kokkos::Serial::is_initialized())
Kokkos::Serial::finalize();
}
#endif
#ifdef KOKKOSP_ENABLE_PROFILING
Kokkos::Experimental::finalize();
#endif
}
void fence_internal()
{
#if defined( KOKKOS_HAVE_CUDA )
if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value ) {
Kokkos::Cuda::fence();
}
#endif
#if defined( KOKKOS_HAVE_OPENMP )
if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
Kokkos::OpenMP::fence();
}
#endif
#if defined( KOKKOS_HAVE_PTHREAD )
if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
Kokkos::Threads::fence();
}
#endif
#if defined( KOKKOS_HAVE_SERIAL )
if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) {
Kokkos::Serial::fence();
}
#endif
}
} // namespace
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
void initialize(int& narg, char* arg[])
{
int num_threads = -1;
int numa = -1;
int device = -1;
int kokkos_threads_found = 0;
int kokkos_numa_found = 0;
int kokkos_device_found = 0;
int kokkos_ndevices_found = 0;
int iarg = 0;
while (iarg < narg) {
if ((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || (strncmp(arg[iarg],"--threads",9) == 0)) {
//Find the number of threads (expecting --threads=XX)
if (!((strncmp(arg[iarg],"--kokkos-threads=",17) == 0) || (strncmp(arg[iarg],"--threads=",10) == 0)))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
char* number = strchr(arg[iarg],'=')+1;
if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
if((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || !kokkos_threads_found)
num_threads = atoi(number);
//Remove the --kokkos-threads argument from the list but leave --threads
if(strncmp(arg[iarg],"--kokkos-threads",16) == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
kokkos_threads_found=1;
narg--;
} else {
iarg++;
}
} else if ((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || (strncmp(arg[iarg],"--numa",6) == 0)) {
//Find the number of NUMA regions (expecting --numa=XX)
if (!((strncmp(arg[iarg],"--kokkos-numa=",14) == 0) || (strncmp(arg[iarg],"--numa=",7) == 0)))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
char* number = strchr(arg[iarg],'=')+1;
if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
if((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || !kokkos_numa_found)
numa = atoi(number);
//Remove the --kokkos-numa argument from the list but leave --numa
if(strncmp(arg[iarg],"--kokkos-numa",13) == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
kokkos_numa_found=1;
narg--;
} else {
iarg++;
}
} else if ((strncmp(arg[iarg],"--kokkos-device",15) == 0) || (strncmp(arg[iarg],"--device",8) == 0)) {
//Find the device id (expecting --device=XX)
if (!((strncmp(arg[iarg],"--kokkos-device=",16) == 0) || (strncmp(arg[iarg],"--device=",9) == 0)))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
char* number = strchr(arg[iarg],'=')+1;
if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
if((strncmp(arg[iarg],"--kokkos-device",15) == 0) || !kokkos_device_found)
device = atoi(number);
//Remove the --kokkos-device argument from the list but leave --device
if(strncmp(arg[iarg],"--kokkos-device",15) == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
kokkos_device_found=1;
narg--;
} else {
iarg++;
}
} else if ((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || (strncmp(arg[iarg],"--ndevices",10) == 0)) {
//Find the number of devices (expecting --ndevices=XX[,XX])
if (!((strncmp(arg[iarg],"--kokkos-ndevices=",18) == 0) || (strncmp(arg[iarg],"--ndevices=",11) == 0)))
Impl::throw_runtime_exception("Error: expecting an '=INT[,INT]' after command line argument '--ndevices/--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
int ndevices=-1;
int skip_device = 9999;
char* num1 = strchr(arg[iarg],'=')+1;
char* num2 = strpbrk(num1,",");
int num1_len = num2==NULL?strlen(num1):num2-num1;
char* num1_only = new char[num1_len+1];
strncpy(num1_only,num1,num1_len);
num1_only[num1_len]=0;
if(!Impl::is_unsigned_int(num1_only) || (strlen(num1_only)==0)) {
Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
}
if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found)
ndevices = atoi(num1_only);
if( num2 != NULL ) {
if(( !Impl::is_unsigned_int(num2+1) ) || (strlen(num2)==1) )
Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices=XX,'. Raised by Kokkos::initialize(int narg, char* argc[]).");
if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found)
skip_device = atoi(num2+1);
}
if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found) {
char *str;
if ((str = getenv("SLURM_LOCALID"))) {
int local_rank = atoi(str);
device = local_rank % ndevices;
if (device >= skip_device) device++;
}
if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {
int local_rank = atoi(str);
device = local_rank % ndevices;
if (device >= skip_device) device++;
}
if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {
int local_rank = atoi(str);
device = local_rank % ndevices;
if (device >= skip_device) device++;
}
if(device==-1) {
device = 0;
if (device >= skip_device) device++;
}
}
//Remove the --kokkos-ndevices argument from the list but leave --ndevices
if(strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
kokkos_ndevices_found=1;
narg--;
} else {
iarg++;
}
} else if ((strcmp(arg[iarg],"--kokkos-help") == 0) || (strcmp(arg[iarg],"--help") == 0)) {
std::cout << std::endl;
std::cout << "--------------------------------------------------------------------------------" << std::endl;
std::cout << "-------------Kokkos command line arguments--------------------------------------" << std::endl;
std::cout << "--------------------------------------------------------------------------------" << std::endl;
std::cout << "The following arguments exist also without prefix 'kokkos' (e.g. --help)." << std::endl;
std::cout << "The prefixed arguments will be removed from the list by Kokkos::initialize()," << std::endl;
std::cout << "the non-prefixed ones are not removed. Prefixed versions take precedence over " << std::endl;
std::cout << "non prefixed ones, and the last occurence of an argument overwrites prior" << std::endl;
std::cout << "settings." << std::endl;
std::cout << std::endl;
std::cout << "--kokkos-help : print this message" << std::endl;
std::cout << "--kokkos-threads=INT : specify total number of threads or" << std::endl;
std::cout << " number of threads per NUMA region if " << std::endl;
std::cout << " used in conjunction with '--numa' option. " << std::endl;
std::cout << "--kokkos-numa=INT : specify number of NUMA regions used by process." << std::endl;
std::cout << "--kokkos-device=INT : specify device id to be used by Kokkos. " << std::endl;
std::cout << "--kokkos-ndevices=INT[,INT] : used when running MPI jobs. Specify number of" << std::endl;
std::cout << " devices per node to be used. Process to device" << std::endl;
std::cout << " mapping happens by obtaining the local MPI rank" << std::endl;
std::cout << " and assigning devices round-robin. The optional" << std::endl;
std::cout << " second argument allows for an existing device" << std::endl;
std::cout << " to be ignored. This is most useful on workstations" << std::endl;
std::cout << " with multiple GPUs of which one is used to drive" << std::endl;
std::cout << " screen output." << std::endl;
std::cout << std::endl;
std::cout << "--------------------------------------------------------------------------------" << std::endl;
std::cout << std::endl;
//Remove the --kokkos-help argument from the list but leave --help
if(strcmp(arg[iarg],"--kokkos-help") == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
narg--;
} else {
iarg++;
}
} else
iarg++;
}
InitArguments arguments;
arguments.num_threads = num_threads;
arguments.num_numa = numa;
arguments.device_id = device;
Impl::initialize_internal(arguments);
}
void initialize(const InitArguments& arguments) {
Impl::initialize_internal(arguments);
}
void finalize()
{
Impl::finalize_internal();
}
void finalize_all()
{
enum { all_spaces = true };
Impl::finalize_internal( all_spaces );
}
void fence()
{
Impl::fence_internal();
}
} // namespace Kokkos
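Typical entry-point usage of the parsing above: initialize() consumes the --kokkos-* arguments and leaves the unprefixed forms for the application. A minimal sketch:

#include <Kokkos_Core.hpp>

int main( int argc , char* argv[] )
{
  // e.g. invoked as: ./demo --kokkos-threads=4 --kokkos-ndevices=2
  Kokkos::initialize( argc , argv ); // strips the --kokkos-* arguments
  {
    // argc/argv now hold only the application's own arguments;
    // Kokkos views and kernels live inside this scope.
  }
  Kokkos::finalize();
  return 0 ;
}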

View File

@ -0,0 +1,193 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ostream>
#include <sstream>
#include <iomanip>
#include <stdexcept>
#include <impl/Kokkos_Error.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void host_abort( const char * const message )
{
fwrite(message,1,strlen(message),stderr);
fflush(stderr);
abort();
}
void throw_runtime_exception( const std::string & msg )
{
std::ostringstream o ;
o << msg ;
traceback_callstack( o );
throw std::runtime_error( o.str() );
}
std::string human_memory_size(size_t arg_bytes)
{
double bytes = arg_bytes;
const double K = 1024;
const double M = K*1024;
const double G = M*1024;
std::ostringstream out;
if (bytes < K) {
out << std::setprecision(4) << bytes << " B";
} else if (bytes < M) {
bytes /= K;
out << std::setprecision(4) << bytes << " K";
} else if (bytes < G) {
bytes /= M;
out << std::setprecision(4) << bytes << " M";
} else {
bytes /= G;
out << std::setprecision(4) << bytes << " G";
}
return out.str();
}
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __GNUC__ ) && defined( ENABLE_TRACEBACK )
/* This is only known to work with GNU C++
* Must be compiled with '-rdynamic'
* Must be linked with '-ldl'
*/
/* Print call stack into an error stream,
* so one knows in which function the error occurred.
*
* Code copied from:
* http://stupefydeveloper.blogspot.com/2008/10/cc-call-stack.html
*
* License on this site:
* This blog is licensed under a
* Creative Commons Attribution-Share Alike 3.0 Unported License.
*
* http://creativecommons.org/licenses/by-sa/3.0/
*
* Modified to output to std::ostream.
*/
#include <signal.h>
#include <execinfo.h>
#include <cxxabi.h>
#include <dlfcn.h>
#include <stdlib.h>
namespace Kokkos {
namespace Impl {
void traceback_callstack( std::ostream & msg )
{
using namespace abi;
enum { MAX_DEPTH = 32 };
void *trace[MAX_DEPTH];
Dl_info dlinfo;
int status;
int trace_size = backtrace(trace, MAX_DEPTH);
msg << std::endl << "Call stack {" << std::endl ;
for (int i=1; i<trace_size; ++i)
{
if(!dladdr(trace[i], &dlinfo))
continue;
const char * symname = dlinfo.dli_sname;
char * demangled = __cxa_demangle(symname, NULL, 0, &status);
if ( status == 0 && demangled ) {
symname = demangled;
}
if ( symname && *symname != 0 ) {
msg << " object: " << dlinfo.dli_fname
<< " function: " << symname
<< std::endl ;
}
if ( demangled ) {
free(demangled);
}
}
msg << "}" ;
}
}
}
#else
namespace Kokkos {
namespace Impl {
void traceback_callstack( std::ostream & msg )
{
msg << std::endl << "Traceback functionality not available" << std::endl ;
}
}
}
#endif
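human_memory_size() above divides by successive powers of 1024 and prints four significant digits. A small illustration of the thresholds (expected outputs shown as comments; include path assumes the usual Kokkos core layout):

#include <stddef.h>
#include <iostream>
#include <impl/Kokkos_Error.hpp>

int main()
{
  using Kokkos::Impl::human_memory_size ;
  std::cout << human_memory_size( 512 )             << std::endl ; // "512 B"
  std::cout << human_memory_size( 1536 )            << std::endl ; // "1.5 K"
  std::cout << human_memory_size( size_t(3) << 20 ) << std::endl ; // "3 M"
  std::cout << human_memory_size( size_t(5) << 30 ) << std::endl ; // "5 G"
  return 0 ;
}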

View File

@ -0,0 +1,78 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_ERROR_HPP
#define KOKKOS_IMPL_ERROR_HPP
#include <string>
#include <iosfwd>
namespace Kokkos {
namespace Impl {
void host_abort( const char * const );
void throw_runtime_exception( const std::string & );
void traceback_callstack( std::ostream & );
std::string human_memory_size(size_t arg_bytes);
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
namespace Kokkos {
inline
void abort( const char * const message ) { Kokkos::Impl::host_abort(message); }
}
#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */
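throw_runtime_exception() raises a std::runtime_error whose message carries the call stack when tracebacks are available (GNU toolchain with ENABLE_TRACEBACK, compiled with -rdynamic and linked with -ldl), and a short "not available" note otherwise. A hedged sketch of catching it at application level:

#include <iostream>
#include <stdexcept>
#include <impl/Kokkos_Error.hpp>

int main()
{
  try {
    Kokkos::Impl::throw_runtime_exception( "demo failure" );
  }
  catch ( const std::runtime_error & e ) {
    // Message plus traceback (or the "not available" note) from above.
    std::cerr << e.what() << std::endl;
  }
  return 0 ;
}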

File diff suppressed because it is too large

View File

@ -0,0 +1,455 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
/*--------------------------------------------------------------------------*/
#if defined( __INTEL_COMPILER ) && ! defined ( KOKKOS_HAVE_CUDA )
// Intel specialized allocator does not interoperate with CUDA memory allocation
#define KOKKOS_INTEL_MM_ALLOC_AVAILABLE
#endif
/*--------------------------------------------------------------------------*/
#if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
#define KOKKOS_POSIX_MEMALIGN_AVAILABLE
#include <unistd.h>
#include <sys/mman.h>
/* mmap flags for private anonymous memory allocation */
#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
#define KOKKOS_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
#elif defined( MAP_ANON ) && defined( MAP_PRIVATE )
#define KOKKOS_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
#endif
// mmap flags for huge page tables
#if defined( KOKKOS_POSIX_MMAP_FLAGS )
#if defined( MAP_HUGETLB )
#define KOKKOS_POSIX_MMAP_FLAGS_HUGE (KOKKOS_POSIX_MMAP_FLAGS | MAP_HUGETLB )
#else
#define KOKKOS_POSIX_MMAP_FLAGS_HUGE KOKKOS_POSIX_MMAP_FLAGS
#endif
#endif
#endif
/*--------------------------------------------------------------------------*/
#include <stddef.h>
#include <stdlib.h>
#include <stdint.h>
#include <memory.h>
#include <iostream>
#include <sstream>
#include <cstring>
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Kokkos_Atomic.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
DeepCopy<HostSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
{
memcpy( dst , src , n );
}
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace {
static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ;
typedef int (* QuerySpaceInParallelPtr )();
QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ;
int s_in_parallel_query_count = 0 ;
} // namespace <empty>
void HostSpace::register_in_parallel( int (*device_in_parallel)() )
{
if ( 0 == device_in_parallel ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) );
}
int i = -1 ;
if ( ! (device_in_parallel)() ) {
for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
}
if ( i < s_in_parallel_query_count ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) );
}
if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
}
for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
if ( i == s_in_parallel_query_count ) {
s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
}
}
int HostSpace::in_parallel()
{
const int n = s_in_parallel_query_count ;
int i = 0 ;
while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; }
return i < n ;
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
Impl::AllocationTracker HostSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label );
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/* Default allocation mechanism */
HostSpace::HostSpace()
: m_alloc_mech(
#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
HostSpace::INTEL_MM_ALLOC
#elif defined( KOKKOS_POSIX_MMAP_FLAGS )
HostSpace::POSIX_MMAP
#elif defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
HostSpace::POSIX_MEMALIGN
#else
HostSpace::STD_MALLOC
#endif
)
{}
/* Default allocation mechanism */
HostSpace::HostSpace( const HostSpace::AllocationMechanism & arg_alloc_mech )
: m_alloc_mech( HostSpace::STD_MALLOC )
{
if ( arg_alloc_mech == STD_MALLOC ) {
m_alloc_mech = HostSpace::STD_MALLOC ;
}
#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
else if ( arg_alloc_mech == HostSpace::INTEL_MM_ALLOC ) {
m_alloc_mech = HostSpace::INTEL_MM_ALLOC ;
}
#elif defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
else if ( arg_alloc_mech == HostSpace::POSIX_MEMALIGN ) {
m_alloc_mech = HostSpace::POSIX_MEMALIGN ;
}
#elif defined( KOKKOS_POSIX_MMAP_FLAGS )
else if ( arg_alloc_mech == HostSpace::POSIX_MMAP ) {
m_alloc_mech = HostSpace::POSIX_MMAP ;
}
#endif
else {
const char * const mech =
( arg_alloc_mech == HostSpace::INTEL_MM_ALLOC ) ? "INTEL_MM_ALLOC" : (
( arg_alloc_mech == HostSpace::POSIX_MEMALIGN ) ? "POSIX_MEMALIGN" : (
( arg_alloc_mech == HostSpace::POSIX_MMAP ) ? "POSIX_MMAP" : "" ));
std::string msg ;
msg.append("Kokkos::HostSpace ");
msg.append(mech);
msg.append(" is not available" );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
void * HostSpace::allocate( const size_t arg_alloc_size ) const
{
static_assert( sizeof(void*) == sizeof(uintptr_t)
, "Error sizeof(void*) != sizeof(uintptr_t)" );
static_assert( Kokkos::Impl::power_of_two< Kokkos::Impl::MEMORY_ALIGNMENT >::value
, "Memory alignment must be power of two" );
constexpr size_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT ;
constexpr size_t alignment_mask = alignment - 1 ;
void * ptr = NULL;
if ( arg_alloc_size ) {
if ( m_alloc_mech == STD_MALLOC ) {
// Over-allocate to and round up to guarantee proper alignment.
size_t size_padded = arg_alloc_size + sizeof(void*) + alignment ;
void * alloc_ptr = malloc( size_padded );
if (alloc_ptr) {
uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
// offset enough to record the alloc_ptr
address += sizeof(void *);
uintptr_t rem = address % alignment;
uintptr_t offset = rem ? (alignment - rem) : 0u;
address += offset;
ptr = reinterpret_cast<void *>(address);
// record the alloc'd pointer
address -= sizeof(void *);
*reinterpret_cast<void **>(address) = alloc_ptr;
}
}
#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
ptr = _mm_malloc( arg_alloc_size , alignment );
}
#endif
#if defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
else if ( m_alloc_mech == POSIX_MEMALIGN ) {
posix_memalign( & ptr, alignment , arg_alloc_size );
}
#endif
#if defined( KOKKOS_POSIX_MMAP_FLAGS )
else if ( m_alloc_mech == POSIX_MMAP ) {
constexpr size_t use_huge_pages = (1u << 27);
constexpr int prot = PROT_READ | PROT_WRITE ;
const int flags = arg_alloc_size < use_huge_pages
? KOKKOS_POSIX_MMAP_FLAGS
: KOKKOS_POSIX_MMAP_FLAGS_HUGE ;
// read write access to private memory
ptr = mmap( NULL /* address hint, if NULL OS kernel chooses address */
, arg_alloc_size /* size in bytes */
, prot /* memory protection */
, flags /* visibility of updates */
, -1 /* file descriptor */
, 0 /* offset */
);
/* Associated reallocation:
ptr = mremap( old_ptr , old_size , new_size , MREMAP_MAYMOVE );
*/
}
#endif
}
if ( reinterpret_cast<uintptr_t>(ptr) & alignment_mask ) {
Kokkos::Impl::throw_runtime_exception( "Kokkos::HostSpace aligned allocation failed" );
}
return ptr;
}
void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
{
if ( arg_alloc_ptr ) {
if ( m_alloc_mech == STD_MALLOC ) {
void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
free( alloc_ptr );
}
#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
_mm_free( arg_alloc_ptr );
}
#endif
#if defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
else if ( m_alloc_mech == POSIX_MEMALIGN ) {
free( arg_alloc_ptr );
}
#endif
#if defined( KOKKOS_POSIX_MMAP_FLAGS )
else if ( m_alloc_mech == POSIX_MMAP ) {
munmap( arg_alloc_ptr , arg_alloc_size );
}
#endif
}
}
} // namespace Kokkos
namespace Kokkos {
namespace Experimental {
namespace Impl {
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::HostSpace , void >::s_root_record ;
void
SharedAllocationRecord< Kokkos::HostSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
delete static_cast<SharedAllocationRecord*>(arg_rec);
}
SharedAllocationRecord< Kokkos::HostSpace , void >::
~SharedAllocationRecord()
{
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
}
SharedAllocationRecord< Kokkos::HostSpace , void >::
SharedAllocationRecord( const Kokkos::HostSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
)
// Pass through allocated [ SharedAllocationHeader , user_memory ]
// Pass through deallocation function
: SharedAllocationRecord< void , void >
( & SharedAllocationRecord< Kokkos::HostSpace , void >::s_root_record
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
, sizeof(SharedAllocationHeader) + arg_alloc_size
, arg_dealloc
)
, m_space( arg_space )
{
// Fill in the Header information
RecordBase::m_alloc_ptr->m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
strncpy( RecordBase::m_alloc_ptr->m_label
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
}
SharedAllocationRecord< Kokkos::HostSpace , void > *
SharedAllocationRecord< Kokkos::HostSpace , void >::get_record( void * alloc_ptr )
{
typedef SharedAllocationHeader Header ;
typedef SharedAllocationRecord< Kokkos::HostSpace , void > RecordHost ;
SharedAllocationHeader const * const head = Header::get_header( alloc_ptr );
RecordHost * const record = static_cast< RecordHost * >( head->m_record );
if ( record->m_alloc_ptr != head ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void >::get_record ERROR" ) );
}
return record ;
}
// Iterate records to print orphaned memory ...
void SharedAllocationRecord< Kokkos::HostSpace , void >::
print_records( std::ostream & s , const Kokkos::HostSpace & space , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HostSpace" , & s_root_record , detail );
}
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace {
const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF;
const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1];
}
namespace Impl {
void init_lock_array_host_space() {
static int is_initialized = 0;
if(! is_initialized) {
for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++)
HOST_SPACE_ATOMIC_LOCKS[i] = 0;
is_initialized = 1; // flag was never set before, so the table was re-cleared on every call
}
}
bool lock_address_host_space(void* ptr) {
return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[
(( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
0 , 1);
}
void unlock_address_host_space(void* ptr) {
atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[
(( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
0);
}
}
}
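lock_address_host_space() hashes a pointer into the 64K-entry lock table by discarding the two low address bits, masking to 16 bits, and XOR-scrambling the result; the XOR with a constant merely relocates indices within the table, while the mask bounds them. A standalone check of the index computation (constants copied from above):

#include <assert.h>
#include <stddef.h>

int main()
{
  const unsigned MASK     = 0xFFFF ; // HOST_SPACE_ATOMIC_MASK
  const unsigned XOR_MASK = 0x5A39 ; // HOST_SPACE_ATOMIC_XOR_MASK
  int value = 0 ;
  void * ptr = & value ;
  // Same index computation used by lock/unlock above.
  size_t index = ( ( size_t(ptr) >> 2 ) & MASK ) ^ XOR_MASK ;
  assert( index <= MASK ); // always lands inside HOST_SPACE_ATOMIC_LOCKS
  return 0 ;
}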

View File

@ -0,0 +1,73 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE )
#define KOKKOS_MEMORY_FENCE
namespace Kokkos {
//----------------------------------------------------------------------------
KOKKOS_FORCEINLINE_FUNCTION
void memory_fence()
{
#if defined( KOKKOS_ATOMICS_USE_CUDA )
__threadfence();
#elif defined( KOKKOS_ATOMICS_USE_GCC ) || \
( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ATOMICS_USE_INTEL ) )
__sync_synchronize();
#elif defined( KOKKOS_ATOMICS_USE_INTEL )
_mm_mfence();
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
#pragma omp flush
#elif defined( KOKKOS_ATOMICS_USE_WINDOWS )
MemoryBarrier();
#else
#error "Error: memory_fence() not defined"
#endif
}
} // namespace Kokkos
#endif
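A typical use of memory_fence() is publishing data before raising a flag, so a thread that observes the flag also observes the data. A hedged sketch (plain volatile ints for brevity; production code would pair this with Kokkos atomic operations):

#include <Kokkos_Atomic.hpp> // defines KOKKOS_ATOMIC_HPP, enabling memory_fence()

volatile int g_data  = 0 ;
volatile int g_ready = 0 ;

void producer()
{
  g_data = 42 ;             // write the payload
  Kokkos::memory_fence();   // order the payload write before the flag write
  g_ready = 1 ;
}

int consumer()
{
  while ( ! g_ready ) { /* spin */ }
  Kokkos::memory_fence();   // order the flag read before the payload read
  return g_data ;           // observes 42 once g_ready is seen
}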

View File

@ -0,0 +1,84 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP
#define KOKKOS_PHYSICAL_LAYOUT_HPP
#include <Kokkos_View.hpp>
namespace Kokkos {
namespace Impl {
struct PhysicalLayout {
enum LayoutType {Left,Right,Scalar,Error};
LayoutType layout_type;
int rank;
long long int stride[8]; //distance between two neighboring elements in a given dimension
template< class T , class L , class D , class M >
PhysicalLayout( const View<T,L,D,M,ViewDefault> & view )
: layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value ? Left : (
is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error ))
, rank( view.Rank )
{
for(int i=0;i<8;i++) stride[i] = 0;
view.stride( stride );
}
#ifdef KOKKOS_HAVE_CUDA
template< class T , class L , class D , class M >
PhysicalLayout( const View<T,L,D,M,ViewCudaTexture> & view )
: layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value ? Left : (
is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error ))
, rank( view.Rank )
{
for(int i=0;i<8;i++) stride[i] = 0;
view.stride( stride );
}
#endif
};
}
}
#endif
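PhysicalLayout records a View's layout tag, rank, and per-dimension strides. A hedged usage sketch, assuming the legacy View interface this header targets (include path illustrative):

#include <Kokkos_Core.hpp>
#include <impl/Kokkos_PhysicalLayout.hpp>

int main( int argc , char* argv[] )
{
  Kokkos::initialize( argc , argv );
  {
    Kokkos::View< double** , Kokkos::LayoutLeft , Kokkos::HostSpace > a( "A" , 10 , 20 );
    Kokkos::Impl::PhysicalLayout layout( a );
    // layout.layout_type == Kokkos::Impl::PhysicalLayout::Left
    // layout.rank        == 2
    // layout.stride[i]   : distance between neighboring elements in dimension i
  }
  Kokkos::finalize();
  return 0 ;
}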

View File

@ -0,0 +1,57 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOSP_DEVICE_INFO_HPP
#define KOKKOSP_DEVICE_INFO_HPP
namespace Kokkos {
namespace Experimental {
struct KokkosPDeviceInfo {
uint32_t deviceID;
};
}
}
#endif

View File

@ -0,0 +1,141 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <impl/Kokkos_Profiling_Interface.hpp>
#ifdef KOKKOSP_ENABLE_PROFILING
#include <string.h>
namespace Kokkos {
namespace Experimental {
bool profileLibraryLoaded() {
return (NULL != initProfileLibrary);
}
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginForCallee) {
Kokkos::fence();
(*beginForCallee)(kernelPrefix.c_str(), devID, kernelID);
}
};
void endParallelFor(const uint64_t kernelID) {
if(NULL != endForCallee) {
Kokkos::fence();
(*endForCallee)(kernelID);
}
};
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginScanCallee) {
Kokkos::fence();
(*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID);
}
};
void endParallelScan(const uint64_t kernelID) {
if(NULL != endScanCallee) {
Kokkos::fence();
(*endScanCallee)(kernelID);
}
};
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginReduceCallee) {
Kokkos::fence();
(*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
}
};
void endParallelReduce(const uint64_t kernelID) {
if(NULL != endReduceCallee) {
Kokkos::fence();
(*endReduceCallee)(kernelID);
}
};
void initialize() {
void* firstProfileLibrary = NULL;
char* envProfileLibrary = getenv("KOKKOS_PROFILE_LIBRARY");
// getenv() returns NULL when the variable is unset; passing that into
// strtok() would be undefined behavior, so return early in that case.
if(NULL == envProfileLibrary) {
return;
}
char* profileLibraryName = strtok(envProfileLibrary, ";");
if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
if(NULL == firstProfileLibrary) {
std::cerr << "Error: Unable to load KokkosP library: " <<
profileLibraryName << std::endl;
} else {
std::cout << "KOKKOSP: Library Loaded: " << profileLibraryName << std::endl;
beginForCallee = (beginFunction) dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for");
beginScanCallee = (beginFunction) dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan");
beginReduceCallee = (beginFunction) dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce");
endScanCallee = (endFunction) dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan");
endForCallee = (endFunction) dlsym(firstProfileLibrary, "kokkosp_end_parallel_for");
endReduceCallee = (endFunction) dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce");
initProfileLibrary = (initFunction) dlsym(firstProfileLibrary, "kokkosp_init_library");
finalizeProfileLibrary = (finalizeFunction) dlsym(firstProfileLibrary, "kokkosp_finalize_library");
}
}
if(NULL != initProfileLibrary) {
(*initProfileLibrary)(0,
(uint64_t) KOKKOSP_INTERFACE_VERSION,
(uint32_t) 0,
NULL);
}
};
void finalize() {
if(NULL != finalizeProfileLibrary) {
(*finalizeProfileLibrary)();
}
};
}
}
#endif
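initialize() above dlopen()s the library named by KOKKOS_PROFILE_LIBRARY and resolves the kokkosp_* entry points. A minimal tool implementing two of those hooks could look like this (hedged sketch; file and library names are illustrative, compile as a shared object and point the environment variable at it):

// demo_tool.cpp -- build e.g.: g++ -shared -fPIC demo_tool.cpp -o libdemo_tool.so
// then: export KOKKOS_PROFILE_LIBRARY=./libdemo_tool.so
#include <stdio.h>
#include <stdint.h>

extern "C" void kokkosp_begin_parallel_for( const char* name ,
                                            const uint32_t devID ,
                                            uint64_t* kernelID )
{
  static uint64_t next = 0 ;
  *kernelID = next++ ;   // hand back an id the runtime returns at end-of-kernel
  printf( "begin parallel_for %s on device %u (id %llu)\n" ,
          name , (unsigned) devID , (unsigned long long) *kernelID );
}

extern "C" void kokkosp_end_parallel_for( const uint64_t kernelID )
{
  printf( "end parallel_for id %llu\n" , (unsigned long long) kernelID );
}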

View File

@ -0,0 +1,98 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOSP_INTERFACE_HPP
#define KOKKOSP_INTERFACE_HPP
#include <cstddef>
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Macros.hpp>
#include <string>
#ifdef KOKKOSP_ENABLE_PROFILING
#include <impl/Kokkos_Profiling_DeviceInfo.hpp>
#include <dlfcn.h>
#include <iostream>
#include <stdlib.h>
#endif
#define KOKKOSP_INTERFACE_VERSION 20150628
#ifdef KOKKOSP_ENABLE_PROFILING
namespace Kokkos {
namespace Experimental {
typedef void (*initFunction)(const int,
const uint64_t,
const uint32_t,
KokkosPDeviceInfo*);
typedef void (*finalizeFunction)();
typedef void (*beginFunction)(const char*, const uint32_t, uint64_t*);
typedef void (*endFunction)(uint64_t);
static initFunction initProfileLibrary = NULL;
static finalizeFunction finalizeProfileLibrary = NULL;
static beginFunction beginForCallee = NULL;
static beginFunction beginScanCallee = NULL;
static beginFunction beginReduceCallee = NULL;
static endFunction endForCallee = NULL;
static endFunction endScanCallee = NULL;
static endFunction endReduceCallee = NULL;
bool profileLibraryLoaded();
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
void endParallelFor(const uint64_t kernelID);
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
void endParallelScan(const uint64_t kernelID);
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
void endParallelReduce(const uint64_t kernelID);
void initialize();
void finalize();
}
}
#endif
#endif

View File

@ -0,0 +1,119 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <stdlib.h>
#include <sstream>
#include <Kokkos_Serial.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#if defined( KOKKOS_HAVE_SERIAL )
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace SerialImpl {
Sentinel::Sentinel() : m_scratch(0), m_reduce_end(0), m_shared_end(0) {}
Sentinel::~Sentinel()
{
if ( m_scratch ) { free( m_scratch ); }
m_scratch = 0 ;
m_reduce_end = 0 ;
m_shared_end = 0 ;
}
Sentinel & Sentinel::singleton()
{
static Sentinel s ; return s ;
}
inline
unsigned align( unsigned n )
{
enum { ALIGN = 0x0100 /* 256 */ , MASK = ALIGN - 1 };
return ( n + MASK ) & ~MASK ;
}
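// Example: align() rounds up to the next 256-byte boundary, so
// align(1) == 256 , align(256) == 256 , and align(257) == 512.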
} // namespace SerialImpl
SerialTeamMember::SerialTeamMember( int arg_league_rank
, int arg_league_size
, int arg_shared_size
)
: m_space( ((char *) SerialImpl::Sentinel::singleton().m_scratch) + SerialImpl::Sentinel::singleton().m_reduce_end
, arg_shared_size )
, m_league_rank( arg_league_rank )
, m_league_size( arg_league_size )
{}
} // namespace Impl
void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
{
static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton();
reduce_size = Impl::SerialImpl::align( reduce_size );
shared_size = Impl::SerialImpl::align( shared_size );
if ( ( s.m_reduce_end < reduce_size ) ||
( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
if ( s.m_scratch ) { free( s.m_scratch ); }
if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
s.m_scratch = malloc( s.m_shared_end );
}
return s.m_scratch ;
}
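// Example: a first call with reduce_size = 100 and shared_size = 200 aligns
// both requests to 256 bytes, allocates a single 512-byte block, and lays
// out the reduce area in [0,256) and the shared area in [256,512).
// Subsequent calls that fit within these extents reuse the allocation.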
} // namespace Kokkos
#endif // defined( KOKKOS_HAVE_SERIAL )

View File

@@ -0,0 +1,336 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#include <impl/Kokkos_Serial_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_SERIAL )
#include <stdlib.h>
#include <stdexcept>
#include <iostream>
#include <sstream>
#include <string>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
TaskPolicy< Kokkos::Serial >::member_type &
TaskPolicy< Kokkos::Serial >::member_single()
{
static member_type s(0,1,0);
return s ;
}
} // namespace Experimental
} // namespace Kokkos
namespace Kokkos {
namespace Experimental {
namespace Impl {
typedef TaskMember< Kokkos::Serial , void , void > Task ;
//----------------------------------------------------------------------------
namespace {
inline
unsigned padded_sizeof_derived( unsigned sizeof_derived )
{
return sizeof_derived +
( sizeof_derived % sizeof(Task*) ? sizeof(Task*) - sizeof_derived % sizeof(Task*) : 0 );
}
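// Example: with 8-byte pointers, padded_sizeof_derived(44) == 48 and
// padded_sizeof_derived(48) == 48, so the Task*[ dependence_capacity ]
// array appended by allocate() below is always pointer-aligned.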
} // namespace
void Task::deallocate( void * ptr )
{
free( ptr );
}
void * Task::allocate( const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity )
{
return malloc( padded_sizeof_derived( arg_sizeof_derived ) + arg_dependence_capacity * sizeof(Task*) );
}
Task::~TaskMember()
{
}
Task::TaskMember( const Task::function_verify_type arg_verify
, const Task::function_dealloc_type arg_dealloc
, const Task::function_apply_type arg_apply
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: m_dealloc( arg_dealloc )
, m_verify( arg_verify )
, m_apply( arg_apply )
, m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
, m_wait( 0 )
, m_next( 0 )
, m_dep_capacity( arg_dependence_capacity )
, m_dep_size( 0 )
, m_ref_count( 0 )
, m_state( TASK_STATE_CONSTRUCTING )
{
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
}
Task::TaskMember( const Task::function_dealloc_type arg_dealloc
, const Task::function_apply_type arg_apply
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: m_dealloc( arg_dealloc )
, m_verify( & Task::verify_type<void> )
, m_apply( arg_apply )
, m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
, m_wait( 0 )
, m_next( 0 )
, m_dep_capacity( arg_dependence_capacity )
, m_dep_size( 0 )
, m_ref_count( 0 )
, m_state( TASK_STATE_CONSTRUCTING )
{
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
}
//----------------------------------------------------------------------------
void Task::throw_error_add_dependence() const
{
std::cerr << "TaskMember< Serial >::add_dependence ERROR"
<< " state(" << m_state << ")"
<< " dep_size(" << m_dep_size << ")"
<< std::endl ;
throw std::runtime_error("TaskMember< Serial >::add_dependence ERROR");
}
void Task::throw_error_verify_type()
{
throw std::runtime_error("TaskMember< Serial >::verify_type ERROR");
}
//----------------------------------------------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
{
static const char msg_error_header[] = "Kokkos::Experimental::Impl::TaskManager<Kokkos::Serial>::assign ERROR" ;
static const char msg_error_count[] = ": negative reference count" ;
static const char msg_error_complete[] = ": destroy task that is not complete" ;
static const char msg_error_dependences[] = ": destroy task that has dependences" ;
static const char msg_error_exception[] = ": caught internal exception" ;
const char * msg_error = 0 ;
try {
if ( *lhs ) {
const int count = --((**lhs).m_ref_count);
if ( 0 == count ) {
// Reference count at zero, delete it
// Should only be deallocating a completed task
if ( (**lhs).m_state == Kokkos::Experimental::TASK_STATE_COMPLETE ) {
// A completed task should not have dependences...
for ( int i = 0 ; i < (**lhs).m_dep_size && 0 == msg_error ; ++i ) {
if ( (**lhs).m_dep[i] ) msg_error = msg_error_dependences ;
}
}
else {
msg_error = msg_error_complete ;
}
if ( 0 == msg_error ) {
// Get deletion function and apply it
const Task::function_dealloc_type d = (**lhs).m_dealloc ;
(*d)( *lhs );
}
}
else if ( count <= 0 ) {
msg_error = msg_error_count ;
}
}
if ( 0 == msg_error && rhs ) { ++( rhs->m_ref_count ); }
*lhs = rhs ;
}
catch( ... ) {
if ( 0 == msg_error ) msg_error = msg_error_exception ;
}
if ( 0 != msg_error ) {
if ( no_throw ) {
std::cerr << msg_error_header << msg_error << std::endl ;
std::cerr.flush();
}
else {
std::string msg(msg_error_header);
msg.append(msg_error);
throw std::runtime_error( msg );
}
}
}
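// Usage sketch (illustrative): assign() implements intrusive reference
// counting on Task pointers.
//
//   Task * slot = 0 ;
//   assign( & slot , t );  // slot == t and t->m_ref_count is incremented
//   assign( & slot , 0 );  // count decremented; a complete task whose
//                          // count reaches zero is deallocated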
#endif
namespace {
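// s_ready  : head of the singly linked list of tasks ready to execute.
// s_denied : sentinel value stored in a completed task's m_wait so that
//            no further task can insert itself into that wait list.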
Task * s_ready = 0 ;
Task * s_denied = reinterpret_cast<Task*>( ~((unsigned long)0) );
}
void Task::schedule()
{
// Execute ready tasks in case the task being scheduled
// is dependent upon a waiting and ready task.
Task::execute_ready_tasks();
// spawning : Constructing -> Waiting
// respawning : Executing -> Waiting
// updating : Waiting -> Waiting
// Must not be in a dependence linked list: 0 == m_next
const bool ok_state = TASK_STATE_COMPLETE != m_state ;
const bool ok_list = 0 == m_next ;
if ( ok_state && ok_list ) {
// Will be waiting for execution upon return from this function
m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
// Insert this task into another dependence that is not complete
int i = 0 ;
for ( ; i < m_dep_size ; ++i ) {
Task * const y = m_dep[i] ;
if ( y && s_denied != ( m_next = y->m_wait ) ) {
y->m_wait = this ; // CAS( & y->m_wait , m_next , this );
break ;
}
}
if ( i == m_dep_size ) {
// All dependences are complete, insert into the ready list
m_next = s_ready ;
s_ready = this ; // CAS( & s_ready , m_next = s_ready , this );
}
}
else {
throw std::runtime_error(std::string("Kokkos::Experimental::Impl::Task spawn or respawn state error"));
}
}
void Task::execute_ready_tasks()
{
while ( s_ready ) {
// Remove this task from the ready list
// Task * task ;
// while ( ! CAS( & s_ready , task = s_ready , s_ready->m_next ) );
Task * const task = s_ready ;
s_ready = task->m_next ;
task->m_next = 0 ;
// precondition: task->m_state = TASK_STATE_WAITING
// precondition: task->m_dep[i]->m_state == TASK_STATE_COMPLETE for all i
// precondition: does not exist T such that T->m_wait = task
// precondition: does not exist T such that T->m_next = task
task->m_state = Kokkos::Experimental::TASK_STATE_EXECUTING ;
(*task->m_apply)( task );
if ( task->m_state == Kokkos::Experimental::TASK_STATE_EXECUTING ) {
// task did not respawn itself
task->m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
// release dependences:
for ( int i = 0 ; i < task->m_dep_size ; ++i ) {
assign( task->m_dep + i , 0 );
}
// Stop other tasks from adding themselves to 'task->m_wait'
Task * x ;
// CAS( & task->m_wait , x = task->m_wait , s_denied );
x = task->m_wait ; task->m_wait = s_denied ;
// update tasks waiting on this task
while ( x ) {
Task * const next = x->m_next ;
x->m_next = 0 ;
x->schedule(); // could happen concurrently
x = next ;
}
}
}
}
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
#endif // defined( KOKKOS_HAVE_SERIAL )

View File

@@ -0,0 +1,845 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#ifndef KOKKOS_SERIAL_TASKPOLICY_HPP
#define KOKKOS_SERIAL_TASKPOLICY_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_HAVE_SERIAL )
#include <string>
#include <typeinfo>
#include <stdexcept>
#include <Kokkos_Serial.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_View.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
//----------------------------------------------------------------------------
/* Inheritance structure to allow static_cast between the task root type
 * and a task's FunctorType.
*
* task_root_type == TaskMember< Space , void , void >
*
* TaskMember< PolicyType , ResultType , FunctorType >
* : TaskMember< PolicyType::Space , ResultType , FunctorType >
* { ... };
*
* TaskMember< Space , ResultType , FunctorType >
* : TaskMember< Space , ResultType , void >
* , FunctorType
* { ... };
*
* when ResultType != void
*
* TaskMember< Space , ResultType , void >
* : TaskMember< Space , void , void >
* { ... };
*
*/
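// Concrete example (illustrative; 'MyFunctor' is a hypothetical user
// functor with 'typedef double value_type'). A single-thread task is
// allocated as
//
//   TaskMember< Kokkos::Serial , double , MyFunctor >
//     : TaskMember< Kokkos::Serial , double , void >  // holds m_result
//     , MyFunctor                                     // holds user state
//
// so a Future< double , Kokkos::Serial > can static_cast the root task
// pointer to read m_result, and apply_single() can cast the same pointer
// to invoke MyFunctor.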
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief Base class for all tasks in the Serial execution space */
template<>
class TaskMember< Kokkos::Serial , void , void >
{
public:
typedef void (* function_apply_type) ( TaskMember * );
typedef void (* function_dealloc_type)( TaskMember * );
typedef TaskMember * (* function_verify_type) ( TaskMember * );
private:
const function_dealloc_type m_dealloc ; ///< Deallocation
const function_verify_type m_verify ; ///< Result type verification
const function_apply_type m_apply ; ///< Apply function
TaskMember ** const m_dep ; ///< Dependences
TaskMember * m_wait ; ///< Linked list of tasks waiting on this task
TaskMember * m_next ; ///< Linked list of tasks waiting on a different task
const int m_dep_capacity ; ///< Capacity of dependences
int m_dep_size ; ///< Actual count of dependences
int m_ref_count ; ///< Reference count
int m_state ; ///< State of the task
// size = 6 Pointers + 4 ints
TaskMember() /* = delete */ ;
TaskMember( const TaskMember & ) /* = delete */ ;
TaskMember & operator = ( const TaskMember & ) /* = delete */ ;
static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity );
static void deallocate( void * );
void throw_error_add_dependence() const ;
static void throw_error_verify_type();
template < class DerivedTaskType >
static
void deallocate( TaskMember * t )
{
DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t);
ptr->~DerivedTaskType();
deallocate( (void *) ptr );
}
protected :
~TaskMember();
// Used by TaskMember< Serial , ResultType , void >
TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_apply_type arg_apply
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
);
// Used for TaskMember< Serial , void , void >
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_type arg_apply
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
);
public:
template< typename ResultType >
KOKKOS_FUNCTION static
TaskMember * verify_type( TaskMember * t )
{
enum { check_type = ! Kokkos::Impl::is_same< ResultType , void >::value };
if ( check_type && t != 0 ) {
// Verify that t->m_verify is this function
const function_verify_type self = & TaskMember::template verify_type< ResultType > ;
if ( t->m_verify != self ) {
t = 0 ;
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
throw_error_verify_type();
#endif
}
}
return t ;
}
//----------------------------------------
/* Inheritance requirements on task types:
* typedef FunctorType::value_type value_type ;
* class DerivedTaskType
* : public TaskMember< Serial , value_type , FunctorType >
* { ... };
* class TaskMember< Serial , value_type , FunctorType >
* : public TaskMember< Serial , value_type , void >
* , public Functor
* { ... };
* If value_type != void
* class TaskMember< Serial , value_type , void >
* : public TaskMember< Serial , void , void >
*
* Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
*
*/
/** \brief Allocate and construct a single-thread task */
template< class DerivedTaskType >
static
TaskMember * create( const typename DerivedTaskType::functor_type & arg_functor
, const unsigned arg_dependence_capacity
)
{
typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename functor_type::value_type value_type ;
DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
, & TaskMember::template apply_single< functor_type , value_type >
, sizeof(DerivedTaskType)
, arg_dependence_capacity
, arg_functor );
return static_cast< TaskMember * >( task );
}
/** \brief Allocate and construct a data parallel task */
template< class DerivedTaskType >
static
TaskMember * create( const typename DerivedTaskType::policy_type & arg_policy
, const typename DerivedTaskType::functor_type & arg_functor
, const unsigned arg_dependence_capacity
)
{
DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
, sizeof(DerivedTaskType)
, arg_dependence_capacity
, arg_policy
, arg_functor
);
return static_cast< TaskMember * >( task );
}
/** \brief Allocate and construct a thread-team task */
template< class DerivedTaskType >
static
TaskMember * create_team( const typename DerivedTaskType::functor_type & arg_functor
, const unsigned arg_dependence_capacity
)
{
typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename functor_type::value_type value_type ;
DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
, & TaskMember::template apply_team< functor_type , value_type >
, sizeof(DerivedTaskType)
, arg_dependence_capacity
, arg_functor );
return static_cast< TaskMember * >( task );
}
void schedule();
static void execute_ready_tasks();
//----------------------------------------
typedef FutureValueTypeIsVoidError get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const { return get_result_type() ; }
KOKKOS_INLINE_FUNCTION
Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }
//----------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
static
void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false );
#else
KOKKOS_INLINE_FUNCTION static
void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {}
#endif
KOKKOS_INLINE_FUNCTION
TaskMember * get_dependence( int i ) const
{ return ( Kokkos::Experimental::TASK_STATE_EXECUTING == m_state && 0 <= i && i < m_dep_size ) ? m_dep[i] : (TaskMember*) 0 ; }
KOKKOS_INLINE_FUNCTION
int get_dependence() const
{ return m_dep_size ; }
KOKKOS_INLINE_FUNCTION
void clear_dependence()
{
for ( int i = 0 ; i < m_dep_size ; ++i ) assign( m_dep + i , 0 );
m_dep_size = 0 ;
}
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskMember * before )
{
if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == m_state ||
Kokkos::Experimental::TASK_STATE_EXECUTING == m_state ) &&
m_dep_size < m_dep_capacity ) {
assign( m_dep + m_dep_size , before );
++m_dep_size ;
}
else {
throw_error_add_dependence();
}
}
//----------------------------------------
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
// TaskMember< Kokkos::Serial , ResultType , FunctorType >
// : public TaskMember< Kokkos::Serial , ResultType , void >
// , public FunctorType
// { ... };
derived_type & m = * static_cast< derived_type * >( t );
Kokkos::Impl::FunctorApply< FunctorType , void , ResultType & >::apply( (FunctorType &) m , & m.m_result );
}
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
// TaskMember< Kokkos::Serial , ResultType , FunctorType >
// : public TaskMember< Kokkos::Serial , ResultType , void >
// , public FunctorType
// { ... };
derived_type & m = * static_cast< derived_type * >( t );
Kokkos::Impl::FunctorApply< FunctorType , void , void >::apply( (FunctorType &) m );
}
//----------------------------------------
template< class FunctorType , class ResultType >
static
void apply_team( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
typedef Kokkos::Impl::SerialTeamMember member_type ;
// TaskMember< Kokkos::Serial , ResultType , FunctorType >
// : public TaskMember< Kokkos::Serial , ResultType , void >
// , public FunctorType
// { ... };
derived_type & m = * static_cast< derived_type * >( t );
m.FunctorType::apply( member_type(0,1,0) , m.m_result );
}
template< class FunctorType , class ResultType >
static
void apply_team( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
typedef Kokkos::Impl::SerialTeamMember member_type ;
// TaskMember< Kokkos::Serial , ResultType , FunctorType >
// : public TaskMember< Kokkos::Serial , ResultType , void >
// , public FunctorType
// { ... };
derived_type & m = * static_cast< derived_type * >( t );
m.FunctorType::apply( member_type(0,1,0) );
}
};
//----------------------------------------------------------------------------
/** \brief Base class for tasks with a result value in the Serial execution space.
*
 * The FunctorType template parameter is void because this class is
 * accessed by the Future class, which knows the result type but not
 * the functor type.
*
* Must be derived from TaskMember<S,void,void> 'root class' so the Future class
* can correctly static_cast from the 'root class' to this class.
*/
template < class ResultType >
class TaskMember< Kokkos::Serial , ResultType , void >
: public TaskMember< Kokkos::Serial , void , void >
{
public:
ResultType m_result ;
typedef const ResultType & get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const { return m_result ; }
protected:
typedef TaskMember< Kokkos::Serial , void , void > task_root_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_type function_apply_type ;
inline
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_type arg_apply
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: task_root_type( & task_root_type::template verify_type< ResultType >
, arg_dealloc
, arg_apply
, arg_sizeof_derived
, arg_dependence_capacity )
, m_result()
{}
};
template< class ResultType , class FunctorType >
class TaskMember< Kokkos::Serial , ResultType , FunctorType >
: public TaskMember< Kokkos::Serial , ResultType , void >
, public FunctorType
{
public:
typedef FunctorType functor_type ;
typedef TaskMember< Kokkos::Serial , void , void > task_root_type ;
typedef TaskMember< Kokkos::Serial , ResultType , void > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_type function_apply_type ;
inline
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_type arg_apply
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
, const functor_type & arg_functor
)
: task_base_type( arg_dealloc , arg_apply , arg_sizeof_derived , arg_dependence_capacity )
, functor_type( arg_functor )
{}
};
//----------------------------------------------------------------------------
/** \brief ForEach task in the Serial execution space
*
* Derived from TaskMember< Kokkos::Serial , ResultType , FunctorType >
* so that Functor can be cast to task root type without knowing policy.
*/
template< class Arg0 , class Arg1 , class Arg2 , class ResultType , class FunctorType >
class TaskForEach< Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
, ResultType
, FunctorType >
: TaskMember< Kokkos::Serial , ResultType , FunctorType >
{
public:
typedef FunctorType functor_type ;
typedef RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > policy_type ;
private:
friend class Kokkos::Experimental::TaskPolicy< Kokkos::Serial > ;
friend class Kokkos::Experimental::Impl::TaskMember< Kokkos::Serial , void , void > ;
typedef TaskMember< Kokkos::Serial , void , void > task_root_type ;
typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
policy_type m_policy ;
template< class Tag >
inline
typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same<Tag,void>::value >::type
apply_policy() const
{
const typename policy_type::member_type e = m_policy.end();
for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
functor_type::operator()(i);
}
}
template< class Tag >
inline
typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same<Tag,void>::value >::type
apply_policy() const
{
const Tag tag ;
const typename policy_type::member_type e = m_policy.end();
for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
functor_type::operator()(tag,i);
}
}
static
void apply_parallel( task_root_type * t )
{
static_cast<TaskForEach*>(t)->template apply_policy< typename policy_type::work_tag >();
task_root_type::template apply_single< functor_type , ResultType >( t );
}
TaskForEach( const function_dealloc_type arg_dealloc
, const int arg_sizeof_derived
, const int arg_dependence_capacity
, const policy_type & arg_policy
, const functor_type & arg_functor
)
: task_base_type( arg_dealloc
, & apply_parallel
, arg_sizeof_derived
, arg_dependence_capacity
, arg_functor )
, m_policy( arg_policy )
{}
TaskForEach() /* = delete */ ;
TaskForEach( const TaskForEach & ) /* = delete */ ;
TaskForEach & operator = ( const TaskForEach & ) /* = delete */ ;
};
//----------------------------------------------------------------------------
/** \brief Reduce task in the Serial execution space
*
* Derived from TaskMember< Kokkos::Serial , ResultType , FunctorType >
* so that Functor can be cast to task root type without knowing policy.
*/
template< class Arg0 , class Arg1 , class Arg2 , class ResultType , class FunctorType >
class TaskReduce< Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
, ResultType
, FunctorType >
: TaskMember< Kokkos::Serial , ResultType , FunctorType >
{
public:
typedef FunctorType functor_type ;
typedef RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > policy_type ;
private:
friend class Kokkos::Experimental::TaskPolicy< Kokkos::Serial > ;
friend class Kokkos::Experimental::Impl::TaskMember< Kokkos::Serial , void , void > ;
typedef TaskMember< Kokkos::Serial , void , void > task_root_type ;
typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
policy_type m_policy ;
template< class Tag >
inline
void apply_policy( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same<Tag,void>::value , ResultType & >::type result ) const
{
Kokkos::Impl::FunctorValueInit< functor_type , Tag >::init( *this , & result );
const typename policy_type::member_type e = m_policy.end();
for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
functor_type::operator()( i, result );
}
}
template< class Tag >
inline
void apply_policy( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same<Tag,void>::value , ResultType & >::type result ) const
{
Kokkos::Impl::FunctorValueInit< functor_type , Tag >::init( *this , & result );
const Tag tag ;
const typename policy_type::member_type e = m_policy.end();
for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
functor_type::operator()( tag, i, result );
}
}
static
void apply_parallel( task_root_type * t )
{
TaskReduce * const task = static_cast<TaskReduce*>(t);
task->template apply_policy< typename policy_type::work_tag >( task->task_base_type::m_result );
task_root_type::template apply_single< functor_type , ResultType >( t );
}
TaskReduce( const function_dealloc_type arg_dealloc
, const int arg_sizeof_derived
, const int arg_dependence_capacity
, const policy_type & arg_policy
, const functor_type & arg_functor
)
: task_base_type( arg_dealloc
, & apply_parallel
, arg_sizeof_derived
, arg_dependence_capacity
, arg_functor )
, m_policy( arg_policy )
{}
TaskReduce() /* = delete */ ;
TaskReduce( const TaskReduce & ) /* = delete */ ;
TaskReduce & operator = ( const TaskReduce & ) /* = delete */ ;
};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
template<>
class TaskPolicy< Kokkos::Serial >
{
public:
typedef Kokkos::Serial execution_space ;
typedef Kokkos::Impl::SerialTeamMember member_type ;
private:
typedef Impl::TaskMember< execution_space , void , void > task_root_type ;
template< class FunctorType >
static inline
const task_root_type * get_task_root( const FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
}
template< class FunctorType >
static inline
task_root_type * get_task_root( FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< task_root_type * >( static_cast< task_type * >(f) );
}
unsigned m_default_dependence_capacity ;
public:
KOKKOS_INLINE_FUNCTION
TaskPolicy() : m_default_dependence_capacity(4) {}
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy & rhs ) : m_default_dependence_capacity( rhs.m_default_dependence_capacity ) {}
KOKKOS_INLINE_FUNCTION
explicit
TaskPolicy( const unsigned arg_default_dependence_capacity )
: m_default_dependence_capacity( arg_default_dependence_capacity ) {}
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy &
, const unsigned arg_default_dependence_capacity )
: m_default_dependence_capacity( arg_default_dependence_capacity ) {}
TaskPolicy & operator = ( const TaskPolicy &rhs )
{
m_default_dependence_capacity = rhs.m_default_dependence_capacity;
return *this;
}
//----------------------------------------
template< class ValueType >
KOKKOS_INLINE_FUNCTION
const Future< ValueType , execution_space > &
spawn( const Future< ValueType , execution_space > & f ) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
f.m_task->schedule();
#endif
return f ;
}
// Create single-thread task
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create( const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create< task_type >(
functor , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
#endif
);
}
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create_team( const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create_team< task_type >(
functor , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
#endif
);
}
// Create parallel foreach task
template< class PolicyType , class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create_foreach( const PolicyType & policy
, const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskForEach< PolicyType , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create< task_type >( policy , functor ,
( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
#endif
);
}
// Create parallel reduce task
template< class PolicyType , class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create_reduce( const PolicyType & policy
, const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskReduce< PolicyType , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create< task_type >( policy , functor ,
( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
#endif
);
}
// Add dependence
template< class A1 , class A2 , class A3 , class A4 >
KOKKOS_INLINE_FUNCTION
void add_dependence( const Future<A1,A2> & after
, const Future<A3,A4> & before
, typename Kokkos::Impl::enable_if
< Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
&&
Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
after.m_task->add_dependence( before.m_task );
#endif
}
//----------------------------------------
// Functions for an executing task functor to query dependences,
// set new dependences, and respawn itself.
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< void , execution_space >
get_dependence( const FunctorType * task_functor , int i ) const
{
return Future<void,execution_space>(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
get_task_root(task_functor)->get_dependence(i)
#endif
);
}
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
int get_dependence( const FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return get_task_root(task_functor)->get_dependence(); }
#else
{ return 0 ; }
#endif
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void clear_dependence( FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->clear_dependence(); }
#else
{}
#endif
template< class FunctorType , class A3 , class A4 >
KOKKOS_INLINE_FUNCTION
void add_dependence( FunctorType * task_functor
, const Future<A3,A4> & before
, typename Kokkos::Impl::enable_if
< Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->add_dependence( before.m_task ); }
#else
{}
#endif
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void respawn( FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->schedule(); }
#else
{}
#endif
//----------------------------------------
static member_type & member_single();
};
inline
void wait( TaskPolicy< Kokkos::Serial > & )
{ Impl::TaskMember< Kokkos::Serial , void , void >::execute_ready_tasks(); }
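// Usage sketch (illustrative; 'MyFunctor' is a hypothetical functor that
// defines 'value_type' and the apply() interface expected above):
//
//   Kokkos::Experimental::TaskPolicy< Kokkos::Serial > policy ;
//   Kokkos::Experimental::Future< double , Kokkos::Serial > f =
//     policy.spawn( policy.create( MyFunctor() ) );
//   Kokkos::Experimental::wait( policy );  // executes all ready tasks
//   double result = f.get();               // read the task's result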
} /* namespace Experimental */
} // namespace Kokkos
//----------------------------------------------------------------------------
#endif /* defined( KOKKOS_HAVE_SERIAL ) */
#endif /* #define KOKKOS_SERIAL_TASKPOLICY_HPP */

View File

@@ -0,0 +1,178 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <sstream>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Shape.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void assert_counts_are_equal_throw(
const size_t x_count ,
const size_t y_count )
{
std::ostringstream msg ;
msg << "Kokkos::Impl::assert_counts_are_equal_throw( "
<< x_count << " != " << y_count << " )" ;
throw_runtime_exception( msg.str() );
}
void assert_shapes_are_equal_throw(
const unsigned x_scalar_size ,
const unsigned x_rank ,
const size_t x_N0 , const unsigned x_N1 ,
const unsigned x_N2 , const unsigned x_N3 ,
const unsigned x_N4 , const unsigned x_N5 ,
const unsigned x_N6 , const unsigned x_N7 ,
const unsigned y_scalar_size ,
const unsigned y_rank ,
const size_t y_N0 , const unsigned y_N1 ,
const unsigned y_N2 , const unsigned y_N3 ,
const unsigned y_N4 , const unsigned y_N5 ,
const unsigned y_N6 , const unsigned y_N7 )
{
std::ostringstream msg ;
msg << "Kokkos::Impl::assert_shape_are_equal_throw( {"
<< " scalar_size(" << x_scalar_size
<< ") rank(" << x_rank
<< ") dimension(" ;
if ( 0 < x_rank ) { msg << " " << x_N0 ; }
if ( 1 < x_rank ) { msg << " " << x_N1 ; }
if ( 2 < x_rank ) { msg << " " << x_N2 ; }
if ( 3 < x_rank ) { msg << " " << x_N3 ; }
if ( 4 < x_rank ) { msg << " " << x_N4 ; }
if ( 5 < x_rank ) { msg << " " << x_N5 ; }
if ( 6 < x_rank ) { msg << " " << x_N6 ; }
if ( 7 < x_rank ) { msg << " " << x_N7 ; }
msg << " ) } != { "
<< " scalar_size(" << y_scalar_size
<< ") rank(" << y_rank
<< ") dimension(" ;
if ( 0 < y_rank ) { msg << " " << y_N0 ; }
if ( 1 < y_rank ) { msg << " " << y_N1 ; }
if ( 2 < y_rank ) { msg << " " << y_N2 ; }
if ( 3 < y_rank ) { msg << " " << y_N3 ; }
if ( 4 < y_rank ) { msg << " " << y_N4 ; }
if ( 5 < y_rank ) { msg << " " << y_N5 ; }
if ( 6 < y_rank ) { msg << " " << y_N6 ; }
if ( 7 < y_rank ) { msg << " " << y_N7 ; }
msg << " ) } )" ;
throw_runtime_exception( msg.str() );
}
void AssertShapeBoundsAbort< Kokkos::HostSpace >::apply(
const size_t rank ,
const size_t n0 , const size_t n1 ,
const size_t n2 , const size_t n3 ,
const size_t n4 , const size_t n5 ,
const size_t n6 , const size_t n7 ,
const size_t arg_rank ,
const size_t i0 , const size_t i1 ,
const size_t i2 , const size_t i3 ,
const size_t i4 , const size_t i5 ,
const size_t i6 , const size_t i7 )
{
std::ostringstream msg ;
msg << "Kokkos::Impl::AssertShapeBoundsAbort( shape = {" ;
if ( 0 < rank ) { msg << " " << n0 ; }
if ( 1 < rank ) { msg << " " << n1 ; }
if ( 2 < rank ) { msg << " " << n2 ; }
if ( 3 < rank ) { msg << " " << n3 ; }
if ( 4 < rank ) { msg << " " << n4 ; }
if ( 5 < rank ) { msg << " " << n5 ; }
if ( 6 < rank ) { msg << " " << n6 ; }
if ( 7 < rank ) { msg << " " << n7 ; }
msg << " } index = {" ;
if ( 0 < arg_rank ) { msg << " " << i0 ; }
if ( 1 < arg_rank ) { msg << " " << i1 ; }
if ( 2 < arg_rank ) { msg << " " << i2 ; }
if ( 3 < arg_rank ) { msg << " " << i3 ; }
if ( 4 < arg_rank ) { msg << " " << i4 ; }
if ( 5 < arg_rank ) { msg << " " << i5 ; }
if ( 6 < arg_rank ) { msg << " " << i6 ; }
if ( 7 < arg_rank ) { msg << " " << i7 ; }
msg << " } )" ;
throw_runtime_exception( msg.str() );
}
void assert_shape_effective_rank1_at_leastN_throw(
const size_t x_rank , const size_t x_N0 ,
const size_t x_N1 , const size_t x_N2 ,
const size_t x_N3 , const size_t x_N4 ,
const size_t x_N5 , const size_t x_N6 ,
const size_t x_N7 ,
const size_t N0 )
{
std::ostringstream msg ;
msg << "Kokkos::Impl::assert_shape_effective_rank1_at_leastN_throw( shape = {" ;
if ( 0 < x_rank ) { msg << " " << x_N0 ; }
if ( 1 < x_rank ) { msg << " " << x_N1 ; }
if ( 2 < x_rank ) { msg << " " << x_N2 ; }
if ( 3 < x_rank ) { msg << " " << x_N3 ; }
if ( 4 < x_rank ) { msg << " " << x_N4 ; }
if ( 5 < x_rank ) { msg << " " << x_N5 ; }
if ( 6 < x_rank ) { msg << " " << x_N6 ; }
if ( 7 < x_rank ) { msg << " " << x_N7 ; }
msg << " } N = " << N0 << " )" ;
throw_runtime_exception( msg.str() );
}
} // namespace Impl
} // namespace Kokkos

View File

@@ -0,0 +1,917 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_SHAPE_HPP
#define KOKKOS_SHAPE_HPP
#include <typeinfo>
#include <utility>
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
/** \brief The shape of a Kokkos array with dynamic and static dimensions.
 * Dynamic dimensions are member values and static dimensions are
 * compile-time 'enum' constants.
*
* The upper bound on the array rank is eight.
*/
template< unsigned ScalarSize ,
unsigned Rank ,
unsigned s0 = 1 ,
unsigned s1 = 1 ,
unsigned s2 = 1 ,
unsigned s3 = 1 ,
unsigned s4 = 1 ,
unsigned s5 = 1 ,
unsigned s6 = 1 ,
unsigned s7 = 1 >
struct Shape ;
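/* Example (illustrative): a rank-3 array of double with one runtime
 * dimension and static extents 3 and 4 -- e.g. View< double*[3][4] > --
 * has shape type Shape< sizeof(double) , 3 , 0 , 3 , 4 >. A zero in a
 * static-extent slot marks that dimension as dynamic, selecting the
 * "1 == dynamic_rank" specialization below: N0 is a member while
 * N1 == 3 and N2 == 4 are compile-time constants.
 */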
//----------------------------------------------------------------------------
/** \brief Shape equality: shapes are equal if the scalar size, rank,
 *         and all dimensions are equal.
 */
template< unsigned xSize , unsigned xRank ,
unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
unsigned ySize , unsigned yRank ,
unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
KOKKOS_INLINE_FUNCTION
bool operator == ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
{
enum { same_size = xSize == ySize };
enum { same_rank = xRank == yRank };
return same_size && same_rank &&
size_t( x.N0 ) == size_t( y.N0 ) &&
unsigned( x.N1 ) == unsigned( y.N1 ) &&
unsigned( x.N2 ) == unsigned( y.N2 ) &&
unsigned( x.N3 ) == unsigned( y.N3 ) &&
unsigned( x.N4 ) == unsigned( y.N4 ) &&
unsigned( x.N5 ) == unsigned( y.N5 ) &&
unsigned( x.N6 ) == unsigned( y.N6 ) &&
unsigned( x.N7 ) == unsigned( y.N7 ) ;
}
template< unsigned xSize , unsigned xRank ,
unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
unsigned ySize , unsigned yRank ,
unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
KOKKOS_INLINE_FUNCTION
bool operator != ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
{ return ! operator == ( x , y ); }
//----------------------------------------------------------------------------
void assert_counts_are_equal_throw(
const size_t x_count ,
const size_t y_count );
inline
void assert_counts_are_equal(
const size_t x_count ,
const size_t y_count )
{
if ( x_count != y_count ) {
assert_counts_are_equal_throw( x_count , y_count );
}
}
void assert_shapes_are_equal_throw(
const unsigned x_scalar_size ,
const unsigned x_rank ,
const size_t x_N0 , const unsigned x_N1 ,
const unsigned x_N2 , const unsigned x_N3 ,
const unsigned x_N4 , const unsigned x_N5 ,
const unsigned x_N6 , const unsigned x_N7 ,
const unsigned y_scalar_size ,
const unsigned y_rank ,
const size_t y_N0 , const unsigned y_N1 ,
const unsigned y_N2 , const unsigned y_N3 ,
const unsigned y_N4 , const unsigned y_N5 ,
const unsigned y_N6 , const unsigned y_N7 );
template< unsigned xSize , unsigned xRank ,
unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
unsigned ySize , unsigned yRank ,
unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
inline
void assert_shapes_are_equal(
const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
{
typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> x_type ;
typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> y_type ;
if ( x != y ) {
assert_shapes_are_equal_throw(
x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7,
y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 );
}
}
template< unsigned xSize , unsigned xRank ,
unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
unsigned ySize , unsigned yRank ,
unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
void assert_shapes_equal_dimension(
const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
{
typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> x_type ;
typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> y_type ;
// Omit comparison of scalar_size.
if ( unsigned( x.rank ) != unsigned( y.rank ) ||
size_t( x.N0 ) != size_t( y.N0 ) ||
unsigned( x.N1 ) != unsigned( y.N1 ) ||
unsigned( x.N2 ) != unsigned( y.N2 ) ||
unsigned( x.N3 ) != unsigned( y.N3 ) ||
unsigned( x.N4 ) != unsigned( y.N4 ) ||
unsigned( x.N5 ) != unsigned( y.N5 ) ||
unsigned( x.N6 ) != unsigned( y.N6 ) ||
unsigned( x.N7 ) != unsigned( y.N7 ) ) {
assert_shapes_are_equal_throw(
x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7,
y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 );
}
}
//----------------------------------------------------------------------------
template< class ShapeType > struct assert_shape_is_rank_zero ;
template< class ShapeType > struct assert_shape_is_rank_one ;
template< unsigned Size >
struct assert_shape_is_rank_zero< Shape<Size,0> >
: public true_type {};
template< unsigned Size , unsigned s0 >
struct assert_shape_is_rank_one< Shape<Size,1,s0> >
: public true_type {};
//----------------------------------------------------------------------------
/** \brief Array bounds assertion templated on the execution space
* to allow device-specific abort code.
*/
template< class Space >
struct AssertShapeBoundsAbort ;
template<>
struct AssertShapeBoundsAbort< Kokkos::HostSpace >
{
static void apply( const size_t rank ,
const size_t n0 , const size_t n1 ,
const size_t n2 , const size_t n3 ,
const size_t n4 , const size_t n5 ,
const size_t n6 , const size_t n7 ,
const size_t arg_rank ,
const size_t i0 , const size_t i1 ,
const size_t i2 , const size_t i3 ,
const size_t i4 , const size_t i5 ,
const size_t i6 , const size_t i7 );
};
template< class ExecutionSpace >
struct AssertShapeBoundsAbort
{
KOKKOS_INLINE_FUNCTION
static void apply( const size_t rank ,
const size_t n0 , const size_t n1 ,
const size_t n2 , const size_t n3 ,
const size_t n4 , const size_t n5 ,
const size_t n6 , const size_t n7 ,
const size_t arg_rank ,
const size_t i0 , const size_t i1 ,
const size_t i2 , const size_t i3 ,
const size_t i4 , const size_t i5 ,
const size_t i6 , const size_t i7 )
{
AssertShapeBoundsAbort< Kokkos::HostSpace >
::apply( rank , n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 ,
arg_rank, i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
}
};
template< class ShapeType >
KOKKOS_INLINE_FUNCTION
void assert_shape_bounds( const ShapeType & shape ,
const size_t arg_rank ,
const size_t i0 ,
const size_t i1 = 0 ,
const size_t i2 = 0 ,
const size_t i3 = 0 ,
const size_t i4 = 0 ,
const size_t i5 = 0 ,
const size_t i6 = 0 ,
const size_t i7 = 0 )
{
// Must supply at least as many indices as ranks.
// Every index must be within bounds.
const bool ok = ShapeType::rank <= arg_rank &&
i0 < shape.N0 &&
i1 < shape.N1 &&
i2 < shape.N2 &&
i3 < shape.N3 &&
i4 < shape.N4 &&
i5 < shape.N5 &&
i6 < shape.N6 &&
i7 < shape.N7 ;
if ( ! ok ) {
AssertShapeBoundsAbort< Kokkos::Impl::ActiveExecutionMemorySpace >
::apply( ShapeType::rank ,
shape.N0 , shape.N1 , shape.N2 , shape.N3 ,
shape.N4 , shape.N5 , shape.N6 , shape.N7 ,
arg_rank , i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
}
}
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) assert_shape_bounds(S,1,I0);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) assert_shape_bounds(S,2,I0,I1);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) assert_shape_bounds(S,3,I0,I1,I2);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) assert_shape_bounds(S,4,I0,I1,I2,I3);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) assert_shape_bounds(S,5,I0,I1,I2,I3,I4);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) assert_shape_bounds(S,6,I0,I1,I2,I3,I4,I5);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) assert_shape_bounds(S,7,I0,I1,I2,I3,I4,I5,I6);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) assert_shape_bounds(S,8,I0,I1,I2,I3,I4,I5,I6,I7);
#else
#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) /* */
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Specialization and optimization for the Rank 0 shape.
template < unsigned ScalarSize >
struct Shape< ScalarSize , 0, 1,1,1,1, 1,1,1,1 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 0 };
enum { rank = 0 };
enum { N0 = 1 };
enum { N1 = 1 };
enum { N2 = 1 };
enum { N3 = 1 };
enum { N4 = 1 };
enum { N5 = 1 };
enum { N6 = 1 };
enum { N7 = 1 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{}
};
//----------------------------------------------------------------------------
template< unsigned R > struct assign_shape_dimension ;
#define KOKKOS_ASSIGN_SHAPE_DIMENSION( R ) \
template<> \
struct assign_shape_dimension< R > \
{ \
template< class ShapeType > \
KOKKOS_INLINE_FUNCTION \
assign_shape_dimension( ShapeType & shape \
, typename Impl::enable_if<( R < ShapeType::rank_dynamic ), size_t >::type n \
) { shape.N ## R = n ; } \
};
KOKKOS_ASSIGN_SHAPE_DIMENSION(0)
KOKKOS_ASSIGN_SHAPE_DIMENSION(1)
KOKKOS_ASSIGN_SHAPE_DIMENSION(2)
KOKKOS_ASSIGN_SHAPE_DIMENSION(3)
KOKKOS_ASSIGN_SHAPE_DIMENSION(4)
KOKKOS_ASSIGN_SHAPE_DIMENSION(5)
KOKKOS_ASSIGN_SHAPE_DIMENSION(6)
KOKKOS_ASSIGN_SHAPE_DIMENSION(7)
#undef KOKKOS_ASSIGN_SHAPE_DIMENSION
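// For reference, KOKKOS_ASSIGN_SHAPE_DIMENSION(2) expands to:
//
//   template<>
//   struct assign_shape_dimension<2>
//   {
//     template< class ShapeType >
//     KOKKOS_INLINE_FUNCTION
//     assign_shape_dimension( ShapeType & shape
//       , typename Impl::enable_if<( 2 < ShapeType::rank_dynamic ), size_t >::type n
//       ) { shape.N2 = n ; }
//   };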
//----------------------------------------------------------------------------
// All-static dimension array
template < unsigned ScalarSize ,
unsigned Rank ,
unsigned s0 ,
unsigned s1 ,
unsigned s2 ,
unsigned s3 ,
unsigned s4 ,
unsigned s5 ,
unsigned s6 ,
unsigned s7 >
struct Shape {
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 0 };
enum { rank = Rank };
enum { N0 = s0 };
enum { N1 = s1 };
enum { N2 = s2 };
enum { N3 = s3 };
enum { N4 = s4 };
enum { N5 = s5 };
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{}
};
// 1 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize ,
unsigned Rank ,
unsigned s1 ,
unsigned s2 ,
unsigned s3 ,
unsigned s4 ,
unsigned s5 ,
unsigned s6 ,
unsigned s7 >
struct Shape< ScalarSize , Rank , 0,s1,s2,s3, s4,s5,s6,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 1 };
enum { rank = Rank };
size_t N0 ; // For rank_dynamic == 1, N0 is stored as size_t so it may exceed 2^32
enum { N1 = s1 };
enum { N2 = s2 };
enum { N3 = s3 };
enum { N4 = s4 };
enum { N5 = s5 };
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
size_t n0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{ s.N0 = n0 ; }
};
// 2 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize , unsigned Rank ,
unsigned s2 ,
unsigned s3 ,
unsigned s4 ,
unsigned s5 ,
unsigned s6 ,
unsigned s7 >
struct Shape< ScalarSize , Rank , 0,0,s2,s3, s4,s5,s6,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 2 };
enum { rank = Rank };
unsigned N0 ;
unsigned N1 ;
enum { N2 = s2 };
enum { N3 = s3 };
enum { N4 = s4 };
enum { N5 = s5 };
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned = 0 , unsigned = 0 ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{ s.N0 = n0 ; s.N1 = n1 ; }
};
// 3 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize , unsigned Rank ,
unsigned s3 ,
unsigned s4 ,
unsigned s5 ,
unsigned s6 ,
unsigned s7 >
struct Shape< ScalarSize , Rank , 0,0,0,s3, s4,s5,s6,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 3 };
enum { rank = Rank };
unsigned N0 ;
unsigned N1 ;
unsigned N2 ;
enum { N3 = s3 };
enum { N4 = s4 };
enum { N5 = s5 };
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned n2 , unsigned = 0 ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{ s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; }
};
// 4 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize , unsigned Rank ,
unsigned s4 ,
unsigned s5 ,
unsigned s6 ,
unsigned s7 >
struct Shape< ScalarSize , Rank, 0,0,0,0, s4,s5,s6,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 4 };
enum { rank = Rank };
unsigned N0 ;
unsigned N1 ;
unsigned N2 ;
unsigned N3 ;
enum { N4 = s4 };
enum { N5 = s5 };
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{ s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; }
};
// 5 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize , unsigned Rank ,
unsigned s5 ,
unsigned s6 ,
unsigned s7 >
struct Shape< ScalarSize , Rank , 0,0,0,0, 0,s5,s6,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 5 };
enum { rank = Rank };
unsigned N0 ;
unsigned N1 ;
unsigned N2 ;
unsigned N3 ;
unsigned N4 ;
enum { N5 = s5 };
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
unsigned n4 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{ s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; s.N4 = n4 ; }
};
// 6 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize , unsigned Rank ,
unsigned s6 ,
unsigned s7 >
struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,s6,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 6 };
enum { rank = Rank };
unsigned N0 ;
unsigned N1 ;
unsigned N2 ;
unsigned N3 ;
unsigned N4 ;
unsigned N5 ;
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
unsigned n4 , unsigned n5 , unsigned = 0 , unsigned = 0 )
{
s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
s.N4 = n4 ; s.N5 = n5 ;
}
};
// 7 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize , unsigned Rank ,
unsigned s7 >
struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,0,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 7 };
enum { rank = Rank };
unsigned N0 ;
unsigned N1 ;
unsigned N2 ;
unsigned N3 ;
unsigned N4 ;
unsigned N5 ;
unsigned N6 ;
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
unsigned n4 , unsigned n5 , unsigned n6 , unsigned = 0 )
{
s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ;
}
};
// 8 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize >
struct Shape< ScalarSize , 8 , 0,0,0,0, 0,0,0,0 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 8 };
enum { rank = 8 };
unsigned N0 ;
unsigned N1 ;
unsigned N2 ;
unsigned N3 ;
unsigned N4 ;
unsigned N5 ;
unsigned N6 ;
unsigned N7 ;
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 )
{
s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ; s.N7 = n7 ;
}
};
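// Illustrative example (types chosen here, not from the original source):
// a rank-3 shape with two dynamic dimensions and one static dimension of 4,
//   typedef Shape< sizeof(int) , 3 , 0,0,4,1, 1,1,1,1 > S ;
//   S s ; S::assign( s , 10 , 20 ); // s.N0 == 10, s.N1 == 20, S::N2 == 4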
//----------------------------------------------------------------------------
template< class ShapeType , unsigned N ,
unsigned R = ShapeType::rank_dynamic >
struct ShapeInsert ;
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 0 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
N ,
ShapeType::N0 ,
ShapeType::N1 ,
ShapeType::N2 ,
ShapeType::N3 ,
ShapeType::N4 ,
ShapeType::N5 ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 1 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
N ,
ShapeType::N1 ,
ShapeType::N2 ,
ShapeType::N3 ,
ShapeType::N4 ,
ShapeType::N5 ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 2 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
0 ,
N ,
ShapeType::N2 ,
ShapeType::N3 ,
ShapeType::N4 ,
ShapeType::N5 ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 3 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
0 ,
0 ,
N ,
ShapeType::N3 ,
ShapeType::N4 ,
ShapeType::N5 ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 4 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
0 ,
0 ,
0 ,
N ,
ShapeType::N4 ,
ShapeType::N5 ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 5 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
0 ,
0 ,
0 ,
0 ,
N ,
ShapeType::N5 ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 6 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
0 ,
0 ,
0 ,
0 ,
0 ,
N ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 7 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
0 ,
0 ,
0 ,
0 ,
0 ,
0 ,
N > type ;
};
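// Illustrative example (names chosen here for exposition): ShapeInsert
// prepends a compile-time dimension N ahead of the existing static
// dimensions, keeping the dynamic (zero) slots in front.
//   typedef Shape< sizeof(double) , 2 , 3,4,1,1, 1,1,1,1 > S2 ;
//   typedef ShapeInsert< S2 , 5 >::type S3 ;
//   // S3 is Shape< sizeof(double) , 3 , 5,3,4,1, 1,1,1,1 >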
//----------------------------------------------------------------------------
template< class DstShape , class SrcShape ,
unsigned DstRankDynamic = DstShape::rank_dynamic ,
bool DstRankDynamicOK = unsigned(DstShape::rank_dynamic) >= unsigned(SrcShape::rank_dynamic) >
struct ShapeCompatible { enum { value = false }; };
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 8 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 7 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 6 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 5 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 4 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 3 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 2 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 1 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N1) == unsigned(SrcShape::N1) &&
unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 0 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N0) == unsigned(SrcShape::N0) &&
unsigned(DstShape::N1) == unsigned(SrcShape::N1) &&
unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
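// In summary, per the specializations above: two shapes are compatible when
// their scalar sizes agree and every compile-time dimension of the
// destination matches the corresponding source dimension; the destination's
// dynamic dimensions are assigned at runtime instead.  E.g. a destination
// with rank_dynamic == 8 is compatible with any source of equal scalar size.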
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< unsigned ScalarSize , unsigned Rank ,
unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 ,
typename iType >
KOKKOS_INLINE_FUNCTION
size_t dimension(
const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape ,
const iType & r )
{
return 0 == r ? shape.N0 : (
1 == r ? shape.N1 : (
2 == r ? shape.N2 : (
3 == r ? shape.N3 : (
4 == r ? shape.N4 : (
5 == r ? shape.N5 : (
6 == r ? shape.N6 : (
7 == r ? shape.N7 : 1 )))))));
}
template< unsigned ScalarSize , unsigned Rank ,
unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 >
KOKKOS_INLINE_FUNCTION
size_t cardinality_count(
const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape )
{
return size_t(shape.N0) * shape.N1 * shape.N2 * shape.N3 *
shape.N4 * shape.N5 * shape.N6 * shape.N7 ;
}
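// Illustrative example: for a rank-2 shape with N0 == 10 and N1 == 20,
// dimension( shape , 1 ) returns 20 and cardinality_count( shape ) returns
// 10 * 20 == 200, since all trailing dimensions are 1.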
//----------------------------------------------------------------------------
} /* namespace Impl */
} /* namespace Kokkos */
#endif /* #ifndef KOKKOS_CORESHAPE_HPP */

View File

@@ -0,0 +1,55 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_SINGLETON_HPP
#define KOKKOS_SINGLETON_HPP
#include <Kokkos_Macros.hpp>
#include <cstddef>
namespace Kokkos { namespace Impl {
}} // namespace Kokkos::Impl
#endif // KOKKOS_SINGLETON_HPP

View File

@@ -0,0 +1,79 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_STATICASSERT_HPP
#define KOKKOS_STATICASSERT_HPP
namespace Kokkos {
namespace Impl {
template < bool , class T = void >
struct StaticAssert ;
template< class T >
struct StaticAssert< true , T > {
typedef T type ;
static const bool value = true ;
};
template < class A , class B >
struct StaticAssertSame ;
template < class A >
struct StaticAssertSame<A,A> { typedef A type ; };
template < class A , class B >
struct StaticAssertAssignable ;
template < class A >
struct StaticAssertAssignable<A,A> { typedef A type ; };
template < class A >
struct StaticAssertAssignable< const A , A > { typedef const A type ; };
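// Illustrative example: StaticAssertSame< int , int >::type is int, while
// StaticAssertSame< int , long > has no definition and therefore fails to
// compile, which is the intended compile-time check.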
} // namespace Impl
} // namespace Kokkos
#endif /* KOKKOS_STATICASSERT_HPP */

View File

@@ -0,0 +1,156 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_TAGS_HPP
#define KOKKOS_TAGS_HPP
#include <impl/Kokkos_Traits.hpp>
#include <Kokkos_Core_fwd.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
//----------------------------------------------------------------------------
template<class ExecutionSpace, class MemorySpace>
struct Device {
typedef ExecutionSpace execution_space;
typedef MemorySpace memory_space;
typedef Device<execution_space,memory_space> device_type;
};
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class C , class Enable = void >
struct is_memory_space : public bool_< false > {};
template< class C , class Enable = void >
struct is_execution_space : public bool_< false > {};
template< class C , class Enable = void >
struct is_execution_policy : public bool_< false > {};
template< class C , class Enable = void >
struct is_array_layout : public Impl::false_type {};
template< class C , class Enable = void >
struct is_memory_traits : public Impl::false_type {};
template< class C >
struct is_memory_space< C , typename Impl::enable_if_type< typename C::memory_space >::type >
: public bool_< Impl::is_same< C , typename C::memory_space >::value > {};
template< class C >
struct is_execution_space< C , typename Impl::enable_if_type< typename C::execution_space >::type >
: public bool_< Impl::is_same< C , typename C::execution_space >::value > {};
template< class C >
struct is_execution_policy< C , typename Impl::enable_if_type< typename C::execution_policy >::type >
: public bool_< Impl::is_same< C , typename C::execution_policy >::value > {};
template< class C >
struct is_array_layout< C , typename Impl::enable_if_type< typename C::array_layout >::type >
: public bool_< Impl::is_same< C , typename C::array_layout >::value > {};
template< class C >
struct is_memory_traits< C , typename Impl::enable_if_type< typename C::memory_traits >::type >
: public bool_< Impl::is_same< C , typename C::memory_traits >::value > {};
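// Illustrative example: is_memory_space< HostSpace >::value is true because
// HostSpace names itself as its own memory_space typedef; a type without a
// memory_space typedef falls through to the primary template and yields false.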
//----------------------------------------------------------------------------
template< class C , class Enable = void >
struct is_space : public Impl::false_type {};
template< class C >
struct is_space< C
, typename Impl::enable_if<(
Impl::is_same< C , typename C::execution_space >::value ||
Impl::is_same< C , typename C::memory_space >::value ||
Impl::is_same< C , Device<
typename C::execution_space,
typename C::memory_space> >::value
)>::type
>
: public Impl::true_type
{
typedef typename C::execution_space execution_space ;
typedef typename C::memory_space memory_space ;
// The host_memory_space names a host-resident memory space.
// If this space's memory space is host accessible then use that memory space,
// otherwise use HostSpace.
typedef
typename Impl::if_c< Impl::is_same< memory_space , HostSpace >::value
#ifdef KOKKOS_HAVE_CUDA
|| Impl::is_same< memory_space , CudaUVMSpace>::value
|| Impl::is_same< memory_space , CudaHostPinnedSpace>::value
#endif
, memory_space , HostSpace >::type
host_memory_space ;
// The host_execution_space names an execution space that can access HostSpace.
// If this space's execution space can access HostSpace then use it directly,
// otherwise use DefaultHostExecutionSpace.
#ifdef KOKKOS_HAVE_CUDA
typedef
typename Impl::if_c< Impl::is_same< execution_space , Cuda >::value
, DefaultHostExecutionSpace , execution_space >::type
host_execution_space ;
#else
typedef execution_space host_execution_space;
#endif
typedef Device<host_execution_space,host_memory_space> host_mirror_space;
};
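// Illustrative example: for C = Kokkos::Cuda the memory space is CudaSpace,
// so host_memory_space resolves to HostSpace and host_execution_space to
// DefaultHostExecutionSpace; for a host-resident space both aliases collapse
// to the space's own execution and memory space.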
}
}
#endif /* KOKKOS_TAGS_HPP */

View File

@@ -0,0 +1,115 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPLWALLTIME_HPP
#define KOKKOS_IMPLWALLTIME_HPP
#include <stddef.h>
#ifdef _MSC_VER
#undef KOKKOS_USE_LIBRT
#include <gettimeofday.c>
#else
#ifdef KOKKOS_USE_LIBRT
#include <ctime>
#else
#include <sys/time.h>
#endif
#endif
namespace Kokkos {
namespace Impl {
/** \brief Time since construction */
class Timer {
private:
#ifdef KOKKOS_USE_LIBRT
struct timespec m_old;
#else
struct timeval m_old ;
#endif
Timer( const Timer & );
Timer & operator = ( const Timer & );
public:
inline
void reset() {
#ifdef KOKKOS_USE_LIBRT
clock_gettime(CLOCK_REALTIME, &m_old);
#else
gettimeofday( & m_old , ((struct timezone *) NULL ) );
#endif
}
inline
~Timer() {}
inline
Timer() { reset(); }
inline
double seconds() const
{
#ifdef KOKKOS_USE_LIBRT
struct timespec m_new;
clock_gettime(CLOCK_REALTIME, &m_new);
return ( (double) ( m_new.tv_sec - m_old.tv_sec ) ) +
( (double) ( m_new.tv_nsec - m_old.tv_nsec ) * 1.0e-9 );
#else
struct timeval m_new ;
::gettimeofday( & m_new , ((struct timezone *) NULL ) );
return ( (double) ( m_new.tv_sec - m_old.tv_sec ) ) +
( (double) ( m_new.tv_usec - m_old.tv_usec ) * 1.0e-6 );
#endif
}
};
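// Illustrative usage sketch (do_work is a placeholder, not part of this header):
//   Kokkos::Impl::Timer timer;         // clock starts at construction
//   do_work();
//   double elapsed = timer.seconds();  // wall-clock seconds since last reset
//   timer.reset();                     // restart the clock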
} // namespace Impl
} // namespace Kokkos
#endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */