git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@13581 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2015-07-14 19:55:01 +00:00
parent ca4fa347b3
commit 26b23a47cd
159 changed files with 0 additions and 67610 deletions
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@ -1,237 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CUDAEXEC_HPP
-#define KOKKOS_CUDAEXEC_HPP
-
-#include <string>
-#include <Kokkos_Parallel.hpp>
-#include <impl/Kokkos_Error.hpp>
-#include <Cuda/Kokkos_Cuda_abort.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Compile-time CUDA device constants plus warp-rounding helpers. */
-struct CudaTraits {
-  // Warp geometry: WarpSize == WarpIndexMask + 1 == 1 << WarpIndexShift.
-  enum { WarpSize       = 32      /* 0x0020 */ };
-  enum { WarpIndexMask  = 0x001f  /* Mask for warpindex */ };
-  enum { WarpIndexShift = 5       /* WarpSize == 1 << WarpShift */ };
-
-  // Shared memory configuration.
-  enum { SharedMemoryBanks    = 32      /* Compute device 2.0 */ };
-  enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
-  enum { SharedMemoryUsage    = 0x04000 /* 16k shared / 48k L1 Cache */ };
-
-  // Grid and constant memory limits.
-  enum { UpperBoundGridCount    = 65535 /* Hard upper bound */ };
-  enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ };
-  enum { ConstantMemoryUsage    = 0x008000 /* 32k bytes */ };
-  enum { ConstantMemoryCache    = 0x002000 /*  8k bytes */ };
-
-  // Array type spanning ConstantMemoryUsage bytes, in units of unsigned long.
-  typedef unsigned long
-    ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];
-
-  // Default cut-over for the 'Large' parameter of CudaParallelLaunch.
-  enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
-
-  /** \brief  Number of warps needed to hold 'i' items (rounds up). */
-  KOKKOS_INLINE_FUNCTION static
-  CudaSpace::size_type warp_count( CudaSpace::size_type i )
-    {
-      const CudaSpace::size_type padded = i + WarpIndexMask ;
-      return padded >> WarpIndexShift ;
-    }
-
-  /** \brief  Round 'i' up to the next multiple of WarpSize. */
-  KOKKOS_INLINE_FUNCTION static
-  CudaSpace::size_type warp_align( CudaSpace::size_type i )
-    {
-      // Clearing the low WarpIndexShift bits of ( i + WarpSize - 1 ) rounds up.
-      const CudaSpace::size_type keep_high_bits = ~CudaSpace::size_type( WarpIndexMask );
-      const CudaSpace::size_type padded         = i + WarpIndexMask ;
-      return padded & keep_high_bits ;
-    }
-};
-
-//----------------------------------------------------------------------------
-
-// Internal device-limit queries. Only the declarations are visible in this
-// header.
-// NOTE(review): the names suggest per-device maxima (warps, grid size, shared
-// memory words) -- confirm the exact units against the definitions.
-CudaSpace::size_type cuda_internal_maximum_warp_count();
-CudaSpace::size_type cuda_internal_maximum_grid_count();
-CudaSpace::size_type cuda_internal_maximum_shared_words();
-
-// Internal scratch-buffer accessors taking a requested 'size'.
-// NOTE(review): presumably each returns a buffer of at least 'size' (units and
-// ownership are not visible here) -- confirm against the definitions.
-CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
-CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
-CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#if defined( __CUDACC__ )
-
-/** \brief  Access to constant memory on the device
- *
- *  Buffer of type CudaTraits::ConstantGlobalBufferType placed in
- *  __constant__ memory.  CudaParallelLaunch< DriverType , true > copies the
- *  functor into it with cudaMemcpyToSymbol and the kernel
- *  cuda_parallel_launch_constant_memory reads the functor back from it.
- *
- *  When KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE is defined this is only an
- *  'extern' declaration, so the definition must come from another
- *  translation unit.
- */
-#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
-extern
-#endif
-__device__ __constant__
-Kokkos::Impl::CudaTraits::ConstantGlobalBufferType
-kokkos_impl_cuda_constant_memory_buffer ;
-
-/** \brief  Start of the dynamically sized __shared__ array, viewed as T*.
- *
- *  The array is declared with element type Kokkos::CudaSpace::size_type and
- *  is reinterpreted; its size is whatever the kernel launch requested.
- */
-template< typename T >
-inline
-__device__
-T * kokkos_impl_cuda_shared_memory()
-{
-  extern __shared__ Kokkos::CudaSpace::size_type sh[];
-  return reinterpret_cast< T * >( sh );
-}
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-// See section B.17 of Cuda C Programming Guide Version 3.2
-// for discussion of
-//   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
-// function qualifier which could be used to improve performance.
-//----------------------------------------------------------------------------
-// Maximize L1 cache and minimize shared memory:
-//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
-// For 2.0 capability: 48 KB L1 and 16 KB shared
-//----------------------------------------------------------------------------
-
-/** \brief  Kernel that runs a functor stored in constant memory.
- *
- *  Reinterprets kokkos_impl_cuda_constant_memory_buffer as a DriverType and
- *  invokes its call operator; the host must have copied the functor into the
- *  buffer before launching.
- */
-template< class DriverType >
-__global__
-static void cuda_parallel_launch_constant_memory()
-{
-  const DriverType * const driver_ptr =
-    reinterpret_cast< const DriverType * >( kokkos_impl_cuda_constant_memory_buffer );
-
-  (*driver_ptr)();
-}
-
-/** \brief  Kernel that runs a functor passed by value as the kernel argument.
- *
- *  Used by CudaParallelLaunch< DriverType , false >.
- */
-template< class DriverType >
-__global__
-static void cuda_parallel_launch_local_memory( const DriverType driver )
-{
-  driver();
-}
-
-/** \brief  Launch helper, specialized on functor size.
- *
- *  'Large' defaults to true when sizeof(DriverType) exceeds
- *  CudaTraits::ConstantMemoryUseThreshold, selecting the specialization that
- *  stages the functor in constant memory; otherwise the functor is passed
- *  as a kernel argument.
- */
-template < class DriverType ,
-           bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
-struct CudaParallelLaunch ;
-
-/** \brief  Launch path for large functors: stage the functor in device
- *          constant memory, then launch cuda_parallel_launch_constant_memory.
- *
- *  Nothing is launched when grid.x or the block's thread count is zero.
- *  Failures are reported through Kokkos::Impl::throw_runtime_exception when
- *  the functor does not fit in the constant buffer, when 'shmem' exceeds
- *  CudaTraits::SharedMemoryCapacity, or when the copy to constant memory
- *  fails.
- */
-template < class DriverType >
-struct CudaParallelLaunch< DriverType , true > {
-
-  inline
-  CudaParallelLaunch( const DriverType & driver
-                    , const dim3       & grid
-                    , const dim3       & block
-                    , const int          shmem
-                    , const cudaStream_t stream = 0 )
-  {
-    if ( grid.x && ( block.x * block.y * block.z ) ) {
-
-      if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
-           sizeof( DriverType ) ) {
-        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
-      }
-
-      // The cache configuration is only a preference, so its status is not checked.
-      if ( CudaTraits::SharedMemoryCapacity < shmem ) {
-        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
-      }
-      else if ( shmem ) {
-        cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared );
-      } else {
-        cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 );
-      }
-
-      // Copy functor to constant memory on the device.
-      // A failed copy must not be ignored: the kernel would otherwise run
-      // whatever bytes were previously left in the constant buffer.
-      const cudaError_t copy_status =
-        cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
-
-      if ( cudaSuccess != copy_status ) {
-        Kokkos::Impl::throw_runtime_exception(
-          std::string("CudaParallelLaunch FAILED: copy of functor to constant memory: ") +
-          std::string( cudaGetErrorString( copy_status ) ) );
-      }
-
-      // Invoke the driver function on the device
-      cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>();
-
-#if defined( KOKKOS_EXPRESSION_CHECK )
-      Kokkos::Cuda::fence();
-#endif
-    }
-  }
-};
-
-/** \brief  Launch path for small functors: the functor is passed by value
- *          as the argument of cuda_parallel_launch_local_memory.
- *
- *  Nothing is launched when grid.x or the block's thread count is zero.
- *  A 'shmem' request above CudaTraits::SharedMemoryCapacity is reported
- *  through Kokkos::Impl::throw_runtime_exception.
- */
-template < class DriverType >
-struct CudaParallelLaunch< DriverType , false > {
-
-  inline
-  CudaParallelLaunch( const DriverType & driver
-                    , const dim3       & grid
-                    , const dim3       & block
-                    , const int          shmem
-                    , const cudaStream_t stream = 0 )
-  {
-    if ( grid.x && ( block.x * block.y * block.z ) ) {
-
-      // The cache preference must be set on the kernel that is actually
-      // launched below (the local-memory kernel, not the constant-memory one).
-      if ( CudaTraits::SharedMemoryCapacity < shmem ) {
-        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
-      }
-      else if ( shmem ) {
-        cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared );
-      } else {
-        cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 );
-      }
-
-      // Invoke the driver function on the device, functor passed by value
-      cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
-
-#if defined( KOKKOS_EXPRESSION_CHECK )
-      Kokkos::Cuda::fence();
-#endif
-    }
-  }
-};
-
-//----------------------------------------------------------------------------
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* defined( __CUDACC__ ) */
-
-#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@ -1,591 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <stdlib.h>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <Kokkos_Macros.hpp>
-
-/* only compile this file if CUDA is enabled for Kokkos */
-#ifdef KOKKOS_HAVE_CUDA
-
-#include <Kokkos_Cuda.hpp>
-#include <Kokkos_CudaSpace.hpp>
-
-#include <Cuda/Kokkos_Cuda_Internal.hpp>
-#include <impl/Kokkos_MemoryTracking.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-// Every DeepCopy constructor copies 'n' bytes from 'src' to 'dst' with
-// cudaMemcpyDefault.  The overloads taking a Cuda instance use
-// cudaMemcpyAsync on that instance's m_stream; the others use cudaMemcpy.
-
-// CudaSpace -> CudaSpace
-DeepCopy<CudaSpace,CudaSpace>::DeepCopy( void * dst , const void * src , size_t n )
-{
-  CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
-}
-
-DeepCopy<CudaSpace,CudaSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
-{
-  CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.m_stream ) );
-}
-
-// CudaSpace -> HostSpace
-DeepCopy<HostSpace,CudaSpace>::DeepCopy( void * dst , const void * src , size_t n )
-{
-  CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
-}
-
-DeepCopy<HostSpace,CudaSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
-{
-  CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.m_stream ) );
-}
-
-// HostSpace -> CudaSpace
-DeepCopy<CudaSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
-{
-  CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
-}
-
-DeepCopy<CudaSpace,HostSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
-{
-  CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.m_stream ) );
-}
-
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-class CudaMemoryTracking {
-public:
-
-  enum SpaceTag { CudaSpaceTag , CudaUVMSpaceTag , CudaHostPinnedSpaceTag };
-
-  /** \brief  Per-allocation attribute: a texture object created on first
-   *          request plus a flag recording whether it currently exists.
-   */
-  struct Attribute {
-
-    Kokkos::Impl::cuda_texture_object_type m_tex_obj ;
-    int                                    m_tex_flag ;
-
-    Attribute() : m_tex_obj(0), m_tex_flag(0) {}
-
-    /** Destroys the texture object only if create() succeeded earlier. */
-    ~Attribute()
-      {
-        if ( 0 == m_tex_flag ) { return ; }
-
-        cudaDestroyTextureObject( m_tex_obj );
-        m_tex_obj  = 0 ;
-        m_tex_flag = 0 ;
-      }
-
-    /** \brief  Create a linear-memory texture object over the allocation.
-     *
-     *  Does nothing and returns cudaSuccess when the object already exists.
-     *  The device is synchronized before and after the creation; the first
-     *  failing CUDA status is returned and the flag is left unset.
-     */
-    cudaError create( void * const                  arg_alloc_ptr
-                    , size_t const                  arg_byte_size
-                    , cudaChannelFormatDesc const & arg_desc
-                    )
-    {
-      if ( m_tex_flag ) { return cudaSuccess ; }
-
-      cudaError status = cudaDeviceSynchronize();
-
-      if ( cudaSuccess != status ) { return status ; }
-
-      struct cudaResourceDesc resDesc ;
-      struct cudaTextureDesc  texDesc ;
-
-      memset( & resDesc , 0 , sizeof(resDesc) );
-      memset( & texDesc , 0 , sizeof(texDesc) );
-
-      resDesc.resType                = cudaResourceTypeLinear ;
-      resDesc.res.linear.desc        = arg_desc ;
-      resDesc.res.linear.sizeInBytes = arg_byte_size ;
-      resDesc.res.linear.devPtr      = arg_alloc_ptr ;
-
-      status = cudaCreateTextureObject( & m_tex_obj , & resDesc, & texDesc, NULL);
-
-      if ( cudaSuccess != status ) { return status ; }
-
-      status = cudaDeviceSynchronize();
-
-      if ( cudaSuccess != status ) { return status ; }
-
-      m_tex_flag = 1 ;
-
-      return cudaSuccess ;
-    }
-  };
-
-  typedef          Kokkos::Impl::MemoryTracking< Attribute >         tracking_type ;
-  typedef typename Kokkos::Impl::MemoryTracking< Attribute >::Entry  entry_type ;
-
-  bool available() const
-    {
-#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION )
-      enum { UVM_available = true };
-#else
-      enum { UVM_available = false };
-#endif
-
-      return ( m_space_tag != CudaUVMSpaceTag ) || UVM_available ;
-    }
-
-private:
-
-  tracking_type   m_tracking ;
-  SpaceTag const  m_space_tag ;
-
-
-  /** \brief  Allocate 'byte_size' bytes with the CUDA allocator that matches
-   *          m_space_tag and return that call's status.
-   *
-   *  For the UVM tag without CUDA 6.0 or newer, the failure is reported through
-   *  Kokkos::Impl::throw_runtime_exception instead.
-   */
-  cudaError cuda_malloc( void ** ptr , size_t byte_size ) const
-    {
-      switch( m_space_tag ) {
-      case CudaSpaceTag :
-        return cudaMalloc( ptr , byte_size );
-      case CudaUVMSpaceTag :
-#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION )
-        return cudaMallocManaged( ptr, byte_size, cudaMemAttachGlobal );
-#else
-        Kokkos::Impl::throw_runtime_exception( std::string("CUDA VERSION does not support UVM") );
-        break ;
-#endif
-      case CudaHostPinnedSpaceTag :
-        return cudaHostAlloc( ptr , byte_size , cudaHostAllocDefault );
-      }
-
-      // No allocator was called.
-      return cudaSuccess ;
-    }
-
-  /** \brief  Release 'ptr' with the CUDA free call that matches m_space_tag:
-   *          cudaFreeHost for the host-pinned tag, cudaFree for the device
-   *          and UVM tags.
-   */
-  cudaError cuda_free( void * ptr ) const
-    {
-      if ( CudaHostPinnedSpaceTag == m_space_tag ) {
-        return cudaFreeHost( ptr );
-      }
-
-      if ( ( CudaSpaceTag == m_space_tag ) || ( CudaUVMSpaceTag == m_space_tag ) ) {
-        return cudaFree( ptr );
-      }
-
-      // Unrecognized tag: nothing is freed.
-      return cudaSuccess ;
-    }
-
-public :
-
-  /** \brief  Bind this tracker to one CUDA memory space.
-   *
-   *  \param arg_tag    Selects the allocate/free calls used by cuda_malloc and cuda_free.
-   *  \param arg_label  Label handed to the underlying MemoryTracking object.
-   */
-  CudaMemoryTracking( const SpaceTag arg_tag , const char * const arg_label )
-    : m_tracking(  arg_label )
-    , m_space_tag( arg_tag )
-    {}
-
-  /** \brief  Forward to the tracking object's print( oss , lead ). */
-  void print( std::ostream & oss , const std::string & lead ) const
-    { m_tracking.print( oss , lead ); }
-
-  /** \brief  Label of the tracked entry found for 'ptr', or "<NOT FOUND>"
-   *          when the tracking query returns no entry.
-   */
-  const char * query_label( const void * ptr ) const
-    {
-      static const char not_found[] = "<NOT FOUND>" ;
-
-      entry_type * const found = m_tracking.query( ptr );
-
-      return found ? found->label() : not_found ;
-    }
-
-  /** \brief  count() of the tracked entry found for 'ptr', or 0 when the
-   *          tracking query returns no entry.
-   */
-  int count(const void * ptr) const {
-    entry_type * const found = m_tracking.query( ptr );
-
-    if ( found ) { return found->count(); }
-
-    return 0 ;
-  }
-
-  /** \brief  Allocate 'byte_size' bytes in this space and register them
-   *          with the tracker under 'label'.
-   *
-   *  A zero-byte request returns a null pointer without touching CUDA or the
-   *  tracker.  The device is synchronized before and after the allocation.
-   *  Calling from inside a host parallel region, or any CUDA failure, is
-   *  reported through Kokkos::Impl::throw_runtime_exception.
-   */
-  void * allocate( const std::string & label , const size_t byte_size )
-  {
-    void * ptr = 0 ;
-
-    if ( 0 == byte_size ) { return ptr ; }
-
-    const bool in_parallel = HostSpace::in_parallel();
-
-    cudaError cuda_status = cudaSuccess ;
-
-    if ( ! in_parallel ) {
-
-      cuda_status = cudaDeviceSynchronize();
-
-      if ( cudaSuccess == cuda_status ) { cuda_status = cuda_malloc( & ptr , byte_size ); }
-      if ( cudaSuccess == cuda_status ) { cuda_status = cudaDeviceSynchronize(); }
-    }
-
-    if ( in_parallel || ( cudaSuccess != cuda_status ) ) {
-      std::ostringstream msg ;
-      msg << m_tracking.label()
-          << "::allocate( "
-          << label
-          << " , " << byte_size
-          << " ) FAILURE : " ;
-      if ( in_parallel ) {
-        msg << "called within a parallel functor" ;
-      }
-      else {
-        msg << " CUDA ERROR \"" << cudaGetErrorString(cuda_status) << "\"" ;
-      }
-      Kokkos::Impl::throw_runtime_exception( msg.str() );
-    }
-    else {
-      m_tracking.insert( label , ptr , byte_size );
-    }
-
-    return ptr ;
-  }
-
-  /** \brief  Decrement the tracked entry for 'ptr'; when the tracker hands
-   *          back an allocation pointer, free it with cuda_free.
-   *
-   *  The device is synchronized before the tracker update and again after a
-   *  free.  Failures (called inside a host parallel region, or a CUDA error)
-   *  are written to std::cerr rather than thrown.
-   */
-  void decrement( const void * ptr )
-  {
-    const bool in_parallel = HostSpace::in_parallel();
-
-    cudaError cuda_status = cudaSuccess ;
-
-    if ( ! in_parallel ) {
-
-      cuda_status = cudaDeviceSynchronize();
-
-      if ( cudaSuccess == cuda_status ) {
-
-        void * const alloc_ptr = m_tracking.decrement( ptr );
-
-        if ( alloc_ptr ) {
-          cuda_status = cuda_free( alloc_ptr );
-
-          if ( cudaSuccess == cuda_status ) { cuda_status = cudaDeviceSynchronize(); }
-        }
-      }
-    }
-
-    if ( in_parallel || ( cudaSuccess != cuda_status ) ) {
-      std::ostringstream msg ;
-      msg << m_tracking.label() << "::decrement( " << ptr << " ) FAILURE : " ;
-      if ( in_parallel ) {
-        msg << "called within a parallel functor" ;
-      }
-      else {
-        msg << " CUDA ERROR \"" << cudaGetErrorString(cuda_status) << "\"" ;
-      }
-      std::cerr << msg.str() << std::endl ;
-    }
-  }
-
-  /** \brief  Increment the tracked entry for 'ptr'.
-   *
-   *  Calling from inside a host parallel region is reported through
-   *  Kokkos::Impl::throw_runtime_exception and the tracker is not touched.
-   */
-  void increment( const void * ptr )
-    {
-      if ( HostSpace::in_parallel() ) {
-        std::ostringstream msg ;
-        msg << m_tracking.label() << "::increment(" << ptr
-            << ") FAILURE :called within a parallel functor" ;
-        Kokkos::Impl::throw_runtime_exception( msg.str() );
-      }
-      else {
-        m_tracking.increment( ptr );
-      }
-    }
-
-
-  /** \brief  Find the tracked allocation for 'arg_ptr' and attach a texture
-   *          object to it, creating the object on first use.
-   *
-   *  All three outputs are zeroed first; a null 'arg_ptr' leaves them zero.
-   *  On success:
-   *    *arg_tex_obj   = texture object stored with the allocation
-   *    *arg_alloc_ptr = base pointer of the allocation
-   *    *arg_offset    = ( arg_ptr - base ) in units of arg_type_size
-   *
-   *  Reported through Kokkos::Impl::throw_runtime_exception:
-   *  device architecture below 300, call from inside a host parallel region,
-   *  untracked pointer, offset not a multiple of arg_type_size, allocation
-   *  with 2^28 or more elements, or a CUDA error while creating the object.
-   */
-  inline
-  void texture_object_attach( const void * const            arg_ptr
-                            , const unsigned                arg_type_size
-                            , const cudaChannelFormatDesc & arg_desc
-                            , ::cudaTextureObject_t * const arg_tex_obj
-                            , void const           ** const arg_alloc_ptr
-                            , int                   * const arg_offset
-                            )
-    {
-      static const size_t max_array_len = 1 << 28 ;
-
-      *arg_tex_obj   = 0 ;
-      *arg_alloc_ptr = 0 ;
-      *arg_offset    = 0 ;
-
-      if ( arg_ptr ) {
-
-        // Can only create texture object on device architecture 3.0 or better
-        const bool ok_dev_arch = 300 <= Cuda::device_arch();
-        const bool ok_parallel = ok_dev_arch && ! HostSpace::in_parallel();
-
-        entry_type * const entry = ok_parallel ? m_tracking.query( arg_ptr ) : (entry_type *) 0 ;
-
-        const size_t offset = entry ? ( reinterpret_cast<const char*>(arg_ptr) -
-                                        reinterpret_cast<const char*>(entry->m_alloc_ptr) ) : 0 ;
-
-        // Each check implies the previous ones, so ok_count means every
-        // precondition holds and 'entry' is non-null.
-        const bool ok_offset = entry     && ( 0 == ( offset % arg_type_size ) );
-        const bool ok_count  = ok_offset && ( entry->m_alloc_size / arg_type_size < max_array_len );
-
-        cudaError cuda_status = cudaSuccess ;
-
-        if ( ok_count ) {
-          cuda_status = entry->m_attribute.create( entry->m_alloc_ptr , entry->m_alloc_size , arg_desc );
-        }
-
-        // Success requires the preconditions as well as the CUDA status:
-        // cuda_status is still cudaSuccess when create() was never called,
-        // and 'entry' may be null in that case.
-        if ( ok_count && ( cudaSuccess == cuda_status ) ) {
-          *arg_tex_obj   = entry->m_attribute.m_tex_obj ;
-          *arg_alloc_ptr = entry->m_alloc_ptr ;
-          *arg_offset    = offset / arg_type_size ;
-        }
-        else {
-          std::ostringstream msg ;
-          msg << m_tracking.label()
-              << "::texture_object_attach(" << arg_ptr << ") FAILED :" ;
-          if ( ! ok_dev_arch ) {
-            msg << " cuda architecture " << Cuda::device_arch()
-                << " does not support texture objects" ;
-          }
-          else if ( ! ok_parallel ) {
-            msg << " called within a parallel functor" ;
-          }
-          else if ( 0 == entry ) {
-            msg << " pointer not tracked" ;
-          }
-          else if ( ! ok_offset ) {
-            msg << " pointer not properly aligned" ;
-          }
-          else if ( ! ok_count ) {
-            msg << " array too large for texture object" ;
-          }
-          else {
-            msg << " CUDA ERROR \"" << cudaGetErrorString(cuda_status) << "\"" ;
-          }
-          Kokkos::Impl::throw_runtime_exception( msg.str() );
-        }
-      }
-    }
-};
-
-//----------------------------------------------------------------------------
-
-/** \brief  Tracker for Kokkos::CudaSpace, a function-local static. */
-CudaMemoryTracking &
-cuda_space_singleton()
-{
-  static CudaMemoryTracking tracker( CudaMemoryTracking::CudaSpaceTag , "Kokkos::CudaSpace");
-
-  return tracker ;
-}
-
-/** \brief  Tracker for Kokkos::CudaUVMSpace, a function-local static. */
-CudaMemoryTracking &
-cuda_uvm_space_singleton()
-{
-  static CudaMemoryTracking tracker( CudaMemoryTracking::CudaUVMSpaceTag , "Kokkos::CudaUVMSpace");
-
-  return tracker ;
-}
-
-/** \brief  Tracker for Kokkos::CudaHostPinnedSpace, a function-local static. */
-CudaMemoryTracking &
-cuda_host_pinned_space_singleton()
-{
-  static CudaMemoryTracking tracker( CudaMemoryTracking::CudaHostPinnedSpaceTag , "Kokkos::CudaHostPinnedSpace");
-
-  return tracker ;
-}
-
-}
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-// CudaSpace: every member forwards to the CudaSpace tracking singleton.
-namespace Kokkos {
-
-/** Allocate 'size' bytes tracked under 'label'. */
-void * CudaSpace::allocate( const std::string & label , const size_t size )
-{
-  return Impl::cuda_space_singleton().allocate( label , size );
-}
-
-/** Forward a decrement for 'ptr' to the tracker. */
-void CudaSpace::decrement( const void * ptr )
-{
-  Impl::cuda_space_singleton().decrement( ptr );
-}
-
-/** Forward an increment for 'ptr' to the tracker. */
-void CudaSpace::increment( const void * ptr )
-{
-  Impl::cuda_space_singleton().increment( ptr );
-}
-
-/** Print the tracker contents to 'oss' with a two-space lead. */
-void CudaSpace::print_memory_view( std::ostream & oss )
-{
-  Impl::cuda_space_singleton().print( oss , std::string("  ") );
-}
-
-/** Tracker count for 'ptr'; not callable from inside a host parallel region. */
-int CudaSpace::count( const void * ptr ) {
-  if ( HostSpace::in_parallel() ) {
-    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::CudaSpace::count called within a parallel functor") );
-    return -1;
-  }
-
-  return Impl::cuda_space_singleton().count(ptr);
-}
-
-/** Label the tracker reports for 'p'. */
-std::string CudaSpace::query_label( const void * p )
-{
-  const char * const label = Impl::cuda_space_singleton().query_label(p);
-
-  return std::string( label );
-}
-
-/** Forward all arguments to the tracker's texture_object_attach. */
-void CudaSpace::texture_object_attach( const void * const              arg_ptr
-                                     , const unsigned                  arg_type_size
-                                     , ::cudaChannelFormatDesc const & arg_desc
-                                     , ::cudaTextureObject_t * const   arg_tex_obj
-                                     , void const           ** const   arg_alloc_ptr
-                                     , int                   * const   arg_offset
-                                     )
-{
-  Impl::cuda_space_singleton().texture_object_attach(
-    arg_ptr , arg_type_size , arg_desc , arg_tex_obj , arg_alloc_ptr , arg_offset );
-}
-
-/** Report an attempt to execute a Cuda function from a non-Cuda space. */
-void CudaSpace::access_error()
-{
-  Kokkos::Impl::throw_runtime_exception(
-    std::string("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" ) );
-}
-
-/** Report an attempt to access the Cuda data at 'ptr' from non-Cuda execution. */
-void CudaSpace::access_error( const void * const ptr )
-{
-  std::ostringstream msg ;
-  msg << "Kokkos::CudaSpace::access_error:"
-      << " attempt to access Cuda-data labeled("
-      << query_label( ptr )
-      << ") from non-Cuda execution" ;
-  Kokkos::Impl::throw_runtime_exception( msg.str() );
-}
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-// CudaUVMSpace: every member forwards to the CudaUVMSpace tracking singleton.
-namespace Kokkos {
-
-/** Whether the tracker reports UVM as available. */
-bool CudaUVMSpace::available()
-{
-  return Impl::cuda_uvm_space_singleton().available();
-}
-
-/** Allocate 'size' bytes tracked under 'label'. */
-void * CudaUVMSpace::allocate( const std::string & label , const size_t size )
-{
-  return Impl::cuda_uvm_space_singleton().allocate( label , size );
-}
-
-/** Forward a decrement for 'ptr' to the tracker. */
-void CudaUVMSpace::decrement( const void * ptr )
-{
-  Impl::cuda_uvm_space_singleton().decrement( ptr );
-}
-
-/** Forward an increment for 'ptr' to the tracker. */
-void CudaUVMSpace::increment( const void * ptr )
-{
-  Impl::cuda_uvm_space_singleton().increment( ptr );
-}
-
-/** Tracker count for 'ptr'; not callable from inside a host parallel region. */
-int CudaUVMSpace::count( const void * ptr ) {
-  if ( HostSpace::in_parallel() ) {
-    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::CudaUVMSpace::count called within a parallel functor") );
-    return -1;
-  }
-
-  return Impl::cuda_uvm_space_singleton().count(ptr);
-}
-
-/** Print the tracker contents to 'oss' with a two-space lead. */
-void CudaUVMSpace::print_memory_view( std::ostream & oss )
-{
-  Impl::cuda_uvm_space_singleton().print( oss , std::string("  ") );
-}
-
-/** Label the tracker reports for 'p'. */
-std::string CudaUVMSpace::query_label( const void * p )
-{
-  const char * const label = Impl::cuda_uvm_space_singleton().query_label(p);
-
-  return std::string( label );
-}
-
-/** Forward all arguments to the tracker's texture_object_attach. */
-void CudaUVMSpace::texture_object_attach( const void * const              arg_ptr
-                                        , const unsigned                  arg_type_size
-                                        , ::cudaChannelFormatDesc const & arg_desc
-                                        , ::cudaTextureObject_t * const   arg_tex_obj
-                                        , void const           ** const   arg_alloc_ptr
-                                        , int                   * const   arg_offset
-                                        )
-{
-  Impl::cuda_uvm_space_singleton().texture_object_attach(
-    arg_ptr , arg_type_size , arg_desc , arg_tex_obj , arg_alloc_ptr , arg_offset );
-}
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-// CudaHostPinnedSpace: every member forwards to the host-pinned tracking
-// singleton.
-namespace Kokkos {
-
-/** Allocate 'size' bytes tracked under 'label'. */
-void * CudaHostPinnedSpace::allocate( const std::string & label , const size_t size )
-{
-  return Impl::cuda_host_pinned_space_singleton().allocate( label , size );
-}
-
-/** Forward a decrement for 'ptr' to the tracker. */
-void CudaHostPinnedSpace::decrement( const void * ptr )
-{
-  Impl::cuda_host_pinned_space_singleton().decrement( ptr );
-}
-
-
-/** Forward an increment for 'ptr' to the tracker. */
-void CudaHostPinnedSpace::increment( const void * ptr )
-{
-  Impl::cuda_host_pinned_space_singleton().increment( ptr );
-}
-
-/** Tracker count for 'ptr'; not callable from inside a host parallel region. */
-int CudaHostPinnedSpace::count( const void * ptr ) {
-  if ( ! HostSpace::in_parallel() ) {
-    // Host-pinned allocations are recorded by the host-pinned tracker, so
-    // the count must be read from it and not from the UVM tracker.
-    return Impl::cuda_host_pinned_space_singleton().count(ptr);
-  }
-  else {
-    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::CudaHostPinnedSpace::count called within a parallel functor") );
-    return -1;
-  }
-}
-
-/** Print the tracker contents to 'oss' with a two-space lead. */
-void CudaHostPinnedSpace::print_memory_view( std::ostream & oss )
-{
-  Impl::cuda_host_pinned_space_singleton().print( oss , std::string("  ") );
-}
-
-/** Label the tracker reports for 'p'. */
-std::string CudaHostPinnedSpace::query_label( const void * p )
-{
-  return std::string( Impl::cuda_host_pinned_space_singleton().query_label(p) );
-}
-
-} // namespace Kokkos
-
-#endif // KOKKOS_HAVE_CUDA
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
@ -1,670 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-/*--------------------------------------------------------------------------*/
-/* Kokkos interfaces */
-
-#include <Kokkos_Core.hpp>
-
-/* only compile this file if CUDA is enabled for Kokkos */
-#ifdef KOKKOS_HAVE_CUDA
-
-#include <Cuda/Kokkos_Cuda_Internal.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-/*--------------------------------------------------------------------------*/
-/* Standard 'C' libraries */
-#include <stdlib.h>
-
-/* Standard 'C++' libraries */
-#include <vector>
-#include <iostream>
-#include <sstream>
-#include <string>
-
-#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
-__device__ __constant__
-Kokkos::Impl::CudaTraits::ConstantGlobalBufferType
-kokkos_impl_cuda_constant_memory_buffer ; // constant-memory buffer object; defined in this file only when KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE is set
-#endif
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-namespace {
-
-// Kernel: store the value of __CUDA_ARCH__ this kernel was compiled with
-// into *d_arch, or 0 when the macro is not defined.
-// Launched in this file as <<<1,1>>>; every executing thread writes the
-// same value to the same address.
-__global__
-void query_cuda_kernel_arch( int * d_arch )
-{
-  int compiled_arch = 0 ;
-#if defined( __CUDA_ARCH__ )
-  compiled_arch = __CUDA_ARCH__ ;
-#endif
-  *d_arch = compiled_arch ;
-}
-
-/** Query what compute capability is actually launched to the device.
- *
- *  Launches query_cuda_kernel_arch on a single thread and copies the
- *  integer it writes back to the host.
- *
- *  Every CUDA return code is checked, including the launch status obtained
- *  from cudaGetLastError().  A failure is reported through
- *  cuda_internal_safe_call (which throws) instead of silently returning
- *  whatever happened to be in the buffer.  The device buffer is released
- *  before any error is reported so a failure does not leak it.
- */
-int cuda_kernel_arch()
-{
-  int * d_arch = 0 ;
-  CUDA_SAFE_CALL( cudaMalloc( (void **) & d_arch , sizeof(int) ) );
-
-  query_cuda_kernel_arch<<<1,1>>>( d_arch );
-
-  // A kernel launch returns nothing; its status must be fetched explicitly.
-  cudaError status = cudaGetLastError();
-
-  int arch = 0 ;
-  if ( cudaSuccess == status ) {
-    status = cudaMemcpy( & arch , d_arch , sizeof(int) , cudaMemcpyDefault );
-  }
-
-  const cudaError free_status = cudaFree( d_arch );
-
-  cuda_internal_safe_call( status , "cuda_kernel_arch : query_cuda_kernel_arch" , __FILE__ , __LINE__ );
-  cuda_internal_safe_call( free_status , "cuda_kernel_arch : cudaFree( d_arch )" , __FILE__ , __LINE__ );
-
-  return arch ;
-}
-
-// True when the environment variable CUDA_LAUNCH_BLOCKING is set and
-// atoi() of its value is non-zero; false otherwise.
-bool cuda_launch_blocking()
-{
-  const char * const setting = getenv("CUDA_LAUNCH_BLOCKING");
-
-  return ( 0 != setting ) && ( 0 != atoi( setting ) );
-}
-
-}
-
-// Call cudaDeviceSynchronize() (checked by CUDA_SAFE_CALL) unless
-// cuda_launch_blocking() reported true.  The environment is consulted
-// once, on the first call, and the answer is cached in a function-local
-// static.
-void cuda_device_synchronize()
-{
-  static const bool launch_blocking = cuda_launch_blocking();
-
-  if ( launch_blocking ) return ;
-
-  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
-}
-
-void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
-{
-  std::ostringstream out ;
-  out << name << " error: " << cudaGetErrorString(e);
-  if (file) {
-    out << " " << file << ":" << line;
-  }
-  throw_runtime_exception( out.str() );
-}
-
-//----------------------------------------------------------------------------
-// Some significant cuda device properties:
-//
-// cudaDeviceProp::name                : Text label for device
-// cudaDeviceProp::major               : Device major number
-// cudaDeviceProp::minor               : Device minor number
-// cudaDeviceProp::warpSize            : number of threads per warp
-// cudaDeviceProp::multiProcessorCount : number of multiprocessors
-// cudaDeviceProp::sharedMemPerBlock   : capacity of shared memory per block
-// cudaDeviceProp::totalConstMem       : capacity of constant memory
-// cudaDeviceProp::totalGlobalMem      : capacity of global memory
-// cudaDeviceProp::maxGridSize[3]      : maximum grid size
-
-//
-//  Section 4.4.2.4 of the CUDA Toolkit Reference Manual
-//
-// struct cudaDeviceProp {
-//   char name[256];
-//   size_t totalGlobalMem;
-//   size_t sharedMemPerBlock;
-//   int regsPerBlock;
-//   int warpSize;
-//   size_t memPitch;
-//   int maxThreadsPerBlock;
-//   int maxThreadsDim[3];
-//   int maxGridSize[3];
-//   size_t totalConstMem;
-//   int major;
-//   int minor;
-//   int clockRate;
-//   size_t textureAlignment;
-//   int deviceOverlap;
-//   int multiProcessorCount;
-//   int kernelExecTimeoutEnabled;
-//   int integrated;
-//   int canMapHostMemory;
-//   int computeMode;
-//   int concurrentKernels;
-//   int ECCEnabled;
-//   int pciBusID;
-//   int pciDeviceID;
-//   int tccDriver;
-//   int asyncEngineCount;
-//   int unifiedAddressing;
-//   int memoryClockRate;
-//   int memoryBusWidth;
-//   int l2CacheSize;
-//   int maxThreadsPerMultiProcessor;
-// };
-
-
-namespace {
-
-
-
-class CudaInternalDevices { // table of detected CUDA devices and their properties, reached through singleton()
-public:
-  enum { MAXIMUM_DEVICE_COUNT = 8 }; // capacity of m_cudaProp
-  struct cudaDeviceProp  m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ; // entry i is filled by cudaGetDeviceProperties for device i
-  int                    m_cudaDevCount ; // value written by cudaGetDeviceCount in the constructor
-
-  CudaInternalDevices(); // queries the device count and per-device properties
-
-  static const CudaInternalDevices & singleton(); // function-local static instance
-};
-
-// Query the number of CUDA devices and record the properties of each.
-//
-// m_cudaProp holds MAXIMUM_DEVICE_COUNT entries.  If the runtime reports
-// more devices than that, the count is clamped (with a warning on
-// std::cerr) so the property loop cannot write past the end of the array;
-// the surplus devices are then simply not selectable.
-//
-// Any failing CUDA call is reported by CUDA_SAFE_CALL.
-CudaInternalDevices::CudaInternalDevices()
-{
-  // See 'cudaSetDeviceFlags' for host-device thread interaction
-  // Section 4.4.2.6 of the CUDA Toolkit Reference Manual
-
-  CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
-
-  if ( int(MAXIMUM_DEVICE_COUNT) < m_cudaDevCount ) {
-    std::cerr << "Kokkos::Cuda WARNING: detected " << m_cudaDevCount
-              << " devices but only the first " << int(MAXIMUM_DEVICE_COUNT)
-              << " are supported; the remaining devices are ignored"
-              << std::endl ;
-    m_cudaDevCount = MAXIMUM_DEVICE_COUNT ;
-  }
-
-  for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
-    CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
-  }
-}
-
-// Access the single CudaInternalDevices object, a function-local static
-// that is constructed on the first call.
-const CudaInternalDevices & CudaInternalDevices::singleton()
-{
-  static CudaInternalDevices instance ;
-
-  return instance ;
-}
-
-}
-
-//----------------------------------------------------------------------------
-
-class CudaInternal { // state of the selected CUDA device plus its scratch buffers; reached through singleton()
-private:
-
-  CudaInternal( const CudaInternal & ); // copy constructor declared private: copying is not offered to users
-  CudaInternal & operator = ( const CudaInternal & ); // copy assignment declared private as well
-
-public:
-
-  typedef Cuda::size_type size_type ;
-
-  int         m_cudaDev ; // selected device id; -1 while not initialized
-  int         m_cudaArch ; // value returned by cuda_kernel_arch() during initialize(); -1 before
-  unsigned    m_maxWarpCount ; // set by initialize(): 8, capped at CudaTraits::WarpSize
-  unsigned    m_maxBlock ; // set by initialize(): 65535 when m_cudaArch < 300, else maxGridSize[0]
-  unsigned    m_maxSharedWords ; // sharedMemPerBlock divided by sizeof(size_type)
-  size_type   m_scratchSpaceCount ; // size of m_scratchSpace in ScratchGrain units
-  size_type   m_scratchFlagsCount ; // size of m_scratchFlags in ScratchGrain units
-  size_type   m_scratchUnifiedCount ; // size of m_scratchUnified in ScratchGrain units
-  size_type   m_scratchUnifiedSupported ; // copy of cudaDeviceProp::unifiedAddressing
-  size_type   m_streamCount ; // number of entries in m_stream
-  size_type * m_scratchSpace ; // allocated through CudaSpace::allocate in scratch_space()
-  size_type * m_scratchFlags ; // allocated through CudaSpace::allocate in scratch_flags(); zeroed with cudaMemset
-  size_type * m_scratchUnified ; // allocated through CudaHostPinnedSpace::allocate in scratch_unified()
-  cudaStream_t * m_stream ; // malloc'ed table created by initialize(), released by finalize()
-
-
-  static CudaInternal & singleton(); // function-local static instance
-
-  int verify_is_initialized( const char * const label ) const ; // non-zero when m_cudaDev >= 0; otherwise prints an error naming 'label'
-
-  int is_initialized() const
-    { return 0 != m_scratchSpace && 0 != m_scratchFlags ; } // initialized means both scratch buffers exist
-
-  void initialize( int cuda_device_id , int stream_count );
-  void finalize();
-
-  void print_configuration( std::ostream & ) const ;
-
-  ~CudaInternal(); // prints an error if resources are still held, then resets all members
-
-  CudaInternal() // start in the "not initialized" state: ids are -1, counts and pointers are 0
-    : m_cudaDev( -1 )
-    , m_cudaArch( -1 )
-    , m_maxWarpCount( 0 )
-    , m_maxBlock( 0 ) 
-    , m_maxSharedWords( 0 )
-    , m_scratchSpaceCount( 0 )
-    , m_scratchFlagsCount( 0 )
-    , m_scratchUnifiedCount( 0 )
-    , m_scratchUnifiedSupported( 0 )
-    , m_streamCount( 0 )
-    , m_scratchSpace( 0 )
-    , m_scratchFlags( 0 )
-    , m_scratchUnified( 0 )
-    , m_stream( 0 )
-    {}
-
-  size_type * scratch_space( const size_type size ); // grow-on-demand accessor; 'size' is compared against a byte capacity
-  size_type * scratch_flags( const size_type size ); // grow-on-demand accessor; 'size' is compared against a byte capacity
-  size_type * scratch_unified( const size_type size ); // grow-on-demand accessor; grows only if unified addressing is supported
-};
-
-//----------------------------------------------------------------------------
-
-
-// Write a human-readable summary to 's': the compile-time CUDA macros
-// that are defined, followed by one line per detected device with its
-// name, capability and memory sizes.  The line of the device equal to
-// m_cudaDev is tagged " : Selected".
-void CudaInternal::print_configuration( std::ostream & s ) const
-{
-  const CudaInternalDevices & devices = CudaInternalDevices::singleton();
-
-#if defined( KOKKOS_HAVE_CUDA )
-  s << "macro  KOKKOS_HAVE_CUDA      : defined" << std::endl ;
-#endif
-#if defined( CUDA_VERSION )
-  {
-    const int version_major = CUDA_VERSION / 1000 ;
-    const int version_minor = ( CUDA_VERSION % 1000 ) / 10 ;
-
-    s << "macro  CUDA_VERSION          = " << CUDA_VERSION
-      << " = version " << version_major
-      << "." << version_minor
-      << std::endl ;
-  }
-#endif
-
-  for ( int dev = 0 ; dev < devices.m_cudaDevCount ; ++dev ) {
-    const struct cudaDeviceProp & prop = devices.m_cudaProp[dev] ;
-
-    s << "Kokkos::Cuda[ " << dev << " ] " << prop.name ;
-    s << " capability " << prop.major << "." << prop.minor ;
-    s << ", Total Global Memory: " << human_memory_size(prop.totalGlobalMem) ;
-    s << ", Shared Memory per Block: " << human_memory_size(prop.sharedMemPerBlock);
-
-    if ( dev == m_cudaDev ) {
-      s << " : Selected" ;
-    }
-
-    s << std::endl ;
-  }
-}
-
-//----------------------------------------------------------------------------
-
-// Destructor.  If the stream table or any scratch buffer is still held,
-// report on std::cerr that finalize() was not called.  Nothing is
-// released here; all members are simply reset to their "not initialized"
-// values.
-CudaInternal::~CudaInternal()
-{
-  const bool still_holding =
-    0 != m_stream ||
-    0 != m_scratchSpace ||
-    0 != m_scratchFlags ||
-    0 != m_scratchUnified ;
-
-  if ( still_holding ) {
-    std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
-              << std::endl ;
-    std::cerr.flush();
-  }
-
-  // identifiers
-  m_cudaDev  = -1 ;
-  m_cudaArch = -1 ;
-
-  // limits and counts
-  m_maxWarpCount = m_maxBlock = m_maxSharedWords = 0 ;
-  m_scratchSpaceCount = m_scratchFlagsCount = m_scratchUnifiedCount = 0 ;
-  m_scratchUnifiedSupported = 0 ;
-  m_streamCount = 0 ;
-
-  // pointers
-  m_scratchSpace   = 0 ;
-  m_scratchFlags   = 0 ;
-  m_scratchUnified = 0 ;
-  m_stream         = 0 ;
-}
-
-// Return non-zero when a device has been selected (m_cudaDev >= 0).
-// Otherwise print an error on std::cerr that names 'label' and return 0.
-int CudaInternal::verify_is_initialized( const char * const label ) const
-{
-  const bool device_selected = 0 <= m_cudaDev ;
-
-  if ( ! device_selected ) {
-    std::cerr << "Kokkos::Cuda::" << label << " : ERROR device not initialized" << std::endl ;
-  }
-
-  return device_selected ;
-}
-
-// Access the single CudaInternal object, a function-local static that is
-// constructed on the first call.
-CudaInternal & CudaInternal::singleton()
-{
-  static CudaInternal instance ;
-
-  return instance ;
-}
-
-// Select CUDA device 'cuda_device_id', reset it, record its limits and
-// allocate the initial scratch buffers.
-//
-// 'stream_count' is the number of entries reserved in the stream table
-// (m_stream); every entry is set to 0 here.  A value <= 0 reserves no
-// table.
-//
-// Throws (via throw_runtime_exception / CUDA_SAFE_CALL) when
-//   - HostSpace::execution_space is not initialized,
-//   - this object is already initialized,
-//   - the device id is outside the detected range,
-//   - the device capability is below 2.0,
-//   - the stream table cannot be allocated,
-//   - a CUDA runtime call fails.
-void CudaInternal::initialize( int cuda_device_id , int stream_count )
-{
-  enum { WordSize = sizeof(size_type) };
-
-  if ( ! HostSpace::execution_space::is_initialized() ) {
-    const std::string msg("Cuda::initialize ERROR : HostSpace::execution_space is not initialized");
-    throw_runtime_exception( msg );
-  }
-
-  const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
-
-  // Mirrors is_initialized(): initialized means both buffers exist.
-  const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ;
-
-  const bool ok_id   = 0 <= cuda_device_id &&
-                            cuda_device_id < dev_info.m_cudaDevCount ;
-
-  // Need device capability 2.0 or better
-
-  const bool ok_dev = ok_id &&
-    ( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major &&
-      0 <= dev_info.m_cudaProp[ cuda_device_id ].minor );
-
-  if ( ok_init && ok_dev ) {
-
-    const struct cudaDeviceProp & cudaProp =
-      dev_info.m_cudaProp[ cuda_device_id ];
-
-    m_cudaDev = cuda_device_id ;
-
-    CUDA_SAFE_CALL( cudaSetDevice( m_cudaDev ) );
-    CUDA_SAFE_CALL( cudaDeviceReset() );
-    Kokkos::Impl::cuda_device_synchronize();
-
-    // Query what compute capability architecture a kernel executes:
-    m_cudaArch = cuda_kernel_arch();
-
-    if ( m_cudaArch != cudaProp.major * 100 + cudaProp.minor * 10 ) {
-      std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
-                << ( m_cudaArch / 100 ) << "." << ( ( m_cudaArch % 100 ) / 10 )
-                << " on device with compute capability "
-                << cudaProp.major << "." << cudaProp.minor
-                << " , this will likely reduce potential performance."
-                << std::endl ;
-    }
-
-    //----------------------------------
-    // Maximum number of warps,
-    // at most one warp per thread in a warp for reduction.
-
-    // HCE 2012-February :
-    // Found bug in CUDA 4.1 that sometimes a kernel launch would fail
-    // if the thread count == 1024 and a functor is passed to the kernel.
-    // Copying the kernel to constant memory and then launching with
-    // thread count == 1024 would work fine.
-    //
-    // HCE 2012-October :
-    // All compute capabilities support at least 16 warps (512 threads).
-    // However, we have found that 8 warps typically gives better performance.
-
-    m_maxWarpCount = 8 ;
-
-    // m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize ;
-
-    if ( Impl::CudaTraits::WarpSize < m_maxWarpCount ) {
-      m_maxWarpCount = Impl::CudaTraits::WarpSize ;
-    }
-
-    m_maxSharedWords = cudaProp.sharedMemPerBlock / WordSize ;
-
-    //----------------------------------
-    // Maximum number of blocks:
-
-    m_maxBlock = m_cudaArch < 300 ? 65535 : cudaProp.maxGridSize[0] ;
-
-    //----------------------------------
-
-    m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
-
-    if ( ! m_scratchUnifiedSupported ) {
-      std::cout << "Kokkos::Cuda device "
-                << cudaProp.name << " capability "
-                << cudaProp.major << "." << cudaProp.minor
-                << " does not support unified virtual address space"
-                << std::endl ;
-    }
-
-    //----------------------------------
-    // Multiblock reduction uses scratch flags for counters
-    // and scratch space for partial reduction values.
-    // Allocate some initial space.  This will grow as needed.
-
-    {
-      const unsigned reduce_block_count = m_maxWarpCount * Impl::CudaTraits::WarpSize ;
-
-      (void) scratch_unified( 16 * sizeof(size_type) );
-      (void) scratch_flags( reduce_block_count * 2  * sizeof(size_type) );
-      (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
-    }
-    //----------------------------------
-
-    // Only a positive count reserves a table: a negative value would
-    // otherwise turn into an enormous allocation request and loop bound.
-    if ( 0 < stream_count ) {
-      cudaStream_t * const streams =
-        (cudaStream_t*) malloc( stream_count * sizeof(cudaStream_t) );
-
-      if ( 0 == streams ) {
-        std::ostringstream msg ;
-        msg << "Kokkos::Cuda::initialize(" << cuda_device_id
-            << ") FAILED : could not allocate table for "
-            << stream_count << " streams" ;
-        Kokkos::Impl::throw_runtime_exception( msg.str() );
-      }
-
-      m_stream      = streams ;
-      m_streamCount = stream_count ;
-      for ( size_type i = 0 ; i < m_streamCount ; ++i ) m_stream[i] = 0 ;
-    }
-  }
-  else {
-
-    std::ostringstream msg ;
-    msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED" ;
-
-    if ( ! ok_init ) {
-      msg << " : Already initialized" ;
-    }
-    if ( ! ok_id ) {
-      // Valid ids are 0 .. count-1, so print a half-open range.
-      msg << " : Device identifier out of range "
-          << "[0.." << dev_info.m_cudaDevCount << ")" ;
-    }
-    else if ( ! ok_dev ) {
-      msg << " : Device " ;
-      msg << dev_info.m_cudaProp[ cuda_device_id ].major ;
-      msg << "." ;
-      msg << dev_info.m_cudaProp[ cuda_device_id ].minor ;
-      msg << " has insufficient capability, required 2.0 or better" ;
-    }
-    Kokkos::Impl::throw_runtime_exception( msg.str() );
-  }
-}
-
-//----------------------------------------------------------------------------
-
-typedef Cuda::size_type ScratchGrain[ Impl::CudaTraits::WarpSize ] ; // one granule: WarpSize entries of size_type
-enum { sizeScratchGrain = sizeof(ScratchGrain) }; // bytes per granule; the scratch_* functions below size their buffers in whole granules
-
-
-// Return the scratch-flags buffer, first growing it when the device is
-// initialized and the current capacity (granule count times granule
-// bytes) is smaller than 'size'.  Growing releases the old buffer through
-// CudaSpace::decrement, allocates a new one rounded up to whole granules
-// and zero-fills it with cudaMemset.
-Cuda::size_type *
-CudaInternal::scratch_flags( const Cuda::size_type size )
-{
-  const bool must_grow =
-    verify_is_initialized("scratch_flags") &&
-    m_scratchFlagsCount * sizeScratchGrain < size ;
-
-  if ( ! must_grow ) {
-    return m_scratchFlags ;
-  }
-
-  CudaSpace::decrement( m_scratchFlags );
-
-  // round the request up to a whole number of granules
-  m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
-
-  const std::string label("InternalScratchFlags");
-
-  m_scratchFlags = (size_type *)
-    CudaSpace::allocate( label , sizeof( ScratchGrain ) * m_scratchFlagsCount );
-
-  CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
-
-  return m_scratchFlags ;
-}
-
-// Return the scratch-space buffer, first growing it when the device is
-// initialized and the current capacity (granule count times granule
-// bytes) is smaller than 'size'.  Growing releases the old buffer through
-// CudaSpace::decrement and allocates a new one rounded up to whole
-// granules; the new buffer is not cleared here.
-Cuda::size_type *
-CudaInternal::scratch_space( const Cuda::size_type size )
-{
-  const bool must_grow =
-    verify_is_initialized("scratch_space") &&
-    m_scratchSpaceCount * sizeScratchGrain < size ;
-
-  if ( ! must_grow ) {
-    return m_scratchSpace ;
-  }
-
-  CudaSpace::decrement( m_scratchSpace );
-
-  // round the request up to a whole number of granules
-  m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
-
-  const std::string label("InternalScratchSpace");
-
-  m_scratchSpace = (size_type *)
-    CudaSpace::allocate( label , sizeof( ScratchGrain ) * m_scratchSpaceCount );
-
-  return m_scratchSpace ;
-}
-
-// Return the unified scratch buffer, first growing it when the device is
-// initialized, unified addressing is supported, and the current capacity
-// (granule count times granule bytes) is smaller than 'size'.  Growing
-// releases the old buffer through CudaHostPinnedSpace::decrement and
-// allocates a new one rounded up to whole granules.
-Cuda::size_type *
-CudaInternal::scratch_unified( const Cuda::size_type size )
-{
-  const bool must_grow =
-    verify_is_initialized("scratch_unified") &&
-    m_scratchUnifiedSupported &&
-    m_scratchUnifiedCount * sizeScratchGrain < size ;
-
-  if ( ! must_grow ) {
-    return m_scratchUnified ;
-  }
-
-  CudaHostPinnedSpace::decrement( m_scratchUnified );
-
-  // round the request up to a whole number of granules
-  m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
-
-  const std::string label("InternalScratchUnified");
-
-  m_scratchUnified = (size_type *)
-    CudaHostPinnedSpace::allocate( label , sizeof( ScratchGrain ) * m_scratchUnifiedCount );
-
-  return m_scratchUnified ;
-}
-
-//----------------------------------------------------------------------------
-
-// Release what initialize() and the scratch_* functions acquired.
-// Does nothing when neither the scratch space nor the scratch flags
-// buffer exists.  Otherwise: destroys stream table entries 1 .. count-1
-// (entry 0 is not passed to cudaStreamDestroy), frees the table, releases
-// the three scratch buffers and resets the members listed below.
-void CudaInternal::finalize()
-{
-  if ( 0 == m_scratchSpace && 0 == m_scratchFlags ) {
-    return ;
-  }
-
-  if ( 0 != m_stream ) {
-    for ( size_type slot = 1 ; slot < m_streamCount ; ++slot ) {
-      cudaStreamDestroy( m_stream[slot] );
-      m_stream[slot] = 0 ;
-    }
-    free( m_stream );
-  }
-
-  CudaSpace::decrement( m_scratchSpace );
-  CudaSpace::decrement( m_scratchFlags );
-  CudaHostPinnedSpace::decrement( m_scratchUnified );
-
-  // back to the "not initialized" state
-  m_cudaDev = -1 ;
-
-  m_maxWarpCount   = 0 ;
-  m_maxBlock       = 0 ;
-  m_maxSharedWords = 0 ;
-
-  m_scratchSpaceCount   = 0 ;
-  m_scratchFlagsCount   = 0 ;
-  m_scratchUnifiedCount = 0 ;
-  m_streamCount         = 0 ;
-
-  m_scratchSpace   = 0 ;
-  m_scratchFlags   = 0 ;
-  m_scratchUnified = 0 ;
-  m_stream         = 0 ;
-}
-
-//----------------------------------------------------------------------------
-
-// Read m_maxWarpCount from the CudaInternal singleton.
-Cuda::size_type cuda_internal_maximum_warp_count()
-{
-  const CudaInternal & internal = CudaInternal::singleton();
-  return internal.m_maxWarpCount ;
-}
-
-// Read m_maxBlock from the CudaInternal singleton.
-Cuda::size_type cuda_internal_maximum_grid_count()
-{
-  const CudaInternal & internal = CudaInternal::singleton();
-  return internal.m_maxBlock ;
-}
-
-// Read m_maxSharedWords from the CudaInternal singleton.
-Cuda::size_type cuda_internal_maximum_shared_words()
-{
-  const CudaInternal & internal = CudaInternal::singleton();
-  return internal.m_maxSharedWords ;
-}
-
-// Forward to CudaInternal::scratch_space on the singleton.
-Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size )
-{
-  CudaInternal & internal = CudaInternal::singleton();
-  return internal.scratch_space( size );
-}
-
-// Forward to CudaInternal::scratch_flags on the singleton.
-Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size )
-{
-  CudaInternal & internal = CudaInternal::singleton();
-  return internal.scratch_flags( size );
-}
-
-// Forward to CudaInternal::scratch_unified on the singleton.
-Cuda::size_type * cuda_internal_scratch_unified( const Cuda::size_type size )
-{
-  CudaInternal & internal = CudaInternal::singleton();
-  return internal.scratch_unified( size );
-}
-
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-// Number of devices recorded by the CudaInternalDevices singleton.
-Cuda::size_type Cuda::detect_device_count()
-{
-  const Impl::CudaInternalDevices & devices = Impl::CudaInternalDevices::singleton();
-  return devices.m_cudaDevCount ;
-}
-
-// Forward to CudaInternal::is_initialized on the singleton.
-int Cuda::is_initialized()
-{
-  const Impl::CudaInternal & internal = Impl::CudaInternal::singleton();
-  return internal.is_initialized();
-}
-
-// Initialize the CudaInternal singleton with the device id stored in
-// 'config' and 'num_instances' as its stream count.
-void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
-{
-  Impl::CudaInternal & internal = Impl::CudaInternal::singleton();
-  internal.initialize( config.cuda_device_id , num_instances );
-}
-
-// Build one entry per detected device, in device order, holding
-// major * 100 + minor of that device's properties.
-std::vector<unsigned>
-Cuda::detect_device_arch()
-{
-  const Impl::CudaInternalDevices & devices = Impl::CudaInternalDevices::singleton();
-
-  const int device_count = devices.m_cudaDevCount ;
-
-  std::vector<unsigned> output( device_count );
-
-  int dev = 0 ;
-  while ( dev < device_count ) {
-    const struct cudaDeviceProp & prop = devices.m_cudaProp[dev] ;
-    output[dev] = prop.major * 100 + prop.minor ;
-    ++dev ;
-  }
-
-  return output ;
-}
-
-// major * 100 + minor of the selected device, or 0 when no device is
-// selected (m_cudaDev < 0).
-Cuda::size_type Cuda::device_arch()
-{
-  const int selected = Impl::CudaInternal::singleton().m_cudaDev ;
-
-  if ( selected < 0 ) {
-    const int none = 0 ;
-    return none ;
-  }
-
-  const struct cudaDeviceProp & prop =
-    Impl::CudaInternalDevices::singleton().m_cudaProp[ selected ] ;
-
-  const int arch = prop.major * 100 + prop.minor ;
-
-  return arch ;
-}
-
-// Forward to CudaInternal::finalize on the singleton.
-void Cuda::finalize()
-{
-  Impl::CudaInternal & internal = Impl::CudaInternal::singleton();
-  internal.finalize();
-}
-
-// Default instance: records the selected device id and uses stream 0.
-// verify_is_initialized prints an error if no device has been selected;
-// its result is not otherwise used.
-Cuda::Cuda()
-  : m_device( Impl::CudaInternal::singleton().m_cudaDev )
-  , m_stream( 0 )
-{
-  const Impl::CudaInternal & internal = Impl::CudaInternal::singleton();
-
-  (void) internal.verify_is_initialized( "Cuda instance constructor" );
-}
-
-// Instance bound to an entry of the stream table.
-//
-// m_stream is m_stream[ instance_id % m_streamCount ] of the CudaInternal
-// singleton when the device is initialized AND a stream table exists.
-// Otherwise it is stream 0, the same value the default constructor uses.
-//
-// The table checks prevent a modulo by zero and an index into a null
-// table when initialize() was called with no streams reserved.
-// verify_is_initialized is still evaluated first, so its error message is
-// printed when no device has been selected.
-Cuda::Cuda( const int instance_id )
-  : m_device( Impl::CudaInternal::singleton().m_cudaDev )
-  , m_stream(
-      ( Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" ) &&
-        0 != Impl::CudaInternal::singleton().m_stream &&
-        0 != Impl::CudaInternal::singleton().m_streamCount )
-        ? Impl::CudaInternal::singleton().m_stream[ instance_id % Impl::CudaInternal::singleton().m_streamCount ]
-        : 0 )
-{}
-
-// Forward to CudaInternal::print_configuration on the singleton.
-// The unnamed bool parameter is ignored.
-void Cuda::print_configuration( std::ostream & s , const bool )
-{
-  const Impl::CudaInternal & internal = Impl::CudaInternal::singleton();
-  internal.print_configuration( s );
-}
-
-// Always returns false; performs no other action.
-bool Cuda::sleep()
-{
-  return false ;
-}
-
-// Always returns true; performs no other action.
-bool Cuda::wake()
-{
-  return true ;
-}
-
-// Forward to Kokkos::Impl::cuda_device_synchronize().
-void Cuda::fence()
-{
-  using Kokkos::Impl::cuda_device_synchronize ;
-
-  cuda_device_synchronize();
-}
-
-} // namespace Kokkos
-
-#endif // KOKKOS_HAVE_CUDA
-//----------------------------------------------------------------------------
-
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
@ -1,171 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CUDA_INTERNAL_HPP
-#define KOKKOS_CUDA_INTERNAL_HPP
-
-namespace Kokkos {
-namespace Impl {
-
-void cuda_internal_error_throw( cudaError e , const char * name, const char * file = NULL, const int line = 0 );
-
-void cuda_device_synchronize();
-
-// Check the result of a CUDA runtime call: return quietly on cudaSuccess,
-// otherwise hand the error and its call-site description ('name', 'file',
-// 'line') to cuda_internal_error_throw.
-inline
-void cuda_internal_safe_call( cudaError e , const char * name, const char * file = NULL, const int line = 0)
-{
-  if ( cudaSuccess == e ) {
-    return ;
-  }
-
-  cuda_internal_error_throw( e , name, file, line );
-}
-
-// Find a block size for launching DriverType by doubling.
-//
-// With CUDA_VERSION < 6050 the answer is the constant 256.  Otherwise
-// block sizes 32, 64, ... are probed with
-// cudaOccupancyMaxActiveBlocksPerMultiprocessor, using the shared-memory
-// amount FunctorTeamShmemSize reports for 'f' at that block size.
-// Probing stops at 1024 or at the first size for which no block is
-// reported as resident.  The return value is the last probed size if it
-// still fits, else half of it (16 when even 32 does not fit).
-//
-// The kernel that is queried is the constant-memory launcher when
-// sizeof(DriverType) exceeds CudaTraits::ConstantMemoryUseThreshold and
-// the local-memory launcher otherwise.
-//
-// numBlocks is reset to 0 before every query so that a query which fails
-// without writing its output counts as "does not fit" instead of leaving
-// an indeterminate or stale value to be read.
-template<class DriverType>
-int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
-#if ( CUDA_VERSION < 6050 )
-  return 256;
-#else
-  const bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );
-
-  int blockSize = 16;   // doubled before the first probe, so probing starts at 32
-  int numBlocks = 0;
-
-  do {
-    blockSize *= 2;
-
-    const int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize );
-
-    numBlocks = 0;
-
-    if(Large) {
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks,
-          cuda_parallel_launch_constant_memory<DriverType>,
-          blockSize,
-          sharedmem);
-    } else {
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks,
-          cuda_parallel_launch_local_memory<DriverType>,
-          blockSize,
-          sharedmem);
-    }
-  } while (blockSize<1024 && numBlocks>0);
-
-  if(numBlocks>0) return blockSize;
-  else return blockSize/2;
-#endif
-}
-
-// Find the block size with the highest estimated occupancy.
-//
-// With CUDA_VERSION < 6050 the answer is the constant 256.  Otherwise
-// block sizes 32, 64, ..., 1024 are all probed with
-// cudaOccupancyMaxActiveBlocksPerMultiprocessor, using the shared-memory
-// amount FunctorTeamShmemSize reports for 'f' at that block size.  The
-// score of a size is (resident blocks) * (block size); the first size
-// reaching the highest score wins.  Returns 0 when no probed size
-// reports a resident block.
-//
-// The kernel that is queried is the constant-memory launcher when
-// sizeof(DriverType) exceeds CudaTraits::ConstantMemoryUseThreshold and
-// the local-memory launcher otherwise.
-//
-// numBlocks is a fresh 0 for every probe so that a query which fails
-// without writing its output scores 0 instead of reusing an indeterminate
-// value or the result of the previous probe.
-template<class DriverType>
-int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
-#if ( CUDA_VERSION < 6050 )
-  return 256;
-#else
-  const bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );
-
-  int blockSize=16;      // doubled before the first probe, so probing starts at 32
-  int maxOccupancy=0;
-  int bestBlockSize=0;
-
-  while(blockSize<1024) {
-    blockSize*=2;
-
-    const int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize );
-
-    int numBlocks = 0;
-
-    if(Large) {
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-              &numBlocks,
-              cuda_parallel_launch_constant_memory<DriverType>,
-              blockSize,
-              sharedmem);
-    } else {
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-              &numBlocks,
-              cuda_parallel_launch_local_memory<DriverType>,
-              blockSize,
-              sharedmem);
-    }
-
-    // keep this size only if it is strictly better than the best so far
-    const int occupancy = numBlocks*blockSize;
-    if(maxOccupancy < occupancy) {
-      maxOccupancy = occupancy;
-      bestBlockSize = blockSize;
-    }
-  }
-  return bestBlockSize;
-#endif
-}
-
-}
-}
-
-#define CUDA_SAFE_CALL( call )  \
-	Kokkos::Impl::cuda_internal_safe_call( call , #call, __FILE__, __LINE__ ) /* passes the call's result, its source text and the call site to cuda_internal_safe_call */
-
-#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */
-
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@ -1,421 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CUDA_REDUCESCAN_HPP
-#define KOKKOS_CUDA_REDUCESCAN_HPP
-
-#if defined( __CUDACC__ )
-
-#include <utility>
-
-#include <Kokkos_Parallel.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
-#include <impl/Kokkos_Error.hpp>
-#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-
-
-//Shfl based reductions
-/*
- *  Algorithmic constraints:
- *   (a) threads with same threadIdx.y have same value
- *   (b) blockDim.x == power of two
- *   (c) blockDim.z == 1
- */
-
-template< class ValueType , class JoinOp>
-__device__
-inline void cuda_intra_warp_reduction( ValueType& result,
-                                       const JoinOp& join,
-                                       const int max_active_thread = blockDim.y) {
-
-  // Combine values held by threads with a different threadIdx.y,
-  // doubling the shuffle distance on every round.
-  for(unsigned int stride = 1; blockDim.x * stride < 32; stride *= 2) {
-    const ValueType upper = shfl_down(result, blockDim.x*stride,32u);
-    // Join only when the partner's threadIdx.y is below max_active_thread
-    // (this is what allows a blockDim.y that is not a power of two).
-    if(threadIdx.y + stride < max_active_thread)
-      join(result , upper);
-  }
-
-  // Every lane picks up the value held by lane 0.
-  result = shfl(result,0,32);
-}
-
-template< class ValueType , class JoinOp>
-__device__
-inline void cuda_inter_warp_reduction( ValueType& value,
-                                       const JoinOp& join,
-                                       const int max_active_thread = blockDim.y) { // combines per-warp values through a small static shared buffer
-
-  #define STEP_WIDTH 4 // number of shared slots; NOTE(review): never #undef'd, so the macro stays defined after this function
-  __shared__ char sh_result[sizeof(ValueType)*STEP_WIDTH]; // raw bytes, so ValueType is not constructed in shared memory
-  ValueType* result = (ValueType*) & sh_result;
-  const unsigned step = 32 / blockDim.x; // count of threadIdx.y values per 32 threads (assumes blockDim.x divides 32 - TODO confirm)
-  unsigned shift = STEP_WIDTH;
-  const int id = threadIdx.y%step==0?threadIdx.y/step:65000; // threadIdx.y/step for rows that are a multiple of step, else a large sentinel
-  if(id < STEP_WIDTH ) { // the first STEP_WIDTH ids seed the slots
-    result[id] = value;
-  }
-  __syncthreads();
-  while (shift<=max_active_thread/step) { // NOTE(review): the barrier below needs this trip count to be uniform across the block - confirm max_active_thread is
-    if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) { // ids in [shift, shift+STEP_WIDTH) fold into slot id%STEP_WIDTH, one thread per row
-      join(result[id%STEP_WIDTH],value);
-    }
-    __syncthreads();
-    shift+=STEP_WIDTH;
-  }
-
-
-  value = result[0]; // every thread starts from slot 0 ...
-  for(int i = 1; (i*step<=max_active_thread) && i<STEP_WIDTH; i++) // ... and joins the remaining slots that were filled
-    join(value,result[i]);
-}
-
-template< class ValueType , class JoinOp>
-__device__
-inline void cuda_intra_block_reduction( ValueType& value,
-                                        const JoinOp& join,
-                                        const int max_active_thread = blockDim.y) { // block-wide reduction of 'value' using 'join'
-  cuda_intra_warp_reduction(value,join,max_active_thread); // first stage: shuffle-based pass
-  cuda_inter_warp_reduction(value,join,max_active_thread); // second stage: pass through static shared memory
-}
-
-template< class FunctorType , class JoinOp>
-__device__
-bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void >::reference_type  value,
-                                 const JoinOp& join,
-                                 Cuda::size_type * const m_scratch_space,
-                                 typename FunctorValueTraits< FunctorType , void >::pointer_type const result,
-                                 Cuda::size_type * const m_scratch_flags,
-                                 const int max_active_thread = blockDim.y) { // returns true only in the block that performed the final cross-block join
-  typedef typename FunctorValueTraits< FunctorType , void >::pointer_type pointer_type;
-  typedef typename FunctorValueTraits< FunctorType , void >::value_type value_type;
-
-  //Do the intra-block reduction with shfl operations and static shared memory
-  cuda_intra_block_reduction(value,join,max_active_thread);
-
-  const unsigned id = threadIdx.y*blockDim.x + threadIdx.x; // flat thread index over x and y
-
-  //One thread in the block writes block result to global scratch_memory
-  if(id == 0 ) {
-    pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x; // one slot per block
-    *global = value;
-  }
-
-  //One warp of last block performs inter block reduction through loading the block values from global scratch_memory
-  bool last_block = false;
-  __threadfence(); // publish the scratch store above device-wide before the flag below is incremented
-  __syncthreads();
-  if ( id < 32 ) {
-    Cuda::size_type count;
-
-    //Figure out whether this is the last block
-    if(id == 0)
-      count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
-    count = Kokkos::shfl(count,0,32); // broadcast thread 0's ticket to the other threads with id < 32
-
-    //Last block does the inter block reduction
-    if( count == gridDim.x - 1) {
-      //set flag back to zero
-      if(id == 0)
-        *m_scratch_flags = 0;
-      last_block = true;
-      value = 0; // NOTE(review): hard-coded starting value; presumably only right when 0 is the identity of 'join' - confirm
-
-      pointer_type const volatile global = (pointer_type) m_scratch_space ;
-
-      //Reduce all global values with splitting work over threads in one warp
-      const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32; // min(threads in block, 32)
-      for(int i=id; i<gridDim.x; i+=step_size) {
-        value_type tmp = global[i];
-        join(value, tmp);
-      }
-
-      //Perform shfl reductions within the warp only join if contribution is valid (allows gridDim.x non power of two and <32)
-      if (blockDim.x*blockDim.y > 1) {
-        value_type tmp = Kokkos::shfl_down(value, 1,32);
-        if( id + 1 < gridDim.x )
-          join(value, tmp);
-      }
-      if (blockDim.x*blockDim.y > 2) {
-        value_type tmp = Kokkos::shfl_down(value, 2,32);
-        if( id + 2 < gridDim.x )
-          join(value, tmp);
-      }
-      if (blockDim.x*blockDim.y > 4) {
-        value_type tmp = Kokkos::shfl_down(value, 4,32);
-        if( id + 4 < gridDim.x )
-          join(value, tmp);
-      }
-      if (blockDim.x*blockDim.y > 8) {
-        value_type tmp = Kokkos::shfl_down(value, 8,32);
-        if( id + 8 < gridDim.x )
-          join(value, tmp);
-      }
-      if (blockDim.x*blockDim.y > 16) {
-        value_type tmp = Kokkos::shfl_down(value, 16,32);
-        if( id + 16 < gridDim.x )
-          join(value, tmp);
-      }
-    }
-  }
-
-  //The last block has in its thread=0 the global reduction value through "value"
-  return last_block;
-}
-
-//----------------------------------------------------------------------------
-// See section B.17 of Cuda C Programming Guide Version 3.2
-// for discussion of
-//   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
-// function qualifier which could be used to improve performance.
-//----------------------------------------------------------------------------
-// Maximize shared memory and minimize L1 cache:
-//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
-// For 2.0 capability: 48 KB shared and 16 KB L1
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-/*
- *  Algorithmic constraints:
- *   (a) blockDim.y is a power of two
- *   (b) blockDim.y <= 512
- *   (c) blockDim.x == blockDim.z == 1
- */
-
-template< bool DoScan , class FunctorType , class ArgTag >
-__device__
-void cuda_intra_block_reduce_scan( const FunctorType & functor ,
-                                   const typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type base_data ) // base_data: one value slot per threadIdx.y, updated in place
-{
-  typedef FunctorValueTraits< FunctorType , ArgTag >  ValueTraits ;
-  typedef FunctorValueJoin<   FunctorType , ArgTag >  ValueJoin ;
-
-  typedef typename ValueTraits::pointer_type  pointer_type ;
-
-  const unsigned value_count   = ValueTraits::value_count( functor ); // pointer stride between consecutive threads' slots
-  const unsigned BlockSizeMask = blockDim.y - 1 ; // all low bits set when blockDim.y is a power of two
-
-  // Must have power of two thread count
-
-  if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_intra_block_scan requires power-of-two blockDim"); }
-
-#define BLOCK_REDUCE_STEP( R , TD , S )  \
-  if ( ! ( R & ((1<<(S+1))-1) ) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S)) ); } // when the low S+1 bits of R are clear, join TD with the slot 2^S threads below it
-
-#define BLOCK_SCAN_STEP( TD , N , S )  \
-  if ( N == (1<<S) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S))); } // fires only for the single S with N == 2^S
-
-  const unsigned     rtid_intra = threadIdx.y ^ BlockSizeMask ; // thread index counted back from the last thread
-  const pointer_type tdata_intra = base_data + value_count * threadIdx.y ; // this thread's slot
-
-  { // Intra-warp reduction:
-    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
-    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
-    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
-    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
-    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
-  }
-
-  __syncthreads(); // Wait for all warps to reduce
-
-  { // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
-    const unsigned rtid_inter = ( threadIdx.y ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ; // reversed index scaled up; WarpIndexShift is presumably log2 of the warp size - confirm
-
-    if ( rtid_inter < blockDim.y ) { // only the threads whose scaled reversed index still falls inside the block take part
-
-      const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask ); // slot addressed by the scaled index, not this thread's own slot
-
-      if ( (1<<5) < BlockSizeMask ) {                        BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
-      if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
-      if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
-      if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
-
-      if ( DoScan ) {
-
-        int n = ( rtid_inter &  32 ) ?  32 : ( // lowest set bit of rtid_inter among 32..256, or 0 if none
-                ( rtid_inter &  64 ) ?  64 : (
-                ( rtid_inter & 128 ) ? 128 : (
-                ( rtid_inter & 256 ) ? 256 : 0 )));
-
-        if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ; // disable the step when it would reach outside the block
-
-        BLOCK_SCAN_STEP(tdata_inter,n,8)
-        BLOCK_SCAN_STEP(tdata_inter,n,7)
-        BLOCK_SCAN_STEP(tdata_inter,n,6)
-        BLOCK_SCAN_STEP(tdata_inter,n,5)
-      }
-    }
-  }
-
-  __syncthreads(); // Wait for inter-warp reduce-scan to complete
-
-  if ( DoScan ) {
-    int n = ( rtid_intra &  1 ) ?  1 : ( // lowest set bit of rtid_intra among 1..16, or 0 if none
-            ( rtid_intra &  2 ) ?  2 : (
-            ( rtid_intra &  4 ) ?  4 : (
-            ( rtid_intra &  8 ) ?  8 : (
-            ( rtid_intra & 16 ) ? 16 : 0 ))));
-
-    if ( ! ( rtid_intra + n < blockDim.y ) ) n = 0 ; // disable the step when it would reach outside the block
-
-    BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
-    BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
-    BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
-    BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
-    BLOCK_SCAN_STEP(tdata_intra,n,0)
-  }
-
-#undef BLOCK_SCAN_STEP
-#undef BLOCK_REDUCE_STEP
-}
-
-//----------------------------------------------------------------------------
-/**\brief  Input value-per-thread starting at 'shared_data'.
- *         Reduction value at last thread's location.
- *
- *  If 'DoScan' then write blocks' scan values and block-groups' scan values.
- *
- *  Global reduce result is in the last thread's 'shared_data' location.
- */
-template< bool DoScan , class FunctorType , class ArgTag >
-__device__
-bool cuda_single_inter_block_reduce_scan( const FunctorType     & functor ,
-                                          const Cuda::size_type   block_id ,
-                                          const Cuda::size_type   block_count ,
-                                          Cuda::size_type * const shared_data ,
-                                          Cuda::size_type * const global_data ,
-                                          Cuda::size_type * const global_flags ) // returns true only in the block that contributed last
-{
-  typedef Cuda::size_type                  size_type ;
-  typedef FunctorValueTraits< FunctorType , ArgTag >  ValueTraits ;
-  typedef FunctorValueJoin<   FunctorType , ArgTag >  ValueJoin ;
-  typedef FunctorValueInit<   FunctorType , ArgTag >  ValueInit ;
-  typedef FunctorValueOps<    FunctorType , ArgTag >  ValueOps ;
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-  const unsigned BlockSizeMask  = blockDim.y - 1 ; // also the index of the last thread
-  const unsigned BlockSizeShift = power_of_two_if_valid( blockDim.y ); // used below as a right-shift amount; presumably log2(blockDim.y) - confirm
-
-  // Must have power of two thread count
-  if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_single_inter_block_reduce_scan requires power-of-two blockDim"); }
-
-  const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
-    word_count( ValueTraits::value_size( functor ) / sizeof(size_type) ); // size of one value measured in size_type words
-
-  // Reduce the accumulation for the entire block.
-  cuda_intra_block_reduce_scan<false,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
-
-  {
-    // Write accumulation total to global scratch space.
-    // Accumulation total is the last thread's data.
-    size_type * const shared = shared_data + word_count.value * BlockSizeMask ; // last thread's slot
-    size_type * const global = global_data + word_count.value * block_id ; // this block's slot in global scratch
-
-    for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; } // word-wise copy spread over the block's threads
-  }
-
-  // Contributing blocks note that their contribution has been completed via an atomic-increment flag
-  // If this block is not the last block to contribute to this group then the block is done.
-  const bool is_last_block = // NOTE(review): thread 0 issues atomicInc while evaluating the barrier argument; confirm the other threads' global[i] stores are visible to other blocks by then
-    ! __syncthreads_or( threadIdx.y ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) ); // atomicInc returns the old count and wraps to 0 once it reaches block_count - 1
-
-  if ( is_last_block ) {
-
-    const size_type b = ( long(block_count) * long(threadIdx.y) ) >> BlockSizeShift ; // first block index assigned to this thread
-    const size_type e = ( long(block_count) * long( threadIdx.y + 1 ) ) >> BlockSizeShift ; // one past the last block index assigned to this thread
-
-    {
-      void * const shared_ptr = shared_data + word_count.value * threadIdx.y ; // this thread's slot
-      reference_type shared_value = ValueInit::init( functor , shared_ptr );
-
-      for ( size_type i = b ; i < e ; ++i ) { // fold the assigned blocks' totals into this thread's slot
-        ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
-      }
-    }
-
-    cuda_intra_block_reduce_scan<DoScan,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
-
-    if ( DoScan ) {
-
-      size_type * const shared_value = shared_data + word_count.value * ( threadIdx.y ? threadIdx.y - 1 : blockDim.y ); // previous thread's slot; thread 0 uses the extra slot at index blockDim.y
-
-      if ( ! threadIdx.y ) { ValueInit::init( functor , shared_value ); } // thread 0 has no predecessor, so its extra slot is initialized by the functor
-
-      // Join previous inclusive scan value to each member
-      for ( size_type i = b ; i < e ; ++i ) {
-        size_type * const global_value = global_data + word_count.value * i ;
-        ValueJoin::join( functor , shared_value , global_value );
-        ValueOps ::copy( functor , global_value , shared_value );
-      }
-    }
-  }
-
-  return is_last_block ;
-}
-
-// Size in bytes required for inter block reduce or scan
-template< bool DoScan , class FunctorType , class ArgTag >
-inline
-unsigned cuda_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize )
-{
-  typedef Impl::FunctorValueTraits< FunctorType , ArgTag > ValueTraits ; // value_size() gives the byte size of one slot
-  return ValueTraits::value_size( functor ) * ( BlockSize + 2 ); } // BlockSize slots plus two extra
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #if defined( __CUDACC__ ) */
-#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */
-
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
@ -1,291 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-#ifndef KOKKOS_CUDA_VECTORIZATION_HPP
-#define KOKKOS_CUDA_VECTORIZATION_HPP
-#include <Kokkos_Cuda.hpp>
-
-namespace Kokkos {
-
-
-// Shuffle only makes sense on >= Kepler GPUs; it doesn't work on CPUs
-// or other GPUs.  We provide a generic definition (which is trivial
-// and doesn't do what it claims to do) because we don't actually use
-// this function unless we are on a suitable GPU, with a suitable
-// Scalar type.  (For example, in the mat-vec, the "ThreadsPerRow"
-// internal parameter depends both on the ExecutionSpace and the Scalar type,
-// and it controls whether shfl_down() gets called.)
-namespace Impl {
-
-  template< typename Scalar >
-  struct shfl_union { // float-array staging buffer that lets a Scalar be handled in float-sized pieces
-    enum {n = sizeof(Scalar)/4}; // number of whole 4-byte chunks; integer division drops any remainder
-    float fval[n];
-    KOKKOS_INLINE_FUNCTION
-    Scalar value() { // reinterprets the stored floats as a Scalar
-      return *(Scalar*) fval;
-    }
-    KOKKOS_INLINE_FUNCTION
-    void operator= (Scalar& value) { // copies 'value' into fval one float at a time
-      float* const val_ptr = (float*) &value;
-      for(int i=0; i<n ; i++) {
-        fval[i] = val_ptr[i];
-      }
-    }
-    KOKKOS_INLINE_FUNCTION
-    void operator= (const Scalar& value) { // same copy for a const source; the cast drops the const qualifier
-      float* const val_ptr = (float*) &value;
-      for(int i=0; i<n ; i++) {
-        fval[i] = val_ptr[i];
-      }
-    }
-
-  };
-}
-
-#ifdef __CUDA_ARCH__
-  #if (__CUDA_ARCH__ >= 300)
-
-    KOKKOS_INLINE_FUNCTION
-    int shfl(const int &val, const int& srcLane, const int& width ) { // int: forwarded straight to __shfl
-      return __shfl(val,srcLane,width); // NOTE(review): mask-less __shfl; Volta and newer require the *_sync variants
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    float shfl(const float &val, const int& srcLane, const int& width ) { // float: forwarded straight to __shfl
-      return __shfl(val,srcLane,width);
-    }
-
-    template<typename Scalar>
-    KOKKOS_INLINE_FUNCTION
-    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width
-        ) { // 4-byte Scalar: shuffled as a float bit pattern
-      Scalar tmp1 = val; // local copy so a non-const pointer can be taken
-      float tmp = *reinterpret_cast<float*>(&tmp1);
-      tmp = __shfl(tmp,srcLane,width);
-      return *reinterpret_cast<Scalar*>(&tmp);
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    double shfl(const double &val, const int& srcLane, const int& width) { // double: split into two 32-bit halves, each shuffled separately
-      int lo = __double2loint(val);
-      int hi = __double2hiint(val);
-      lo = __shfl(lo,srcLane,width);
-      hi = __shfl(hi,srcLane,width);
-      return __hiloint2double(hi,lo);
-    }
-
-    template<typename Scalar>
-    KOKKOS_INLINE_FUNCTION
-    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) { // 8-byte Scalar: bits treated as a double and handled like the double overload
-      int lo = __double2loint(*reinterpret_cast<const double*>(&val));
-      int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
-      lo = __shfl(lo,srcLane,width);
-      hi = __shfl(hi,srcLane,width);
-      const double tmp = __hiloint2double(hi,lo);
-      return *(reinterpret_cast<const Scalar*>(&tmp));
-    }
-
-    template<typename Scalar>
-    KOKKOS_INLINE_FUNCTION
-    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) { // larger Scalar: one __shfl per float-sized chunk
-      Impl::shfl_union<Scalar> s_val; // source chunks
-      Impl::shfl_union<Scalar> r_val; // received chunks
-      s_val = val;
-
-      for(int i = 0; i<s_val.n; i++)
-        r_val.fval[i] = __shfl(s_val.fval[i],srcLane,width);
-      return r_val.value();
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    int shfl_down(const int &val, const int& delta, const int& width) { // int: forwarded straight to __shfl_down
-      return __shfl_down(val,delta,width); // NOTE(review): mask-less __shfl_down; Volta and newer require the *_sync variants
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    float shfl_down(const float &val, const int& delta, const int& width) { // float: forwarded straight to __shfl_down
-      return __shfl_down(val,delta,width);
-    }
-
-    template<typename Scalar>
-    KOKKOS_INLINE_FUNCTION
-    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) { // 4-byte Scalar: shuffled as a float bit pattern
-      Scalar tmp1 = val; // local copy so a non-const pointer can be taken
-      float tmp = *reinterpret_cast<float*>(&tmp1);
-      tmp = __shfl_down(tmp,delta,width);
-      return *reinterpret_cast<Scalar*>(&tmp);
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    double shfl_down(const double &val, const int& delta, const int& width) { // double: split into two 32-bit halves, each shuffled separately
-      int lo = __double2loint(val);
-      int hi = __double2hiint(val);
-      lo = __shfl_down(lo,delta,width);
-      hi = __shfl_down(hi,delta,width);
-      return __hiloint2double(hi,lo);
-    }
-
-    template<typename Scalar>
-    KOKKOS_INLINE_FUNCTION
-    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) { // 8-byte Scalar: bits treated as a double and handled like the double overload
-      int lo = __double2loint(*reinterpret_cast<const double*>(&val));
-      int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
-      lo = __shfl_down(lo,delta,width);
-      hi = __shfl_down(hi,delta,width);
-      const double tmp = __hiloint2double(hi,lo);
-      return *(reinterpret_cast<const Scalar*>(&tmp));
-    }
-
-    template<typename Scalar>
-    KOKKOS_INLINE_FUNCTION
-    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) { // larger Scalar: one __shfl_down per float-sized chunk
-      Impl::shfl_union<Scalar> s_val; // source chunks
-      Impl::shfl_union<Scalar> r_val; // received chunks
-      s_val = val;
-
-      for(int i = 0; i<s_val.n; i++)
-        r_val.fval[i] = __shfl_down(s_val.fval[i],delta,width);
-      return r_val.value();
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    int shfl_up(const int &val, const int& delta, const int& width ) { // int: forwarded straight to __shfl_up
-      return __shfl_up(val,delta,width); // NOTE(review): mask-less __shfl_up; Volta and newer require the *_sync variants
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    float shfl_up(const float &val, const int& delta, const int& width ) { // float: forwarded straight to __shfl_up
-      return __shfl_up(val,delta,width);
-    }
-
-    template<typename Scalar>
-    KOKKOS_INLINE_FUNCTION
-    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) { // 4-byte Scalar: shuffled as a float bit pattern
-      Scalar tmp1 = val; // local copy so a non-const pointer can be taken
-      float tmp = *reinterpret_cast<float*>(&tmp1);
-      tmp = __shfl_up(tmp,delta,width);
-      return *reinterpret_cast<Scalar*>(&tmp);
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    double shfl_up(const double &val, const int& delta, const int& width ) { // double: split into two 32-bit halves, each shuffled separately
-      int lo = __double2loint(val);
-      int hi = __double2hiint(val);
-      lo = __shfl_up(lo,delta,width);
-      hi = __shfl_up(hi,delta,width);
-      return __hiloint2double(hi,lo);
-    }
-
-    template<typename Scalar>
-    KOKKOS_INLINE_FUNCTION
-    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) { // 8-byte Scalar: bits treated as a double and handled like the double overload
-      int lo = __double2loint(*reinterpret_cast<const double*>(&val));
-      int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
-      lo = __shfl_up(lo,delta,width);
-      hi = __shfl_up(hi,delta,width);
-      const double tmp = __hiloint2double(hi,lo);
-      return *(reinterpret_cast<const Scalar*>(&tmp));
-    }
-
-    template<typename Scalar>
-    KOKKOS_INLINE_FUNCTION
-    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) { // larger Scalar: one __shfl_up per float-sized chunk
-      Impl::shfl_union<Scalar> s_val; // source chunks
-      Impl::shfl_union<Scalar> r_val; // received chunks
-      s_val = val;
-
-      for(int i = 0; i<s_val.n; i++)
-        r_val.fval[i] = __shfl_up(s_val.fval[i],delta,width);
-      return r_val.value();
-    }
-
-  #else
-    template<typename Scalar>
-    KOKKOS_INLINE_FUNCTION
-    Scalar shfl(const Scalar &val, const int& srcLane, const int& width) { // stand-in without shuffle support: hands val back unchanged
-      if(1 < width) { Kokkos::abort("Error: calling shfl from a device with CC<3.0."); } // any width above 1 would need a real shuffle
-      return val;
-    }
-
-    template<typename Scalar>
-    KOKKOS_INLINE_FUNCTION
-    Scalar shfl_down(const Scalar &val, const int& delta, const int& width) { // stand-in without shuffle support: hands val back unchanged
-      if(1 < width) { Kokkos::abort("Error: calling shfl_down from a device with CC<3.0."); } // any width above 1 would need a real shuffle
-      return val;
-    }
-
-    template<typename Scalar>
-    KOKKOS_INLINE_FUNCTION
-    Scalar shfl_up(const Scalar &val, const int& delta, const int& width) { // stand-in without shuffle support: hands val back unchanged
-      if(width > 1) Kokkos::abort("Error: calling shfl_up from a device with CC<3.0."); // message now names this function rather than shfl_down
-      return val;
-    }
-  #endif
-#else
-    template<typename Scalar>
-    inline
-    Scalar shfl(const Scalar &val, const int& srcLane, const int& width) { // stand-in used when __CUDA_ARCH__ is not defined: hands val back unchanged
-      if(1 < width) { Kokkos::abort("Error: calling shfl from a device with CC<3.0."); } // any width above 1 would need a real shuffle
-      return val;
-    }
-
-    template<typename Scalar>
-    inline
-    Scalar shfl_down(const Scalar &val, const int& delta, const int& width) { // stand-in used when __CUDA_ARCH__ is not defined: hands val back unchanged
-      if(1 < width) { Kokkos::abort("Error: calling shfl_down from a device with CC<3.0."); } // any width above 1 would need a real shuffle
-      return val;
-    }
-
-    template<typename Scalar>
-    inline
-    Scalar shfl_up(const Scalar &val, const int& delta, const int& width) { // stand-in used when __CUDA_ARCH__ is not defined: hands val back unchanged
-      if(width > 1) Kokkos::abort("Error: calling shfl_up from a device with CC<3.0."); // message now names this function rather than shfl_down
-      return val;
-    }
-#endif
-
-
-
-}
-
-#endif
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
@ -1,299 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CUDA_VIEW_HPP
-#define KOKKOS_CUDA_VIEW_HPP
-
-#include <cstring>
-
-#include <Kokkos_HostSpace.hpp>
-#include <Kokkos_CudaSpace.hpp>
-#include <Kokkos_CudaTypes.hpp>
-#include <Kokkos_View.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Bounds-violation handler specialized for CudaSpace.
- *
- *  Every shape and index argument is ignored (all parameters are unnamed);
- *  the handler only calls Kokkos::abort with a fixed message.
- */
-template<>
-struct AssertShapeBoundsAbort< CudaSpace >
-{
-  KOKKOS_INLINE_FUNCTION
-  static void apply( const size_t /* rank */
-                   , const size_t /* n0 */
-                   , const size_t /* n1 */
-                   , const size_t /* n2 */
-                   , const size_t /* n3 */
-                   , const size_t /* n4 */
-                   , const size_t /* n5 */
-                   , const size_t /* n6 */
-                   , const size_t /* n7 */
-
-                   , const size_t /* arg_rank */
-                   , const size_t /* i0 */
-                   , const size_t /* i1 */
-                   , const size_t /* i2 */
-                   , const size_t /* i3 */
-                   , const size_t /* i4 */
-                   , const size_t /* i5 */
-                   , const size_t /* i6 */
-                   , const size_t /* i7 */
-                   )
-  {
-    Kokkos::abort("Kokkos::View array bounds violation");
-  }
-};
-
-}
-}
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t'
-// to be an 'unsigned long long'.  This chould change with
-// future version of Cuda and this typedef would have to
-// change accordingly.
-
-#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
-
-typedef enable_if<
-  sizeof(::cudaTextureObject_t) == sizeof(const void *) ,
-  ::cudaTextureObject_t >::type cuda_texture_object_type ;
-
-#else
-
-typedef const void * cuda_texture_object_type ;
-
-#endif
-
-//----------------------------------------------------------------------------
-// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4).
-// Via reinterpret_cast this can be used to support all scalar types of those sizes.
-// Any other scalar type falls back to either normal reads out of global memory,
-// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
-
-/** \brief  Read-only element access through a Cuda texture object.
- *
- *  \tparam ValueType    Element type returned by operator[].
- *  \tparam MemorySpace  Must provide a static texture_object_attach(), which
- *                       is called to fill in the texture object, the
- *                       allocation pointer and the offset.
- *  \tparam AliasType    Type used for the texture fetch; defaults to int,
- *                       int2 or int4 for a 4, 8 or 16 byte ValueType and to
- *                       void for every other size.
- */
-template< typename ValueType
-        , class MemorySpace
-        , class AliasType =
-            typename Kokkos::Impl::if_c< ( sizeof(ValueType) ==  4 ) , int ,
-            typename Kokkos::Impl::if_c< ( sizeof(ValueType) ==  8 ) , int2 ,
-            typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 16 ) , int4 , void
-            >::type
-            >::type
-            >::type
-        >
-class CudaTextureFetch {
-private:
-
-  cuda_texture_object_type  m_obj ;       // texture object handle
-  const ValueType         * m_alloc_ptr ; // pointer set by texture_object_attach
-  int                       m_offset ;    // element offset added to every index
-
-public:
-
-  /** \brief  Construct an unattached handle: all members are zero. */
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch() : m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0) {}
-
-  KOKKOS_INLINE_FUNCTION
-  ~CudaTextureFetch() {}
-
-  /** \brief  Member-wise copy. */
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch( const CudaTextureFetch & rhs )
-    : m_obj(       rhs.m_obj )
-    , m_alloc_ptr( rhs.m_alloc_ptr )
-    , m_offset(    rhs.m_offset )
-    {}
-
-  /** \brief  Member-wise assignment. */
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
-    {
-      m_obj       = rhs.m_obj ;
-      m_alloc_ptr = rhs.m_alloc_ptr ;
-      m_offset    = rhs.m_offset ;
-      return *this ;
-    }
-
-
-  /** \brief  Attach to the memory at arg_ptr.
-   *
-   *  The attach call is only compiled for host code under nvcc
-   *  ( __CUDACC__ defined, __CUDA_ARCH__ not defined ); in every other
-   *  compilation pass the members keep their zero initial values.
-   */
-  KOKKOS_INLINE_FUNCTION explicit
-  CudaTextureFetch( const ValueType * const arg_ptr )
-    : m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
-    {
-#if defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
-      MemorySpace::texture_object_attach( arg_ptr
-                                        , sizeof(ValueType)
-                                        , cudaCreateChannelDesc< AliasType >()
-                                        , & m_obj
-                                        , reinterpret_cast<const void **>( & m_alloc_ptr )
-                                        , & m_offset
-                                        );
-#endif
-    }
-
-  /** \brief  Re-attach to the memory at arg_ptr.
-   *
-   *  As with the constructor, the attach call is only compiled for host
-   *  code under nvcc; otherwise the members are left unchanged.
-   */
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch & operator = ( const ValueType * arg_ptr )
-    {
-#if defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
-      MemorySpace::texture_object_attach( arg_ptr
-                                        , sizeof(ValueType)
-                                        , cudaCreateChannelDesc< AliasType >()
-                                        , & m_obj
-                                        , reinterpret_cast<const void **>( & m_alloc_ptr )
-                                        , & m_offset
-                                        );
-#endif
-      return *this ;
-    }
-
-
-  /** \brief  Pointer to the first accessible element ( m_alloc_ptr + m_offset ). */
-  KOKKOS_INLINE_FUNCTION
-  operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
-
-
-  /** \brief  Return element i by value.
-   *
-   *  Device code for compute capability >= 3.0 fetches through the texture
-   *  object; all other code reads m_alloc_ptr directly.
-   */
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator[]( const iType & i ) const
-    {
-#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
-#if defined( KOKKOS_USE_LDG_INTRINSIC ) && ( 350 <= __CUDA_ARCH__ )
-      // Enable the usage of the __ldg intrinsic even in cases where texture fetches work.
-      // Currently texture fetches are faster, but that might change in the future.
-      // __ldg requires compute capability 3.5, hence the additional guard;
-      // below that the texture fetch path is used.
-      return __ldg( & m_alloc_ptr[i+m_offset] );
-#else /* ! defined( KOKKOS_USE_LDG_INTRINSIC ) || ( __CUDA_ARCH__ < 350 ) */
-      AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
-
-      // Reinterpret the fetched AliasType bits as the requested ValueType.
-      return  *(reinterpret_cast<ValueType*> (&v));
-#endif /* KOKKOS_USE_LDG_INTRINSIC */
-#else  /* ! defined( __CUDA_ARCH__ ) || ( __CUDA_ARCH__ < 300 ) */
-      return m_alloc_ptr[ i + m_offset ];
-#endif
-  }
-};
-
-/** \brief  Specialization that reads through a plain pointer instead of a
- *          texture object.
- *
- *  NOTE(review): the argument list < const ValueType , void > binds 'void'
- *  to the second parameter of the primary template (MemorySpace);
- *  presumably the third parameter (AliasType) was meant -- confirm.
- */
-template< typename ValueType >
-class CudaTextureFetch< const ValueType, void >
-{
-private:
-  const ValueType * m_ptr ;
-public:
-
-  /** \brief  Construct with a null pointer. */
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch() : m_ptr(0) {}
-
-  KOKKOS_INLINE_FUNCTION
-  ~CudaTextureFetch() {
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch( const CudaTextureFetch & rhs ) : m_ptr(rhs.m_ptr) {}
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) {
-    m_ptr = rhs.m_ptr;
-    return *this ;
-  }
-
-  /** \brief  Wrap base_view_ptr.
-   *
-   *  Accepts a pointer to const, matching the assignment operator below;
-   *  callers passing a non-const pointer still compile.
-   */
-  explicit KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch( const ValueType * const base_view_ptr )
-    : m_ptr( base_view_ptr )
-    {}
-
-  /** \brief  Rebind to base_view_ptr. */
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch & operator = (const ValueType* base_view_ptr) {
-    m_ptr = base_view_ptr;
-    return *this;
-  }
-
-
-  /** \brief  The wrapped pointer. */
-  KOKKOS_INLINE_FUNCTION
-  operator const ValueType * () const { return m_ptr ; }
-
-
-  /** \brief  Return element i by value.
-   *
-   *  Device code for compute capability >= 3.5 loads through __ldg
-   *  (the intrinsic is not available below 3.5); all other code reads
-   *  the pointer directly.
-   */
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator[]( const iType & i ) const
-  {
-  #if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
-    return __ldg(&m_ptr[i]);
-  #else
-    return m_ptr[ i ];
-  #endif
-  }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  ViewDataHandle specialization that selects the Cuda texture fetch
- *          handle.
- *
- *  Enabled when all of the following hold:
- *    - the memory space is CudaSpace or CudaUVMSpace,
- *    - the value type equals its const-qualified form (i.e. it is 'const'),
- *    - the memory traits request RandomAccess.
- *
- *  Elements are then returned by value rather than by reference.
- */
-template< class ViewTraits >
-class ViewDataHandle
-  < ViewTraits
-  , typename enable_if<
-      (  is_same< typename ViewTraits::memory_space , CudaSpace    >::value
-      || is_same< typename ViewTraits::memory_space , CudaUVMSpace >::value
-      )
-      && is_same< typename ViewTraits::const_value_type
-                , typename ViewTraits::value_type >::value
-      && ViewTraits::memory_traits::RandomAccess
-    >::type
-  >
-{
-public:
-
-  enum { ReturnTypeIsReference = false };
-
-  typedef typename ViewTraits::value_type  return_type ;
-
-  typedef Impl::CudaTextureFetch< typename ViewTraits::value_type
-                                , typename ViewTraits::memory_space
-                                > handle_type ;
-};
-
-}
-}
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */
-
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
@ -1,117 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CUDA_ABORT_HPP
-#define KOKKOS_CUDA_ABORT_HPP
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ )
-
-#include <cuda.h>
-
-#if ! defined( CUDA_VERSION ) || ( CUDA_VERSION < 4010 )
-#error "Cuda version 4.1 or greater required"
-#endif
-
-#if ( __CUDA_ARCH__ < 200 )
-#error "Cuda device capability 2.0 or greater required"
-#endif
-
-extern "C" {
-/*  Declaration of the Cuda runtime's assertion-failure entry point
- *  (declared in <crt/device_runtime.h>).
- *  Requires capability 2.x or better.
- */
-extern __device__ void __assertfail( const void  * message
-                                   , const void  * file
-                                   , unsigned int  line
-                                   , const void  * function
-                                   , size_t        charsize );
-}
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Abort from device code by calling __assertfail with 'message'.
- *
- *  File and function are passed as empty strings and the line as zero;
- *  the character size argument is sizeof(char).
- */
-__device__ inline
-void cuda_abort( const char * const message )
-{
-  const char no_text[] = "" ;
-
-  __assertfail( static_cast<const void *>( message )
-              , static_cast<const void *>( no_text )   // file
-              , 0u                                     // line
-              , static_cast<const void *>( no_text )   // function
-              , sizeof(char)
-              );
-}
-
-} // namespace Impl
-} // namespace Kokkos
-
-#else
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  No-op stand-in used when not compiling Cuda device code
- *          ( __CUDACC__ and __CUDA_ARCH__ not both defined ).
- *          The message is ignored.
- */
-KOKKOS_INLINE_FUNCTION
-void cuda_abort( const char * const /* message */ )
-{
-}
-
-} // namespace Impl
-} // namespace Kokkos
-
-#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
-namespace Kokkos {
-
-/** \brief  Device-side Kokkos::abort: forwards 'message' to Impl::cuda_abort. */
-__device__ inline
-void abort( const char * const message )
-{
-  Kokkos::Impl::cuda_abort( message );
-}
-
-} // namespace Kokkos
-#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */
-
--- a/lib/kokkos/core/src/Kokkos_Atomic.hpp
+++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp
@ -1,236 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-/// \file Kokkos_Atomic.hpp
-/// \brief Atomic functions
-///
-/// This header file defines prototypes for the following atomic functions:
-///   - exchange
-///   - compare and exchange
-///   - add
-///
-/// Supported types include:
-///   - signed and unsigned 4 and 8 byte integers
-///   - float
-///   - double
-///
-/// They are implemented through GCC compatible intrinsics, OpenMP
-/// directives and native CUDA intrinsics.
-///
-/// Including this header file requires one of the following
-/// compilers:
-///   - NVCC (for CUDA device code only)
-///   - GCC (for host code only)
-///   - Intel (for host code only)
-///   - A compiler that supports OpenMP 3.1 (for host code only)
-
-#ifndef KOKKOS_ATOMIC_HPP
-#define KOKKOS_ATOMIC_HPP
-
-#include <Kokkos_Macros.hpp>
-#include <impl/Kokkos_Traits.hpp>
-
-//----------------------------------------------------------------------------
-
-#if defined( __CUDA_ARCH__ )
-
-// Compiling NVIDIA device code, must use Cuda atomics:
-
-#define KOKKOS_ATOMICS_USE_CUDA
-
-#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \
-      ! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
-      ! defined( KOKKOS_ATOMICS_USE_OMP31 )
-
-// Compiling for non-Cuda atomic implementation has not been pre-selected.
-// Choose the best implementation for the detected compiler.
-// Preference: GCC, INTEL, OMP31
-
-#if defined( KOKKOS_COMPILER_GNU ) || \
-    defined( KOKKOS_COMPILER_CLANG )
-
-#define KOKKOS_ATOMICS_USE_GCC
-
-#elif defined( KOKKOS_COMPILER_INTEL ) || \
-      defined( KOKKOS_COMPILER_CRAYC )
-
-#define KOKKOS_ATOMICS_USE_INTEL
-
-#elif defined( _OPENMP ) && ( 201107 <= _OPENMP )
-
-#define KOKKOS_ATOMICS_USE_OMP31
-
-#else
-
-#error "KOKKOS_ATOMICS_USE : Unsupported compiler"
-
-#endif
-
-#endif /* Not pre-selected atomic implementation */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-// Forward declarations only; no definitions appear in this header itself.
-
-// Atomic add
-template< typename T >
-KOKKOS_INLINE_FUNCTION
-void atomic_add( volatile T * const dest , const T src );
-
-// Atomic increment
-template< typename T >
-KOKKOS_INLINE_FUNCTION
-void atomic_increment( volatile T * a );
-
-// Atomic decrement
-template< typename T >
-KOKKOS_INLINE_FUNCTION
-void atomic_decrement( volatile T * a );
-
-} // namespace Kokkos
-
-
-#include<impl/Kokkos_Atomic_Assembly_X86.hpp>
-
-namespace Kokkos {
-
-/// \brief Report which atomic implementation macro is active.
-///
-/// Returns, as a string literal, the name of the first defined macro among
-/// KOKKOS_ATOMICS_USE_CUDA, KOKKOS_ATOMICS_USE_GCC,
-/// KOKKOS_ATOMICS_USE_INTEL and KOKKOS_ATOMICS_USE_OMP31.
-inline
-const char * atomic_query_version()
-{
-#if   defined( KOKKOS_ATOMICS_USE_CUDA )
-  return "KOKKOS_ATOMICS_USE_CUDA" ;
-#elif defined( KOKKOS_ATOMICS_USE_GCC )
-  return "KOKKOS_ATOMICS_USE_GCC" ;
-#elif defined( KOKKOS_ATOMICS_USE_INTEL )
-  return "KOKKOS_ATOMICS_USE_INTEL" ;
-#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
-  return "KOKKOS_ATOMICS_USE_OMP31" ;
-#endif
-}
-
-} // namespace Kokkos
-
-//#include "impl/Kokkos_Atomic_Assembly_X86.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic exchange
-//
-// template< typename T >
-// T atomic_exchange( volatile T* const dest , const T val )
-// { T tmp = *dest ; *dest = val ; return tmp ; }
-
-#include "impl/Kokkos_Atomic_Exchange.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic compare-and-exchange
-//
-// template<class T>
-// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
-// { bool equal = compare == *dest ; if ( equal ) { *dest = val ; } return equal ; }
-
-#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic fetch and add
-//
-// template<class T>
-// T atomic_fetch_add(volatile T* const dest, const T val)
-// { T tmp = *dest ; *dest += val ; return tmp ; }
-
-#include "impl/Kokkos_Atomic_Fetch_Add.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic fetch and or
-//
-// template<class T>
-// T atomic_fetch_or(volatile T* const dest, const T val)
-// { T tmp = *dest ; *dest = tmp | val ; return tmp ; }
-
-#include "impl/Kokkos_Atomic_Fetch_Or.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic fetch and and
-//
-// template<class T>
-// T atomic_fetch_and(volatile T* const dest, const T val)
-// { T tmp = *dest ; *dest = tmp & val ; return tmp ; }
-
-#include "impl/Kokkos_Atomic_Fetch_And.hpp"
-
-//----------------------------------------------------------------------------
-// Memory fence
-//
-// All loads and stores from this thread will be globally consistent before continuing
-//
-// void memory_fence() {...};
-#include "impl/Kokkos_Memory_Fence.hpp"
-
-//----------------------------------------------------------------------------
-// Provide volatile_load and safe_load
-//
-// T volatile_load(T const volatile * const ptr);
-//
-// T const& safe_load(T const * const ptr);
-// XEON PHI
-// T safe_load(T const * const ptr);
-
-#include "impl/Kokkos_Volatile_Load.hpp"
-
-#include "impl/Kokkos_Atomic_Generic.hpp"
-
-//----------------------------------------------------------------------------
-// This atomic-style macro should be an inlined function, not a macro
-
-// With KOKKOS_COMPILER_GNU the prefetch macros map onto __builtin_prefetch
-// (second argument 0 for loads, 1 for stores; third argument 0 in both);
-// with any other compiler they expand to a no-op expression.
-#if defined( KOKKOS_COMPILER_GNU )
-
-  #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr)  __builtin_prefetch( (addr) , 0 , 0 )
-  #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch( (addr) , 1 , 0 )
-
-#else /* ! defined( KOKKOS_COMPILER_GNU ) */
-
-  #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr)  ((void)0)
-  #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0)
-
-#endif /* defined( KOKKOS_COMPILER_GNU ) */
-
-//----------------------------------------------------------------------------
-
-#endif /* KOKKOS_ATOMIC_HPP */
-
--- a/lib/kokkos/core/src/Kokkos_Core.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core.hpp
@ -1,106 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CORE_HPP
-#define KOKKOS_CORE_HPP
-
-//----------------------------------------------------------------------------
-// Include the execution space header files for the enabled execution spaces.
-
-#include <Kokkos_Core_fwd.hpp>
-
-#if defined( KOKKOS_HAVE_CUDA )
-#include <Kokkos_Cuda.hpp>
-#endif
-
-#if defined( KOKKOS_HAVE_OPENMP )
-#include <Kokkos_OpenMP.hpp>
-#endif
-
-#if defined( KOKKOS_HAVE_SERIAL )
-#include <Kokkos_Serial.hpp>
-#endif
-
-#if defined( KOKKOS_HAVE_PTHREAD )
-#include <Kokkos_Threads.hpp>
-#endif
-
-#include <Kokkos_Pair.hpp>
-#include <Kokkos_View.hpp>
-#include <Kokkos_Vectorization.hpp>
-#include <Kokkos_Atomic.hpp>
-#include <Kokkos_hwloc.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-/** \brief  Argument bundle accepted by Kokkos::initialize.
- *
- *  Every field is set to -1 by the default constructor.
- */
-struct InitArguments {
-  int num_threads;
-  int num_numa;
-  int device_id;
-
-  InitArguments()
-    : num_threads( -1 )
-    , num_numa( -1 )
-    , device_id( -1 )
-    {}
-};
-
-/** \brief  Initialize from an argument count and argument vector. */
-void initialize( int & narg , char * arg[] );
-
-/** \brief  Initialize from an InitArguments bundle (default-constructed if omitted). */
-void initialize( const InitArguments & args = InitArguments() );
-
-/** \brief  Finalize the spaces that were initialized via Kokkos::initialize */
-void finalize();
-
-/** \brief  Finalize all known execution spaces */
-void finalize_all();
-
-// Declared here; defined elsewhere.
-void fence();
-
-} // namespace Kokkos
-
-#endif
--- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
@ -1,150 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CORE_FWD_HPP
-#define KOKKOS_CORE_FWD_HPP
-
-//----------------------------------------------------------------------------
-// Kokkos_Macros.hpp does introspection on configuration options
-// and compiler environment then sets a collection of #define macros.
-
-#include <Kokkos_Macros.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-// Forward declarations for class inter-relationships
-
-namespace Kokkos {
-
-/// Memory space for main process and CPU execution spaces
-class HostSpace ;
-
-#if defined( KOKKOS_HAVE_SERIAL )
-/// Execution space main process on CPU
-class Serial ;
-#endif /* defined( KOKKOS_HAVE_SERIAL ) */
-
-#if defined( KOKKOS_HAVE_PTHREAD )
-/// Execution space with pthreads back-end
-class Threads ;
-#endif /* defined( KOKKOS_HAVE_PTHREAD ) */
-
-#if defined( KOKKOS_HAVE_OPENMP )
-/// OpenMP execution space
-class OpenMP ;
-#endif /* defined( KOKKOS_HAVE_OPENMP ) */
-
-#if defined( KOKKOS_HAVE_CUDA )
-/// Memory space on Cuda GPU
-class CudaSpace ;
-/// Memory space on Cuda GPU with UVM
-class CudaUVMSpace ;
-/// Memory space on Host accessible to Cuda GPU
-class CudaHostPinnedSpace ;
-/// Execution space for Cuda GPU
-class Cuda ;
-#endif /* defined( KOKKOS_HAVE_CUDA ) */
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-// Set the default execution space.
-
-/// Define Kokkos::DefaultExecutionSpace as per configuration option
-/// or chosen from the enabled execution spaces in the following order:
-/// Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
-
-namespace Kokkos {
-
-// Exactly one typedef is selected by the first defined
-// KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* macro, tested in the order
-// CUDA, OPENMP, THREADS, SERIAL.  Compilation stops if none is defined.
-#if   defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA )
-  typedef Kokkos::Cuda     DefaultExecutionSpace ;
-#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
-  typedef Kokkos::OpenMP   DefaultExecutionSpace ;
-#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
-  typedef Kokkos::Threads  DefaultExecutionSpace ;
-#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
-  typedef Kokkos::Serial   DefaultExecutionSpace ;
-#else
-#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
-#endif
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-// Detect the active execution space and define its memory space.
-// This is used to verify whether a running kernel can access
-// a given memory space.
-
-namespace Kokkos {
-namespace Impl {
-
-// Memory space of the execution space that is active for the current
-// compilation pass: CudaSpace, HostSpace, or void when neither
-// KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_* macro applies.
-#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
-typedef Kokkos::CudaSpace  ActiveExecutionMemorySpace ;
-#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-typedef Kokkos::HostSpace  ActiveExecutionMemorySpace ;
-#else
-typedef void  ActiveExecutionMemorySpace ;
-#endif
-
-/** \brief  General case: 'value' is 0 and no verify() member is provided,
- *          so a verify() call on differing spaces does not compile unless
- *          another specialization supplies one.
- */
-template< class ActiveSpace , class MemorySpace >
-struct VerifyExecutionCanAccessMemorySpace
-{
-  enum { value = 0 };
-};
-
-/** \brief  Identical spaces: 'value' is 1 and both verify() overloads
- *          are empty.
- */
-template< class Space >
-struct VerifyExecutionCanAccessMemorySpace< Space , Space >
-{
-  enum { value = 1 };
-
-  KOKKOS_INLINE_FUNCTION
-  static void verify() {}
-
-  KOKKOS_INLINE_FUNCTION
-  static void verify( const void * ) {}
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-// Expands to a verify( DATA_PTR ) call on
-// VerifyExecutionCanAccessMemorySpace< ActiveExecutionMemorySpace , DATA_SPACE >.
-#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \
-  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
-    < Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE > \
-    ::verify( DATA_PTR )
-
-// Same check without a data pointer: expands to the argument-free verify().
-#define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \
-  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
-    < Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE > \
-    ::verify()
-
-#endif /* #ifndef KOKKOS_CORE_FWD_HPP */
-
--- a/lib/kokkos/core/src/Kokkos_CrsArray.hpp
+++ b/lib/kokkos/core/src/Kokkos_CrsArray.hpp
@ -1,171 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CRSARRAY_HPP
-#define KOKKOS_CRSARRAY_HPP
-
-#include <string>
-#include <vector>
-
-#include <Kokkos_View.hpp>
-
-namespace Kokkos {
-
-/// \class CrsArray
-/// \brief Compressed row storage array.
-///
-/// \tparam DataType The type of stored entries.  If a CrsArray is
-///   used as the graph of a sparse matrix, then this is usually an
-///   integer type, the type of the column indices in the sparse
-///   matrix.
-///
-/// \tparam Arg1Type The second template parameter, corresponding
-///   either to the Space type (if there are no more template
-///   parameters) or to the Layout type (if there is at least one more
-///   template parameter).
-///
-/// \tparam Arg2Type The third template parameter, which if provided
-///   corresponds to the Space type.
-///
-/// \tparam SizeType The type of row offsets.  Usually the default
-///   parameter suffices.  However, setting a nondefault value is
-///   necessary in some cases, for example, if you want to have a
-///   sparse matrix with dimensions (and therefore column indices)
-///   that fit in \c int, but want to store more than <tt>INT_MAX</tt>
-///   entries in the sparse matrix.
-///
-/// A row has a range of entries:
-/// <ul>
-/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
-/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
-/// <li> <tt> entries( entry ,            i2 , i3 , ... ); </tt> </li>
-/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
-/// </ul>
-template< class DataType,
-          class Arg1Type,
-          class Arg2Type = void,
-          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type>
-class CrsArray {
-private:
-  //! Traits of a rank-one view of DataType; source of the layout and space typedefs below.
-  typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits ;
-
-public:
-  typedef DataType                                            data_type;
-  typedef typename traits::array_layout                       array_layout;
-  typedef typename traits::execution_space                    execution_space ;
-  typedef typename traits::memory_space                       memory_space ;
-  typedef SizeType                                            size_type;
-
-  //! This exact CrsArray type.
-  typedef CrsArray< DataType , Arg1Type , Arg2Type , SizeType > crsarray_type;
-  //! Same data type, layout and size type, placed in the traits' host mirror space.
-  typedef CrsArray< DataType , array_layout , typename traits::host_mirror_space , SizeType > HostMirror;
-  //! View of read-only row offsets.
-  typedef View< const size_type* , array_layout, execution_space >  row_map_type;
-  //! View of the stored entries.
-  typedef View<       DataType*  , array_layout, execution_space >  entries_type;
-
-  //! Stored entries of all rows; row i0 occupies [ row_map[i0] , row_map[i0+1] ).
-  entries_type entries;
-  //! Row offsets into \c entries.
-  row_map_type row_map;
-
-  //! Construct an empty array: both views are default constructed.
-  CrsArray () : entries(), row_map() {}
-
-  //! Copy constructor (shallow copy): copies the two views, not the data.
-  CrsArray (const CrsArray& rhs) : entries (rhs.entries), row_map (rhs.row_map)
-  {}
-
-  /** \brief  Assign to a view of the rhs array.
-   *          If the old view is the last view
-   *          then allocated memory is deallocated.
-   *
-   *  Implemented as member-wise assignment of the two views.
-   *  \return Reference to this object.
-   */
-  CrsArray& operator= (const CrsArray& rhs) {
-    entries = rhs.entries;
-    row_map = rhs.row_map;
-    return *this;
-  }
-
-  /**  \brief  Destroy this view of the array.
-   *           If the last view then allocated memory is deallocated.
-   *
-   *   The body is empty; cleanup is left to the member views' destructors.
-   */
-  ~CrsArray() {}
-};
-
-//----------------------------------------------------------------------------
-
-/** \brief  Create a CrsArray labeled \c label from a flat vector.
- *
- *  Declaration only; no definition is visible in this header.
- *  NOTE(review): \c input presumably holds the number of entries in
- *  each row -- confirm against impl/Kokkos_CrsArray_factory.hpp.
- *
- *  \return An object of type CrsArrayType::crsarray_type.
- */
-template< class CrsArrayType , class InputSizeType >
-typename CrsArrayType::crsarray_type
-create_crsarray( const std::string & label ,
-                 const std::vector< InputSizeType > & input );
-
-/** \brief  Create a CrsArray labeled \c label from a vector of vectors.
- *
- *  Declaration only; no definition is visible in this header.
- *  NOTE(review): each inner vector presumably holds the entries of one
- *  row -- confirm against impl/Kokkos_CrsArray_factory.hpp.
- *
- *  \return An object of type CrsArrayType::crsarray_type.
- */
-template< class CrsArrayType , class InputSizeType >
-typename CrsArrayType::crsarray_type
-create_crsarray( const std::string & label ,
-                 const std::vector< std::vector< InputSizeType > > & input );
-
-//----------------------------------------------------------------------------
-
-/** \brief  Obtain a HostMirror of the input CrsArray.
- *
- *  Declaration only; no definition is visible in this header.
- *  NOTE(review): how this differs from create_mirror (for example,
- *  whether it may return a view of the input itself) is not visible
- *  here -- confirm against the implementation.
- */
-template< class DataType ,
-          class Arg1Type ,
-          class Arg2Type ,
-          typename SizeType >
-typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
-create_mirror_view( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & input );
-
-/** \brief  Obtain a HostMirror of the input CrsArray.
- *
- *  Declaration only; no definition is visible in this header.
- */
-template< class DataType ,
-          class Arg1Type ,
-          class Arg2Type ,
-          typename SizeType >
-typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
-create_mirror( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & input );
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#include <impl/Kokkos_CrsArray_factory.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_CRSARRAY_HPP */
-
--- a/lib/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp
@ -1,263 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CUDA_HPP
-#define KOKKOS_CUDA_HPP
-
-#include <Kokkos_Core_fwd.hpp>
-
-// If CUDA execution space is enabled then use this header file.
-
-#if defined( KOKKOS_HAVE_CUDA )
-
-#include <iosfwd>
-#include <vector>
-
-#include <Kokkos_CudaSpace.hpp>
-
-#include <Kokkos_Parallel.hpp>
-#include <Kokkos_Layout.hpp>
-#include <Kokkos_ScratchSpace.hpp>
-#include <Kokkos_MemoryTraits.hpp>
-#include <impl/Kokkos_Tags.hpp>
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-// Forward declaration only; the class is not defined in this header.
-class CudaExec ;
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-
-/// \class Cuda
-/// \brief Kokkos Execution Space that uses CUDA to run on GPUs.
-///
-/// An "execution space" represents a parallel execution model.  It tells Kokkos
-/// how to parallelize the execution of kernels in a parallel_for or
-/// parallel_reduce.  For example, the Threads execution space uses Pthreads or
-/// C++11 threads on a CPU, the OpenMP execution space uses the OpenMP language
-/// extensions, and the Serial execution space executes "parallel" kernels
-/// sequentially.  The Cuda execution space uses NVIDIA's CUDA programming
-/// model to execute kernels in parallel on GPUs.
-class Cuda {
-public:
-  //! \name Type declarations that all Kokkos execution spaces must provide.
-  //@{
-
-  //! Tag this class as a kokkos execution space
-  typedef Cuda                  execution_space ;
-
-#if defined( KOKKOS_USE_CUDA_UVM )
-  //! This execution space's preferred memory space (UVM enabled: CudaUVMSpace).
-  typedef CudaUVMSpace          memory_space ;
-#else
-  //! This execution space's preferred memory space (UVM not enabled: CudaSpace).
-  typedef CudaSpace             memory_space ;
-#endif
-
-  //! The size_type best suited for this execution space (taken from memory_space).
-  typedef memory_space::size_type  size_type ;
-
-  //! This execution space's preferred array layout.
-  typedef LayoutLeft            array_layout ;
-
-  //! For backward compatibility
-  typedef Cuda                  device_type ;
-  //! Scratch memory space type associated with this execution space.
-  typedef ScratchMemorySpace< Cuda >  scratch_memory_space ;
-
-  //@}
-  //--------------------------------------------------
-  //! \name Functions that all Kokkos devices must implement.
-  //@{
-
-  /// \brief True if and only if this method is being called in a
-  ///   thread-parallel function.
-  ///
-  /// Decided at compile time: returns true when compiled with
-  /// __CUDA_ARCH__ defined, false otherwise.
-  KOKKOS_INLINE_FUNCTION static int in_parallel() {
-#if defined( __CUDA_ARCH__ )
-    return true;
-#else
-    return false;
-#endif
-  }
-
-  /** \brief  Set the device in a "sleep" state.
-   *
-   * This function sets the device in a "sleep" state in which it is
-   * not ready for work.  This may consume less resources than if the
-   * device were in an "awake" state, but it may also take time to
-   * bring the device from a sleep state to be ready for work.
-   *
-   * \return True if the device is in the "sleep" state, else false if
-   *   the device is actively working and could not enter the "sleep"
-   *   state.
-   */
-  static bool sleep();
-
-  /// \brief Wake the device from the 'sleep' state so it is ready for work.
-  ///
-  /// \return True if the device is in the "ready" state, else "false"
-  ///  if the device is actively working (which also means that it's
-  ///  awake).
-  static bool wake();
-
-  /// \brief Wait until all dispatched functors complete.
-  ///
-  /// The parallel_for or parallel_reduce dispatch of a functor may
-  /// return asynchronously, before the functor completes.  This
-  /// method does not return until all dispatched functors on this
-  /// device have completed.
-  static void fence();
-
-  //! Free any resources being consumed by the device.
-  static void finalize();
-
-  //! Has been initialized
-  static int is_initialized();
-
-  //! Print configuration information to the given output stream.
-  //! The \c detail flag defaults to false.
-  static void print_configuration( std::ostream & , const bool detail = false );
-
-  //@}
-  //--------------------------------------------------
-  //! \name  Cuda space instances
-
-  ~Cuda() {}
-  //! Default constructor; declared here, defined elsewhere.
-  Cuda();
-  //! Construct from an instance id; declared here, defined elsewhere.
-  explicit Cuda( const int instance_id );
-
-  // Copy assignment is disabled: deleted when KOKKOS_HAVE_CXX11 is
-  // defined, otherwise declared private.
-#if defined( KOKKOS_HAVE_CXX11 )
-  Cuda & operator = ( const Cuda & ) = delete ;
-#else
-private:
-  Cuda & operator = ( const Cuda & );
-public:
-#endif
-
-  //--------------------------------------------------------------------------
-  //! \name Device-specific functions
-  //@{
-
-  //! Wrapper for a CUDA device id; the default constructor selects id 0.
-  struct SelectDevice {
-    int cuda_device_id ;
-    SelectDevice() : cuda_device_id(0) {}
-    explicit SelectDevice( int id ) : cuda_device_id( id ) {}
-  };
-
-  //! Initialize, telling the CUDA run-time library which device to use.
-  //! Defaults: a default-constructed SelectDevice (device id 0) and one instance.
-  static void initialize( const SelectDevice = SelectDevice()
-                        , const size_t num_instances = 1 );
-
-  /// \brief Cuda device architecture of the selected device.
-  ///
-  /// This matches the __CUDA_ARCH__ specification.
-  static size_type device_arch();
-
-  //! Query device count.
-  static size_type detect_device_count();
-
-  /** \brief  Detect the available devices and their architecture
-   *          as defined by the __CUDA_ARCH__ specification.
-   */
-  static std::vector<unsigned> detect_device_arch();
-
-  //@}
-  //--------------------------------------------------------------------------
-
-  // Per-instance state; both members are const, so they can only be set
-  // by the constructors.
-  // NOTE(review): presumably the CUDA stream and device id this instance
-  // refers to -- confirm against the constructor definitions.
-  const cudaStream_t m_stream ;
-  const int          m_device ;
-};
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-/** Running in CudaSpace accessing the Cuda scratch memory space: ok,
- *  both verify() overloads are no-ops. */
-template<>
-struct VerifyExecutionCanAccessMemorySpace
-  < Kokkos::CudaSpace
-  , Kokkos::Cuda::scratch_memory_space
-  >
-{
-  enum { value = true };
-  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
-  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
-};
-
-/** Running in HostSpace attempting to access the Cuda scratch memory space:
- *  not allowed, verify() forwards to CudaSpace::access_error. */
-template<>
-struct VerifyExecutionCanAccessMemorySpace
-  < Kokkos::HostSpace
-  , Kokkos::Cuda::scratch_memory_space
-  >
-{
-  enum { value = false };
-  inline static void verify( void ) { CudaSpace::access_error(); }
-  inline static void verify( const void * p ) { CudaSpace::access_error(p); }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-#include <Cuda/Kokkos_CudaExec.hpp>
-#include <Cuda/Kokkos_Cuda_View.hpp>
-#include <Cuda/Kokkos_Cuda_Parallel.hpp>
-
-//----------------------------------------------------------------------------
-
-#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
-#endif /* #ifndef KOKKOS_CUDA_HPP */
-
-
-
--- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
@ -1,468 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CUDASPACE_HPP
-#define KOKKOS_CUDASPACE_HPP
-
-#if defined( KOKKOS_HAVE_CUDA )
-
-#include <iosfwd>
-#include <typeinfo>
-#include <string>
-
-#include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_HostSpace.hpp>
-#include <Cuda/Kokkos_Cuda_abort.hpp>
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-
-/** \brief  Cuda on-device memory management.
- *
- *  All members are static; only declarations appear in this header.
- */
-
-class CudaSpace {
-public:
-
-  //! Tag this class as a kokkos memory space
-  typedef CudaSpace             memory_space ;
-  //! Execution space associated with this memory space.
-  typedef Kokkos::Cuda          execution_space ;
-  typedef unsigned int          size_type ;
-
-  /** \brief  Allocate a contiguous block of memory on the Cuda device.
-   *
-   *  The input label is associated with the block of memory.
-   *  The block of memory is tracked via reference counting where
-   *  allocation gives it a reference count of one.
-   *
-   *  Allocation may only occur on the master thread of the process.
-   */
-  static void * allocate( const std::string & label , const size_t size );
-
-  /** \brief  Increment the reference count of the block of memory
-   *          in which the input pointer resides.
-   *
-   *          Reference counting only occurs on the master thread.
-   */
-  static void increment( const void * );
-
-  /** \brief  Decrement the reference count of the block of memory
-   *          in which the input pointer resides.  If the reference
-   *          count falls to zero the memory is deallocated.
-   *
-   *          Reference counting only occurs on the master thread.
-   */
-  static void decrement( const void * );
-
-  /** \brief  Get the reference count of the block of memory
-   *          in which the input pointer resides.  If the reference
-   *          count is zero the memory region is not tracked.
-   *
-   *          Reference counting only occurs on the master thread.
-   */
-  static int count( const void * );
-
-  /** \brief  Print all tracked memory to the output stream. */
-  static void print_memory_view( std::ostream & );
-
-  /** \brief  Retrieve label associated with the input pointer */
-  static std::string query_label( const void * );
-
-  /*--------------------------------*/
-  /** \brief  Cuda specific function to attach a texture object to an allocation.
-   *          Output the texture object, base pointer, and offset from the input pointer.
-   *
-   *          Only declared when compiling with __CUDACC__ defined.
-   */
-#if defined( __CUDACC__ )
-  static void texture_object_attach( const void            * const arg_ptr
-                                   , const unsigned                arg_type_size
-                                   , ::cudaChannelFormatDesc const & arg_desc
-                                   , ::cudaTextureObject_t * const arg_tex_obj
-                                   , void const           ** const arg_alloc_ptr
-                                   , int                   * const arg_offset
-                                   );
-#endif
-
-  /*--------------------------------*/
-  /** \brief  Error reporting for HostSpace attempt to access CudaSpace.
-   *          The second overload also receives the offending pointer.
-   */
-  static void access_error();
-  static void access_error( const void * const );
-};
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-
-/** \brief  Cuda memory that is accessible to Host execution space
- *          through Cuda's unified virtual memory (UVM) runtime.
- *
- *  All members are static; only declarations appear in this header.
- */
-class CudaUVMSpace {
-public:
-
-  //! Tag this class as a kokkos memory space
-  typedef CudaUVMSpace          memory_space ;
-  //! Execution space associated with this memory space.
-  typedef Cuda                  execution_space ;
-  typedef unsigned int          size_type ;
-
-  /** \brief  If UVM capability is available */
-  static bool available();
-
-  /** \brief  Allocate a contiguous block of memory on the Cuda device.
-   *
-   *  The input label is associated with the block of memory.
-   *  The block of memory is tracked via reference counting where
-   *  allocation gives it a reference count of one.
-   *
-   *  Allocation may only occur on the master thread of the process.
-   */
-  static void * allocate( const std::string & label , const size_t size );
-
-  /** \brief  Increment the reference count of the block of memory
-   *          in which the input pointer resides.
-   *
-   *          Reference counting only occurs on the master thread.
-   */
-  static void increment( const void * );
-
-  /** \brief  Decrement the reference count of the block of memory
-   *          in which the input pointer resides.  If the reference
-   *          count falls to zero the memory is deallocated.
-   *
-   *          Reference counting only occurs on the master thread.
-   */
-  static void decrement( const void * );
-
-  /** \brief  Get the reference count of the block of memory
-   *          in which the input pointer resides.  If the reference
-   *          count is zero the memory region is not tracked.
-   *
-   *          Reference counting only occurs on the master thread.
-   */
-  static int count( const void * );
-
-  /** \brief  Print all tracked memory to the output stream. */
-  static void print_memory_view( std::ostream & );
-
-  /** \brief  Retrieve label associated with the input pointer */
-  static std::string query_label( const void * );
-
-  /** \brief  Cuda specific function to attach a texture object to an allocation.
-   *          Output the texture object, base pointer, and offset from the input pointer.
-   *
-   *          Only declared when compiling with __CUDACC__ defined.
-   */
-#if defined( __CUDACC__ )
-  static void texture_object_attach( const void            * const arg_ptr
-                                   , const unsigned                arg_type_size
-                                   , ::cudaChannelFormatDesc const & arg_desc
-                                   , ::cudaTextureObject_t * const arg_tex_obj
-                                   , void const           ** const arg_alloc_ptr
-                                   , int                   * const arg_offset
-                                   );
-#endif
-};
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-
-/** \brief  Host memory that is accessible to Cuda execution space
- *          through Cuda's host-pinned memory allocation.
- *
- *  All members are static; only declarations appear in this header.
- */
-class CudaHostPinnedSpace {
-public:
-
-  //! Tag this class as a kokkos memory space
-  typedef CudaHostPinnedSpace         memory_space ;
-  typedef unsigned int                size_type ;
-
-  /** \brief  Memory is in HostSpace so use the HostSpace::execution_space */
-  typedef HostSpace::execution_space  execution_space ;
-
-  /** \brief  Allocate a contiguous block of memory in this (host-pinned) space.
-   *
-   *  The input label is associated with the block of memory.
-   *  The block of memory is tracked via reference counting where
-   *  allocation gives it a reference count of one.
-   *
-   *  Allocation may only occur on the master thread of the process.
-   */
-  static void * allocate( const std::string & label , const size_t size );
-
-  /** \brief  Increment the reference count of the block of memory
-   *          in which the input pointer resides.
-   *
-   *          Reference counting only occurs on the master thread.
-   */
-  static void increment( const void * );
-
-  /** \brief  Get the reference count of the block of memory
-   *          in which the input pointer resides.  If the reference
-   *          count is zero the memory region is not tracked.
-   *
-   *          Reference counting only occurs on the master thread.
-   */
-  static int count( const void * );
-
-  /** \brief  Decrement the reference count of the block of memory
-   *          in which the input pointer resides.  If the reference
-   *          count falls to zero the memory is deallocated.
-   *
-   *          Reference counting only occurs on the master thread.
-   */
-  static void decrement( const void * );
-
-  /** \brief  Print all tracked memory to the output stream. */
-  static void print_memory_view( std::ostream & );
-
-  /** \brief  Retrieve label associated with the input pointer */
-  static std::string query_label( const void * );
-};
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-/** DeepCopy specializations among CudaSpace and HostSpace.
- *
- *  Each declares two constructors taking ( dst , src , size ), the second
- *  with a leading Cuda execution space instance.  Only declarations appear
- *  here; the definitions are not in this header.
- *
- *  NOTE(review): the template arguments appear to be ordered
- *  < destination space , source space > and the size_t is presumably a
- *  byte count -- confirm against the definitions.
- */
-template<> struct DeepCopy< CudaSpace , CudaSpace >
-{
-  DeepCopy( void * dst , const void * src , size_t );
-  DeepCopy( const Cuda & , void * dst , const void * src , size_t );
-};
-
-template<> struct DeepCopy< CudaSpace , HostSpace >
-{
-  DeepCopy( void * dst , const void * src , size_t );
-  DeepCopy( const Cuda & , void * dst , const void * src , size_t );
-};
-
-template<> struct DeepCopy< HostSpace , CudaSpace >
-{
-  DeepCopy( void * dst , const void * src , size_t );
-  DeepCopy( const Cuda & , void * dst , const void * src , size_t );
-};
-
-// Forwarding DeepCopy specializations.
-//
-// Each constructor below builds a temporary DeepCopy for a simpler pair
-// of spaces and discards it with a (void) cast.  In the forwarded pair,
-// CudaUVMSpace is replaced by CudaSpace and CudaHostPinnedSpace by
-// HostSpace.
-// NOTE(review): the forwarded-to constructor presumably performs the copy
-// as a side effect of construction -- confirm against its definition.
-
-// Destination CudaSpace.
-template<> struct DeepCopy< CudaSpace , CudaUVMSpace >
-{
-  inline
-  DeepCopy( void * dst , const void * src , size_t n )
-  { (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); }
-};
-
-template<> struct DeepCopy< CudaSpace , CudaHostPinnedSpace >
-{
-  inline
-  DeepCopy( void * dst , const void * src , size_t n )
-  { (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); }
-};
-
-
-// Destination CudaUVMSpace.
-template<> struct DeepCopy< CudaUVMSpace , CudaSpace >
-{
-  inline
-  DeepCopy( void * dst , const void * src , size_t n )
-  { (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); }
-};
-
-template<> struct DeepCopy< CudaUVMSpace , CudaUVMSpace >
-{
-  inline
-  DeepCopy( void * dst , const void * src , size_t n )
-  { (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); }
-};
-
-template<> struct DeepCopy< CudaUVMSpace , CudaHostPinnedSpace >
-{
-  inline
-  DeepCopy( void * dst , const void * src , size_t n )
-  { (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); }
-};
-
-template<> struct DeepCopy< CudaUVMSpace , HostSpace >
-{
-  inline
-  DeepCopy( void * dst , const void * src , size_t n )
-  { (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); }
-};
-
-
-// Destination CudaHostPinnedSpace.
-template<> struct DeepCopy< CudaHostPinnedSpace , CudaSpace >
-{
-  inline
-  DeepCopy( void * dst , const void * src , size_t n )
-  { (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); }
-};
-
-template<> struct DeepCopy< CudaHostPinnedSpace , CudaUVMSpace >
-{
-  inline
-  DeepCopy( void * dst , const void * src , size_t n )
-  { (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); }
-};
-
-template<> struct DeepCopy< CudaHostPinnedSpace , CudaHostPinnedSpace >
-{
-  inline
-  DeepCopy( void * dst , const void * src , size_t n )
-  { (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); }
-};
-
-template<> struct DeepCopy< CudaHostPinnedSpace , HostSpace >
-{
-  inline
-  DeepCopy( void * dst , const void * src , size_t n )
-  { (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); }
-};
-
-
-// Destination HostSpace.
-template<> struct DeepCopy< HostSpace , CudaUVMSpace >
-{
-  inline
-  DeepCopy( void * dst , const void * src , size_t n )
-  { (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); }
-};
-
-template<> struct DeepCopy< HostSpace , CudaHostPinnedSpace >
-{
-  inline
-  DeepCopy( void * dst , const void * src , size_t n )
-  { (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** Running in CudaSpace attempting to access HostSpace: error.
- *  Both verify() overloads call Kokkos::abort with a fixed message. */
-template<>
-struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::HostSpace >
-{
-  enum { value = false };
-  KOKKOS_INLINE_FUNCTION static void verify( void )
-    { Kokkos::abort("Cuda code attempted to access HostSpace memory"); }
-
-  KOKKOS_INLINE_FUNCTION static void verify( const void * )
-    { Kokkos::abort("Cuda code attempted to access HostSpace memory"); }
-};
-
-/** Running in CudaSpace accessing CudaUVMSpace: ok, verify() is a no-op */
-template<>
-struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaUVMSpace >
-{
-  enum { value = true };
-  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
-  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
-};
-
-/** Running in CudaSpace accessing CudaHostPinnedSpace: ok, verify() is a no-op */
-template<>
-struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace >
-{
-  enum { value = true };
-  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
-  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
-};
-
-/** Running in CudaSpace attempting to access an unknown space: error.
- *
- *  The enable_if yields Kokkos::CudaSpace as the first template argument
- *  only when OtherSpace is not CudaSpace; for OtherSpace == CudaSpace it
- *  has no ::type, so this partial specialization is not a candidate.
- */
-template< class OtherSpace >
-struct VerifyExecutionCanAccessMemorySpace<
-  typename enable_if< ! is_same<Kokkos::CudaSpace,OtherSpace>::value , Kokkos::CudaSpace >::type ,
-  OtherSpace >
-{
-  enum { value = false };
-  KOKKOS_INLINE_FUNCTION static void verify( void )
-    { Kokkos::abort("Cuda code attempted to access unknown Space memory"); }
-
-  KOKKOS_INLINE_FUNCTION static void verify( const void * )
-    { Kokkos::abort("Cuda code attempted to access unknown Space memory"); }
-};
-
-//----------------------------------------------------------------------------
-/** Running in HostSpace attempting to access CudaSpace: error.
- *  verify() forwards to CudaSpace::access_error, passing the pointer
- *  when one is given. */
-template<>
-struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaSpace >
-{
-  enum { value = false };
-  inline static void verify( void ) { CudaSpace::access_error(); }
-  inline static void verify( const void * p ) { CudaSpace::access_error(p); }
-};
-
-/** Running in HostSpace accessing CudaUVMSpace is OK; verify() is a no-op */
-template<>
-struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaUVMSpace >
-{
-  enum { value = true };
-  inline static void verify( void ) { }
-  inline static void verify( const void * ) { }
-};
-
-/** Running in HostSpace accessing CudaHostPinnedSpace is OK; verify() is a no-op.
- *
- *  NOTE(review): unlike the other HostSpace specializations here, these
- *  members use KOKKOS_INLINE_FUNCTION rather than plain inline -- confirm
- *  whether that difference is intentional.
- */
-template<>
-struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace >
-{
-  enum { value = true };
-  KOKKOS_INLINE_FUNCTION static void verify( void ) {}
-  KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
-};
-
-//----------------------------------------------------------------------------
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
-#endif /* #ifndef KOKKOS_CUDASPACE_HPP */
-
--- a/lib/kokkos/core/src/Kokkos_CudaTypes.hpp
+++ b/lib/kokkos/core/src/Kokkos_CudaTypes.hpp
@ -1,139 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CUDATYPES_HPP
-#define KOKKOS_CUDATYPES_HPP
-
-#include <Kokkos_Macros.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#if defined( __CUDACC__ )
-
-namespace Kokkos {
-
-// This branch is compiled only when __CUDACC__ is defined.
-// Re-export the global-namespace vector types (::int2, ::float4, ...)
-// so that Kokkos::int2 etc. name exactly the same types.
-
-// Integer vectors.
-typedef ::int2 int2 ;
-typedef ::int3 int3 ;
-typedef ::int4 int4 ;
-
-// Single-precision vectors.
-typedef ::float2 float2 ;
-typedef ::float3 float3 ;
-typedef ::float4 float4 ;
-
-// Double-precision vectors.
-typedef ::double2 double2 ;
-typedef ::double3 double3 ;
-typedef ::double4 double4 ;
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#else /* NOT #if defined( __CUDACC__ ) */
-
-namespace Kokkos {
-
-// Fallback definitions used when __CUDACC__ is not defined: plain
-// aggregates with members named x, y, z, w in that order.
-// NOTE(review): no alignment attribute is specified here -- confirm layout
-// compatibility before exchanging these values with device code.
-
-// ---- integer vectors ----
-
-struct int2 {
-  int x;
-  int y;
-};
-
-struct int3 {
-  int x;
-  int y;
-  int z;
-};
-
-struct int4 {
-  int x;
-  int y;
-  int z;
-  int w;
-};
-
-// ---- single-precision vectors ----
-
-struct float2 {
-  float x;
-  float y;
-};
-
-struct float3 {
-  float x;
-  float y;
-  float z;
-};
-
-struct float4 {
-  float x;
-  float y;
-  float z;
-  float w;
-};
-
-// ---- double-precision vectors ----
-
-struct double2 {
-  double x;
-  double y;
-};
-
-struct double3 {
-  double x;
-  double y;
-  double z;
-};
-
-struct double4 {
-  double x;
-  double y;
-  double z;
-  double w;
-};
-
-} // namespace Kokkos
-
-#endif
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #define KOKKOS_CUDATYPES_HPP */
-
--- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
@ -1,439 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_EXECPOLICY_HPP
-#define KOKKOS_EXECPOLICY_HPP
-
-#include <Kokkos_Core_fwd.hpp>
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_StaticAssert.hpp>
-#include <impl/Kokkos_Tags.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-/** \brief  Execution policy for work over a range of an integral type.
- *
- * Valid template argument options:
- *
- *  With a specified execution space:
- *    < ExecSpace , WorkTag , { IntConst | IntType } >
- *    < ExecSpace , WorkTag , void >
- *    < ExecSpace , { IntConst | IntType } , void >
- *    < ExecSpace , void , void >
- *
- *  With the default execution space:
- *    < WorkTag , { IntConst | IntType } , void >
- *    < WorkTag , void , void >
- *    < { IntConst | IntType } , void , void >
- *    < void , void , void >
- *
- *  IntType  is a fundamental integral type
- *  IntConst is an Impl::integral_constant< IntType , Blocking >
- *
- *  Blocking is the granularity of partitioning the range among threads.
- */
-template< class Arg0 = void , class Arg1 = void , class Arg2 = void 
-        , class ExecSpace =
-          // The first argument is the execution space,
-          // otherwise use the default execution space.
-          typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0
-                             , Kokkos::DefaultExecutionSpace >::type
-        >
-class RangePolicy {
-private:
-
-  // Default integral type and blocking factor:
-  typedef int DefaultIntType ;
-  enum { DefaultIntValue = 8 };
-
-  // Classify each template argument: void, execution space,
-  // integral constant, or fundamental integral type.
-  enum { Arg0_Void = Impl::is_same< Arg0 , void >::value };
-  enum { Arg1_Void = Impl::is_same< Arg1 , void >::value };
-  enum { Arg2_Void = Impl::is_same< Arg2 , void >::value };
-
-  enum { Arg0_ExecSpace = Impl::is_execution_space< Arg0 >::value };
-
-  enum { Arg0_IntConst = Impl::is_integral_constant< Arg0 >::value };
-  enum { Arg1_IntConst = Impl::is_integral_constant< Arg1 >::value };
-  enum { Arg2_IntConst = Impl::is_integral_constant< Arg2 >::value };
-
-  enum { Arg0_IntType = Impl::is_integral< Arg0 >::value };
-  enum { Arg1_IntType = Impl::is_integral< Arg1 >::value };
-  enum { Arg2_IntType = Impl::is_integral< Arg2 >::value };
-
-  // A work tag is an argument that is none of the categories above.
-  // Arg1 can only be a work tag when Arg0 is the execution space.
-  enum { Arg0_WorkTag = ! Arg0_ExecSpace && ! Arg0_IntConst && ! Arg0_IntType && ! Arg0_Void };
-  enum { Arg1_WorkTag =   Arg0_ExecSpace && ! Arg1_IntConst && ! Arg1_IntType && ! Arg1_Void };
-
-  // Compile-time check that the arguments match one of the accepted
-  // combinations enumerated below.
-  enum { ArgOption_OK = Impl::StaticAssert< (
-    ( Arg0_ExecSpace && Arg1_WorkTag && ( Arg2_IntConst || Arg2_IntType ) ) ||
-    ( Arg0_ExecSpace && Arg1_WorkTag && Arg2_Void ) ||
-    ( Arg0_ExecSpace && ( Arg1_IntConst || Arg1_IntType ) && Arg2_Void ) ||
-    ( Arg0_ExecSpace && Arg1_Void && Arg2_Void ) ||
-    ( Arg0_WorkTag && ( Arg1_IntConst || Arg1_IntType ) && Arg2_Void ) ||
-    ( Arg0_WorkTag && Arg1_Void && Arg2_Void ) ||
-    ( ( Arg0_IntConst || Arg0_IntType ) && Arg1_Void && Arg2_Void ) ||
-    ( Arg0_Void && Arg1_Void && Arg2_Void )
-    ) >::value };
-
-  // The work argument tag is the first or second argument
-  typedef typename Impl::if_c< Arg0_WorkTag , Arg0 ,
-          typename Impl::if_c< Arg1_WorkTag , Arg1 , void
-          >::type >::type
-    WorkTag ;
-
-  // Blocking factor: the value of the first integral-constant argument,
-  // otherwise DefaultIntValue.
-  enum { Granularity = Arg0_IntConst ? unsigned(Impl::is_integral_constant<Arg0>::integral_value) : (
-                       Arg1_IntConst ? unsigned(Impl::is_integral_constant<Arg1>::integral_value) : (
-                       Arg2_IntConst ? unsigned(Impl::is_integral_constant<Arg2>::integral_value) : (
-                                       unsigned(DefaultIntValue) ))) };
-
-  // Only accept the integral type if the blocking is a power of two
-  // Selection order: a plain integral-type argument first, then the value
-  // type of an integral-constant argument, then DefaultIntType.
-  typedef typename Impl::enable_if< Impl::is_power_of_two< Granularity >::value ,
-            typename Impl::if_c< Arg0_IntType , Arg0 ,
-            typename Impl::if_c< Arg1_IntType , Arg1 ,
-            typename Impl::if_c< Arg2_IntType , Arg2 ,
-            typename Impl::if_c< Arg0_IntConst , typename Impl::is_integral_constant<Arg0>::integral_type ,
-            typename Impl::if_c< Arg1_IntConst , typename Impl::is_integral_constant<Arg1>::integral_type ,
-            typename Impl::if_c< Arg2_IntConst , typename Impl::is_integral_constant<Arg2>::integral_type ,
-                                                 DefaultIntType
-            >::type >::type >::type
-            >::type >::type >::type
-          >::type
-    IntType ;
-
-  // Low-bit mask used by WorkRange to round a partition size up to a
-  // multiple of Granularity.
-  enum { GranularityMask = IntType(Granularity) - 1 };
-
-  ExecSpace m_space ;
-  IntType   m_begin ;
-  IntType   m_end ;
-
-public:
-
-  //! Tag this class as an execution policy
-  typedef ExecSpace    execution_space ;
-  typedef RangePolicy  execution_policy ;
-  typedef WorkTag      work_tag ;
-  typedef IntType      member_type ;
-
-  KOKKOS_INLINE_FUNCTION const execution_space & space() const { return m_space ; }
-  KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
-  KOKKOS_INLINE_FUNCTION member_type end()   const { return m_end ; }
-
-  //! Empty range [0,0) on a default-constructed execution space instance.
-  inline RangePolicy() : m_space(), m_begin(0), m_end(0) {}
-
-  /** \brief  Total range */
-  // If work_begin is not strictly less than work_end the policy stores
-  // the empty range [0,0).
-  inline
-  RangePolicy( const member_type work_begin
-             , const member_type work_end
-             )
-    : m_space()
-    , m_begin( work_begin < work_end ? work_begin : 0 )
-    , m_end(   work_begin < work_end ? work_end : 0 )
-    {}
-
-  /** \brief  Total range */
-  // Same as above, but copies the given execution space instance.
-  inline
-  RangePolicy( const execution_space & work_space
-             , const member_type work_begin
-             , const member_type work_end
-             )
-    : m_space( work_space )
-    , m_begin( work_begin < work_end ? work_begin : 0 )
-    , m_end(   work_begin < work_end ? work_end : 0 )
-    {}
-
-  /** \brief  Subrange for a partition's rank and size.
-   *
-   *  Typically used to partition a range over a group of threads.
-   */
-  struct WorkRange {
-    typedef RangePolicy::work_tag     work_tag ;
-    typedef RangePolicy::member_type  member_type ;
-
-    KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
-    KOKKOS_INLINE_FUNCTION member_type end()   const { return m_end ; }
-
-    /** \brief  Subrange for a partition's rank and size.
-     *
-     *  Typically used to partition a range over a group of threads.
-     *
-     *  \param range      Policy whose [begin,end) is being partitioned.
-     *  \param part_rank  Index of this partition.
-     *  \param part_size  Number of partitions; when zero the subrange
-     *                    stays at [0,0).
-     */
-    KOKKOS_INLINE_FUNCTION
-    WorkRange( const RangePolicy & range
-             , const int part_rank
-             , const int part_size
-             )
-      : m_begin(0), m_end(0)
-      {
-        if ( part_size ) {
-  
-          // Split evenly among partitions, then round up to the granularity.
-          const member_type work_part =
-            ( ( ( ( range.end() - range.begin() ) + ( part_size - 1 ) ) / part_size )
-              + GranularityMask ) & ~member_type(GranularityMask);
-
-          m_begin = range.begin() + work_part * part_rank ;
-          m_end   = m_begin       + work_part ;
-  
-          // Clamp both bounds so a partition never extends past the end
-          // of the full range; trailing partitions may come out empty.
-          if ( range.end() < m_begin ) m_begin = range.end() ;
-          if ( range.end() < m_end )   m_end   = range.end() ;
-        }
-      }
-  private:
-     member_type m_begin ;
-     member_type m_end ;
-     // Declared private and not defined here: default construction and
-     // assignment are not available to users of WorkRange.
-     WorkRange();
-     WorkRange & operator = ( const WorkRange & );
-  };
-};
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-/** \brief  Execution policy for parallel work over a league of teams of threads.
- *
- *  The work functor is called for each thread of each team such that
- *  the team's member threads are guaranteed to be concurrent.
- *
- *  The team's threads have access to team shared scratch memory and
- *  team collective operations.
- *
- *  If the WorkTag is non-void then the first calling argument of the
- *  work functor's parentheses operator is 'const WorkTag &'.
- *  This allows a functor to have multiple work member functions.
- *
- *  template argument option with specified execution space:
- *    < ExecSpace , WorkTag >
- *    < ExecSpace , void >
- *
- *  template argument option with default execution space:
- *    < WorkTag , void >
- *    < void , void >
- */
-template< class Arg0 = void
-        , class Arg1 = void
-        , class ExecSpace =
-          // If the first argument is not an execution space
-          // then use the default execution space.
-          typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0
-                             , Kokkos::DefaultExecutionSpace >::type
-        >
-class TeamPolicy {
-private:
-
-  enum { Arg0_ExecSpace = Impl::is_execution_space< Arg0 >::value };
-  enum { Arg1_Void      = Impl::is_same< Arg1 , void >::value };
-  // Compile-time check: Arg1 may only be non-void when Arg0 is the
-  // execution space.
-  enum { ArgOption_OK   = Impl::StaticAssert< ( Arg0_ExecSpace || Arg1_Void ) >::value };
-
-  // The work tag is Arg1 when Arg0 is the execution space, otherwise Arg0.
-  typedef typename Impl::if_c< Arg0_ExecSpace , Arg1 , Arg0 >::type WorkTag ;
-
-public:
-
-  // NOTE(review): every member below is declared without a definition in
-  // this class body -- presumably each execution space supplies its own
-  // implementation; confirm against the backend headers.
-
-  //! Tag this class as an execution policy
-  typedef TeamPolicy  execution_policy ;
-  typedef ExecSpace   execution_space ;
-  typedef WorkTag     work_tag ;
-
-  //----------------------------------------
-  /** \brief  Query maximum team size for a given functor.
-   *
-   *  This size takes into account execution space concurrency limitations and
-   *  scratch memory space limitations for reductions, team reduce/scan, and
-   *  team shared memory.
-   */
-  template< class FunctorType >
-  static int team_size_max( const FunctorType & );
-
-  /** \brief  Query recommended team size for a given functor.
-   *
-   *  This size takes into account execution space concurrency limitations and
-   *  scratch memory space limitations for reductions, team reduce/scan, and
-   *  team shared memory.
-   */
-  template< class FunctorType >
-  static int team_size_recommended( const FunctorType & );
-
-  //----------------------------------------
-  /** \brief  Construct policy with the given instance of the execution space */
-  TeamPolicy( const execution_space & , int league_size_request , int team_size_request );
-
-  /** \brief  Construct policy with the default instance of the execution space */
-  TeamPolicy( int league_size_request , int team_size_request );
-
-  /** \brief  The actual league size (number of teams) of the policy.
-   *
-   *  This may be smaller than the requested league size due to limitations
-   *  of the execution space.
-   */
-  KOKKOS_INLINE_FUNCTION int league_size() const ;
-
-  /** \brief  The actual team size (number of threads per team) of the policy.
-   *
-   *  This may be smaller than the requested team size due to limitations
-   *  of the execution space.
-   */
-  KOKKOS_INLINE_FUNCTION int team_size() const ;
-
-  /** \brief  Parallel execution of a functor calls the functor once with
-   *          each member of the execution policy.
-   */
-  struct member_type {
-
-    /** \brief  Handle to the currently executing team shared scratch memory */
-    KOKKOS_INLINE_FUNCTION
-    typename execution_space::scratch_memory_space team_shmem() const ;
-
-    /** \brief  Rank of this team within the league of teams */
-    KOKKOS_INLINE_FUNCTION int league_rank() const ;
-
-    /** \brief  Number of teams in the league */
-    KOKKOS_INLINE_FUNCTION int league_size() const ;
-
-    /** \brief  Rank of this thread within this team */
-    KOKKOS_INLINE_FUNCTION int team_rank() const ;
-
-    /** \brief  Number of threads in this team */
-    KOKKOS_INLINE_FUNCTION int team_size() const ;
-
-    /** \brief  Barrier among the threads of this team */
-    KOKKOS_INLINE_FUNCTION void team_barrier() const ;
-
-    /** \brief  Intra-team reduction. Returns join of all values of the team members. */
-    template< class JoinOp >
-    KOKKOS_INLINE_FUNCTION
-    typename JoinOp::value_type team_reduce( const typename JoinOp::value_type
-                                           , const JoinOp & ) const ;
-
-    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
-     *
-     *  The highest rank thread can compute the reduction total as
-     *    reduction_total = dev.team_scan( value ) + value ;
-     */
-    template< typename Type >
-    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const ;
-
-    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
-     *          with intra-team non-deterministic ordering accumulation.
-     *
-     *  The global inter-team accumulation value will, at the end of the
-     *  league's parallel execution, be the scan's total.
-     *  Parallel execution ordering of the league's teams is non-deterministic.
-     *  As such the base value for each team's scan operation is similarly
-     *  non-deterministic.
-     */
-    template< typename Type >
-    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const ;
-  };
-};
-
-} // namespace Kokkos
-
-namespace Kokkos {
-
-namespace Impl {
-  /** \brief  Loop bounds for dividing [0,count) among the members of a team.
-   *
-   *  Each member gets a chunk of ceil( count / team_size() ) indices that
-   *  begins at chunk * team_rank().  The end bound is clamped to count;
-   *  the start bound is not, so for high ranks start may exceed end.
-   *
-   *  The team member is held by reference, so the member object passed to
-   *  the constructor must outlive this struct.
-   */
-  template<typename iType, class TeamMemberType>
-  struct TeamThreadLoopBoundariesStruct {
-    typedef iType index_type;
-    const iType start;
-    const iType end;
-    enum {increment = 1};
-    const TeamMemberType& thread;
-
-    /** \param thread_  Team member supplying team_size() and team_rank().
-     *  \param count    Total number of loop iterations to divide.
-     */
-    KOKKOS_INLINE_FUNCTION
-    TeamThreadLoopBoundariesStruct (const TeamMemberType& thread_, const iType& count):
-      // chunk * rank
-      start( ( (count + thread_.team_size()-1) / thread_.team_size() ) * thread_.team_rank() ),
-      // min( chunk * (rank + 1) , count )
-      end(   ( (count + thread_.team_size()-1) / thread_.team_size() ) * ( thread_.team_rank() + 1 ) <= count?
-             ( (count + thread_.team_size()-1) / thread_.team_size() ) * ( thread_.team_rank() + 1 ):count),
-      thread(thread_)
-    {}
-  };
-
-  /** \brief  Loop bounds [0,count) with unit increment.
-   *
-   *  The team member argument is accepted by the constructor but not used.
-   */
-  template<typename iType, class TeamMemberType>
-  struct ThreadVectorLoopBoundariesStruct {
-    typedef iType index_type;
-
-    enum { start = 0 };
-    enum { increment = 1 };
-
-    const iType end;
-
-    KOKKOS_INLINE_FUNCTION
-    ThreadVectorLoopBoundariesStruct( const TeamMemberType & /* thread */
-                                    , const iType & count )
-      : end( count )
-    {}
-  };
-
-  /** \brief  Wraps a reference to a team member.
-   *
-   *  Only the reference is stored; the referenced member must outlive
-   *  this struct.
-   */
-  template<class TeamMemberType>
-  struct ThreadSingleStruct {
-    const TeamMemberType & team_member;
-
-    KOKKOS_INLINE_FUNCTION
-    ThreadSingleStruct( const TeamMemberType & team_member_ )
-      : team_member( team_member_ )
-    {}
-  };
-
-  /** \brief  Wraps a reference to a team member.
-   *
-   *  Only the reference is stored; the referenced member must outlive
-   *  this struct.
-   */
-  template<class TeamMemberType>
-  struct VectorSingleStruct {
-    const TeamMemberType & team_member;
-
-    KOKKOS_INLINE_FUNCTION
-    VectorSingleStruct( const TeamMemberType & team_member_ )
-      : team_member( team_member_ )
-    {}
-  };
-} // namespace Impl
-
-/*template<typename iType, class TeamMemberType>
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadLoopBoundariesStruct<iType,TeamMemberType>
-  TeamThreadLoop(TeamMemberType thread, const iType count);
-
-template<typename iType, class TeamMemberType>
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorLoopBoundariesStruct<iType,TeamMemberType>
-  ThreadVectorLoop(TeamMemberType thread, const iType count);*/
-
-
-} // namespace Kokkos
-
-#endif /* #define KOKKOS_EXECPOLICY_HPP */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
--- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
@ -1,161 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_HOSTSPACE_HPP
-#define KOKKOS_HOSTSPACE_HPP
-
-#include <iosfwd>
-#include <typeinfo>
-#include <string>
-
-#include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_MemoryTraits.hpp>
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-
-/// \class HostSpace
-/// \brief Memory management for host memory.
-///
-/// HostSpace is a memory space that governs host memory.  "Host"
-/// memory means the usual CPU-accessible memory.
-class HostSpace {
-public:
-
-  //! Tag this class as a kokkos memory space
-  typedef HostSpace  memory_space ;
-  typedef size_t     size_type ;
-
-  /// \typedef execution_space
-  /// \brief Default execution space for this memory space.
-  ///
-  /// Every memory space has a default execution space.  This is
-  /// useful for things like initializing a View (which happens in
-  /// parallel using the View's default execution space).
-  // Selection order: an explicitly requested default device (OpenMP, then
-  // Threads) wins; otherwise the first enabled backend among OpenMP,
-  // Threads (pthread), and Serial.  Having none enabled is a compile error.
-#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
-  typedef Kokkos::OpenMP   execution_space ;
-#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
-  typedef Kokkos::Threads  execution_space ;
-#elif defined( KOKKOS_HAVE_OPENMP )
-  typedef Kokkos::OpenMP   execution_space ;
-#elif defined( KOKKOS_HAVE_PTHREAD )
-  typedef Kokkos::Threads  execution_space ;
-#elif defined( KOKKOS_HAVE_SERIAL )
-  typedef Kokkos::Serial   execution_space ;
-#else
-#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
-#endif
-
-  /** \brief  Allocate a contiguous block of memory.
-   *
-   *  The input label is associated with the block of memory.
-   *  The block of memory is tracked via reference counting where
-   *  allocation gives it a reference count of one.
-   *
-   *  Allocation may only occur on the master thread of the process.
-   *
-   *  \param label  Label to associate with the block.
-   *  \param size   Requested size (NOTE(review): presumably in bytes --
-   *                confirm against the implementation).
-   */
-  static void * allocate( const std::string & label , const size_t size );
-
-  /** \brief  Increment the reference count of the block of memory
-   *          in which the input pointer resides.
-   *
-   *          Reference counting only occurs on the master thread.
-   */
-  static void increment( const void * );
-
-  /** \brief  Decrement the reference count of the block of memory
-   *          in which the input pointer resides.  If the reference
-   *          count falls to zero the memory is deallocated.
-   *
-   *          Reference counting only occurs on the master thread.
-   */
-  static void decrement( const void * );
-
-  /** \brief  Get the reference count of the block of memory
-   *          in which the input pointer resides.  If the reference
-   *          count is zero the memory region is not tracked.
-   *
-   *          Reference counting only occurs on the master thread.
-   */
-  static int count( const void * );
-
-  /*--------------------------------*/
-
-  /** \brief  Print all tracked memory to the output stream. */
-  static void print_memory_view( std::ostream & );
-
-  /** \brief  Retrieve label associated with the input pointer */
-  static std::string query_label( const void * );
-
-  /*--------------------------------*/
-  /* Functions unique to the HostSpace */
-
-  // NOTE(review): presumably in_parallel() reports whether host code is
-  // currently inside a parallel region, using the callback installed via
-  // register_in_parallel() -- neither is defined here; confirm against
-  // the implementation.
-  static int in_parallel();
-
-  static void register_in_parallel( int (*)() );
-};
-
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Primary template: declared only, so each pair of memory spaces needs
-// its own specialization.
-template< class , class > struct DeepCopy ;
-
-// HostSpace -> HostSpace specialization.  Only the constructor is declared
-// here; its definition is not in this header.
-// NOTE(review): presumably copies n bytes from src to dst -- confirm
-// against the implementation.
-template<>
-struct DeepCopy<HostSpace,HostSpace> {
-  DeepCopy( void * dst , const void * src , size_t n );
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-#endif /* #define KOKKOS_HOSTSPACE_HPP */
-
--- a/lib/kokkos/core/src/Kokkos_Layout.hpp
+++ b/lib/kokkos/core/src/Kokkos_Layout.hpp
@ -1,176 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-/// \file Kokkos_Layout.hpp
-/// \brief Declaration of various \c MemoryLayout options.
-
-#ifndef KOKKOS_LAYOUT_HPP
-#define KOKKOS_LAYOUT_HPP
-
-#include <stddef.h>
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-/// \struct LayoutLeft
-/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
-///   striding of multi-indices.
-///
-/// This is an example of a \c MemoryLayout template parameter of
-/// View.  The memory layout describes how View maps from a
-/// multi-index (i0, i1, ..., ik) to a memory location.  
-///
-/// "Layout left" indicates a mapping where the leftmost index i0
-/// refers to contiguous access, and strides increase for dimensions
-/// going right from there (i1, i2, ...).  This layout imitates how
-/// Fortran stores multi-dimensional arrays.  For the special case of
-/// a two-dimensional array, "layout left" is also called "column
-/// major."
-struct LayoutLeft {
-  //! Tag this class as a kokkos array layout
-  // The typedef names the struct itself; the struct has no data members.
-  typedef LayoutLeft array_layout ;
-};
-
-//----------------------------------------------------------------------------
-/// \struct LayoutRight
-/// \brief Memory layout tag indicating right-to-left (C or
-///   lexicographical scheme) striding of multi-indices.
-///
-/// This is an example of a \c MemoryLayout template parameter of
-/// View.  The memory layout describes how View maps from a
-/// multi-index (i0, i1, ..., ik) to a memory location.  
-///
-/// "Right layout" indicates a mapping where the rightmost index ik
-/// refers to contiguous access, and strides increase for dimensions
-/// going left from there.  This layout imitates how C stores
-/// multi-dimensional arrays.  For the special case of a
-/// two-dimensional array, "layout right" is also called "row major."
-struct LayoutRight {
-  //! Tag this class as a kokkos array layout
-  // The typedef names the struct itself; the struct has no data members.
-  typedef LayoutRight array_layout ;
-};
-
-//----------------------------------------------------------------------------
-/// \struct LayoutStride
-/// \brief  Memory layout tag indicating arbitrarily strided
-///         multi-index mapping into contiguous memory.
-struct LayoutStride {
-
-  //! Tag this class as a kokkos array layout
-  typedef LayoutStride array_layout ;
-
-  //! Maximum number of dimensions a strided layout can describe.
-  enum { MAX_RANK = 8 };
-
-  //! Extent of each dimension; entries at and beyond the rank are zero
-  //! when produced by order_dimensions().
-  size_t dimension[ MAX_RANK ] ;
-  //! Stride of each dimension; entries at and beyond the rank are zero
-  //! when produced by order_dimensions().
-  size_t stride[ MAX_RANK ] ; 
-
-  /** \brief  Compute strides from ordered dimensions.
-   *
-   *  Values of order uniquely form the set [0..rank)
-   *  and specify ordering of the dimensions.
-   *  Order = {0,1,2,...} is LayoutLeft
-   *  Order = {...,2,1,0} is LayoutRight
-   *
-   *  \param rank   Number of entries in \c order and \c dimen.
-   *  \param order  order[r] is the dimension that receives the r-th
-   *                smallest stride; only the first \c rank entries are read.
-   *  \param dimen  Extent of each dimension; only the first \c rank
-   *                entries are read.
-   *
-   *  \return  The strided layout.  All dimensions and strides are zero
-   *           when \c rank is outside [0..MAX_RANK] or when \c order is
-   *           not a permutation of [0..rank).
-   */
-  template< typename iTypeOrder , typename iTypeDimen >
-  KOKKOS_INLINE_FUNCTION static
-  LayoutStride order_dimensions( int const rank
-                               , iTypeOrder const * const order
-                               , iTypeDimen const * const dimen )
-    {
-      LayoutStride tmp ;
-      for ( int r = 0 ; r < MAX_RANK ; ++r ) {
-        tmp.dimension[r] = 0 ;
-        tmp.stride[r]    = 0 ;
-      }
-
-      // A rank that does not fit the fixed-size arrays cannot be
-      // represented; a negative rank would also make the shift below
-      // undefined.  Return the zeroed layout in both cases.
-      if ( rank < 0 || MAX_RANK < rank ) return tmp ;
-
-      // Verify valid rank order: start with one bit set per dimension and
-      // clear the bit of every dimension named in 'order'.  Only the first
-      // 'rank' entries of 'order' exist, so only those are examined.
-      int check_input = int( 1 << rank ) - 1 ;
-      for ( int r = 0 ; r < rank ; ++r ) {
-        const int ordinal = int( order[r] );
-        // An ordinal outside [0..rank) is invalid and must not be used
-        // as a shift count or array index.
-        if ( ordinal < 0 || rank <= ordinal ) return tmp ;
-        check_input &= ~int( 1 << ordinal );
-      }
-
-      // Every bit cleared means every dimension was named exactly once.
-      if ( 0 == check_input ) {
-        size_t n = 1 ;
-        for ( int r = 0 ; r < rank ; ++r ) {
-          tmp.stride[ order[r] ] = n ;
-          n *= ( dimen[order[r]] );
-          tmp.dimension[r] = dimen[r];
-        }
-      }
-      return tmp ;
-    }
-};
-
-//----------------------------------------------------------------------------
-/// \struct LayoutTileLeft
-/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
-///   striding of multi-indices by tiles.
-///
-/// This is an example of a \c MemoryLayout template parameter of
-/// View.  The memory layout describes how View maps from a
-/// multi-index (i0, i1, ..., ik) to a memory location.  
-///
-/// "Tiled layout" indicates a mapping to contiguously stored
-/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two
-/// dimensions.  Indices are LayoutLeft within each tile, and the
-/// tiles themselves are arranged using LayoutLeft.  Note that the
-/// dimensions <tt>ArgN0</tt> and <tt>ArgN1</tt> of the tiles must be
-/// compile-time constants.  This speeds up index calculations.  If
-/// both tile dimensions are powers of two, Kokkos can optimize
-/// further.
-// IsPowerOfTwo defaults to true only when both tile dimensions are
-// powers of two; it is part of the layout's type.
-template < unsigned ArgN0 , unsigned ArgN1 ,
-           bool IsPowerOfTwo = ( Impl::is_power_of_two<ArgN0>::value &&
-                                 Impl::is_power_of_two<ArgN1>::value )
-         >
-struct LayoutTileLeft {
-  //! Tag this class as a kokkos array layout
-  typedef LayoutTileLeft<ArgN0,ArgN1,IsPowerOfTwo> array_layout ;
-
-  // Tile dimensions exposed as compile-time constants.
-  enum { N0 = ArgN0 };
-  enum { N1 = ArgN1 };
-};
-
-} // namespace Kokkos
-
-#endif // #ifndef KOKKOS_LAYOUT_HPP
-
--- a/lib/kokkos/core/src/Kokkos_Macros.hpp
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@ -1,433 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_MACROS_HPP
-#define KOKKOS_MACROS_HPP
-
-//----------------------------------------------------------------------------
-/** Pick up configure/build options via #define macros:
- *
- *  KOKKOS_HAVE_CUDA                Kokkos::Cuda execution and memory spaces
- *  KOKKOS_HAVE_PTHREAD             Kokkos::Threads execution space
- *  KOKKOS_HAVE_QTHREAD             Kokkos::Qthread execution space
- *  KOKKOS_HAVE_OPENMP              Kokkos::OpenMP  execution space
- *  KOKKOS_HAVE_HWLOC               HWLOC library is available
- *  KOKKOS_HAVE_EXPRESSION_CHECK    insert array bounds checks, is expensive!
- *  KOKKOS_HAVE_CXX11               enable C++11 features
- *
- *  KOKKOS_HAVE_MPI                 negotiate MPI/execution space interactions
- *
- *  KOKKOS_USE_CUDA_UVM             Use CUDA UVM for Cuda memory space
- */
-
-#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
-#include <KokkosCore_config.h>
-#endif
-
-//----------------------------------------------------------------------------
-/** Pick up compiler specific #define macros:
- *
- *  Macros for known compilers evaluate to an integral version value
- *
- *  KOKKOS_COMPILER_NVCC
- *  KOKKOS_COMPILER_GNU
- *  KOKKOS_COMPILER_INTEL
- *  KOKKOS_COMPILER_IBM
- *  KOKKOS_COMPILER_CRAYC
- *  KOKKOS_COMPILER_APPLECC
- *  KOKKOS_COMPILER_CLANG
- *  KOKKOS_COMPILER_PGI
- *
- *  Macros for which compiler extension to use for atomics on intrinsic types
- *
- *  KOKKOS_ATOMICS_USE_CUDA
- *  KOKKOS_ATOMICS_USE_GNU
- *  KOKKOS_ATOMICS_USE_INTEL
- *  KOKKOS_ATOMICS_USE_OPENMP31
- *
- *  A suite of 'KOKKOS_HAVE_PRAGMA_...' are defined for internal use.
- *
- *  Macros for marking functions to run in an execution space:
- *
- *  KOKKOS_FUNCTION
- *  KOKKOS_INLINE_FUNCTION        request compiler to inline
- *  KOKKOS_FORCEINLINE_FUNCTION   force compiler to inline, use with care!
- */
-
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ )
-
-/*  Compiling with a CUDA compiler.
- *
- *  Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
- *    CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
- *
- *  When generating device code the __CUDA_ARCH__ macro is defined as:
- *    __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
- */
-
-#include <cuda_runtime.h>
-#include <cuda.h>
-
-#if ! defined( CUDA_VERSION )
-#error "#include <cuda.h> did not define CUDA_VERSION"
-#endif
-
-#if ( CUDA_VERSION < 4010 )
-#error "Cuda version 4.1 or greater required"
-#endif
-
-#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 200 )
-/*  Compiling with CUDA compiler for device code. */
-#error "Cuda device capability >= 2.0 is required"
-#endif
-
-#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */
-
-/*--------------------------------------------------------------------------*/
-/* Language info: C++, CUDA, OPENMP */
-
-#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
-  // Compiling Cuda code to 'ptx' with the Cuda backend enabled:
-  // mark Kokkos functions as callable from both host and device.
-  // When this branch is not taken, the fallback definitions near the
-  // end of this file give these macros plain host meanings.
-
-  #define KOKKOS_FORCEINLINE_FUNCTION  __device__  __host__  __forceinline__
-  #define KOKKOS_INLINE_FUNCTION       __device__  __host__  inline
-  #define KOKKOS_FUNCTION              __device__  __host__
-
-#endif /* #if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA ) */
-
-#if defined( _OPENMP )
-
-  /*  Compiling with OpenMP.
-   *  The value of _OPENMP is an integer value YYYYMM
-   *  where YYYY and MM are the year and month designation
-   *  of the supported OpenMP API version.
-   */
-
-#endif /* #if defined( _OPENMP ) */
-
-/*--------------------------------------------------------------------------*/
-/* Mapping compiler built-ins to KOKKOS_COMPILER_*** macros */
-
-#if defined( __NVCC__ )
-  // NVIDIA compiler is being used.
-  // Code is parsed and separated into host and device code.
-  // Host code is compiled again with another compiler.
-  // Device code is compiled to 'ptx'.
-  #define KOKKOS_COMPILER_NVCC __NVCC__
-
-  #if defined( KOKKOS_HAVE_CXX11 ) && defined (KOKKOS_HAVE_CUDA)
-    // CUDA supports (unofficially) C++11 in device code starting with
-    // version 6.5. This includes the auto type and lambdas that are
-    // internal to device code. Older toolkits are rejected here.
-    #if ( CUDA_VERSION < 6050 )
-      #error "NVCC does not support C++11"
-    #endif
-  #endif
-#else
-  #if defined( KOKKOS_HAVE_CXX11 )
-    // CUDA (including version 6.5) does not support giving lambdas as
-    // arguments to global functions, so it is not currently possible
-    // to dispatch lambdas from the host when compiling with NVCC.
-    // Lambda dispatch is therefore enabled only in this non-NVCC branch.
-    #define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
-  #endif
-#endif /* #if defined( __NVCC__ ) */
-
-#if defined( KOKKOS_HAVE_CXX11 ) && !defined (KOKKOS_LAMBDA)
-  #define KOKKOS_LAMBDA [=]
-#endif
-
-#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */
-
-/* Map the host compiler's built-in macros onto KOKKOS_COMPILER_* macros.
- * Each KOKKOS_COMPILER_* macro evaluates to an integral version value.
- * Values assembled from several components are fully parenthesized so
- * the macro behaves as a single operand in any surrounding expression
- * (for example '2 * KOKKOS_COMPILER_GNU' or 'KOKKOS_COMPILER_GNU % 100').
- */
-
-/* Intel compiler for host code */
-
-#if defined( __INTEL_COMPILER )
-  #define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
-#elif defined( __ICC )
-  // Old define
-  #define KOKKOS_COMPILER_INTEL __ICC
-#elif defined( __ECC )
-  // Very old define
-  #define KOKKOS_COMPILER_INTEL __ECC
-#endif
-
-/* CRAY compiler for host code */
-#if defined( _CRAYC )
-  #define KOKKOS_COMPILER_CRAYC _CRAYC
-#endif
-
-/* IBM compiler for host code: prefer the C++ version macro */
-#if defined( __IBMCPP__ )
-  // IBM C++
-  #define KOKKOS_COMPILER_IBM __IBMCPP__
-#elif defined( __IBMC__ )
-  #define KOKKOS_COMPILER_IBM __IBMC__
-#endif
-
-#if defined( __APPLE_CC__ )
-  #define KOKKOS_COMPILER_APPLECC __APPLE_CC__
-#endif
-
-/* Clang, unless the Intel compiler was already identified above.
- * Version value is major*100 + minor*10 + patchlevel.
- */
-#if defined (__clang__) && !defined (KOKKOS_COMPILER_INTEL)
-  #define KOKKOS_COMPILER_CLANG ( __clang_major__*100+__clang_minor__*10+__clang_patchlevel__ )
-#endif
-
-/* GNU, only when neither Clang nor Intel is in use: the test excludes
- * them explicitly even when they also define __GNUC__.
- * Version value is major*100 + minor*10 + patchlevel.
- */
-#if ! defined( __clang__ ) && ! defined( KOKKOS_COMPILER_INTEL ) && defined( __GNUC__ )
-  #define KOKKOS_COMPILER_GNU ( __GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__ )
-#endif
-
-/* PGI, only when __GNUC__ is not defined.
- * Version value is major*100 + minor*10 + patchlevel.
- */
-#if defined( __PGIC__ ) && ! defined( __GNUC__ )
-  #define KOKKOS_COMPILER_PGI ( __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__ )
-#endif
-
-#endif /* #if ! defined( __CUDA_ARCH__ ) */
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-/* Intel compiler macros */
-
-#if defined( KOKKOS_COMPILER_INTEL )
-
-  #define KOKKOS_HAVE_PRAGMA_UNROLL 1
-  #define KOKKOS_HAVE_PRAGMA_IVDEP 1
-  #define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
-  #define KOKKOS_HAVE_PRAGMA_VECTOR 1
-  #define KOKKOS_HAVE_PRAGMA_SIMD 1
-
-  #if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_ENABLE_ASM )
-    #define KOKKOS_ENABLE_ASM 1
-  #endif
-
-  #if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_FORCEINLINE_FUNCTION )
-    #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
-  #endif
-
-  #if defined( __MIC__ )
-    // Compiling for Xeon Phi
-  #endif
-
-#endif
-
-/*--------------------------------------------------------------------------*/
-/* Cray compiler macros */
-
-#if defined( KOKKOS_COMPILER_CRAYC )
-
-
-#endif
-
-/*--------------------------------------------------------------------------*/
-/* IBM Compiler macros */
-
-#if defined( KOKKOS_COMPILER_IBM )
-
-  #define KOKKOS_HAVE_PRAGMA_UNROLL 1
-  //#define KOKKOS_HAVE_PRAGMA_IVDEP 1
-  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
-  //#define KOKKOS_HAVE_PRAGMA_VECTOR 1
-  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
-
-#endif
-
-/*--------------------------------------------------------------------------*/
-/* CLANG compiler macros */
-
-#if defined( KOKKOS_COMPILER_CLANG )
-
-  //#define KOKKOS_HAVE_PRAGMA_UNROLL 1
-  //#define KOKKOS_HAVE_PRAGMA_IVDEP 1
-  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
-  //#define KOKKOS_HAVE_PRAGMA_VECTOR 1
-  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
-
-  #if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
-    #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
-  #endif
-
-#endif
-
-/*--------------------------------------------------------------------------*/
-/* GNU Compiler macros */
-
-#if defined( KOKKOS_COMPILER_GNU ) 
-
-  //#define KOKKOS_HAVE_PRAGMA_UNROLL 1
-  //#define KOKKOS_HAVE_PRAGMA_IVDEP 1
-  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
-  //#define KOKKOS_HAVE_PRAGMA_VECTOR 1
-  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
-
-  #if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
-    #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
-  #endif
-
-  #if ! defined( KOKKOS_ENABLE_ASM ) && \
-      ! ( defined( __powerpc) || \
-          defined(__powerpc__) || \
-          defined(__powerpc64__) || \
-          defined(__POWERPC__) || \
-          defined(__ppc__) || \
-          defined(__ppc64__) )
-    #define KOKKOS_ENABLE_ASM 1
-  #endif
-
-#endif
-
-/*--------------------------------------------------------------------------*/
-
-#if defined( KOKKOS_COMPILER_PGI )
-
-  #define KOKKOS_HAVE_PRAGMA_UNROLL 1
-  #define KOKKOS_HAVE_PRAGMA_IVDEP 1
-  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
-  #define KOKKOS_HAVE_PRAGMA_VECTOR 1
-  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
-
-#endif
-
-/*--------------------------------------------------------------------------*/
-
-#if defined( KOKKOS_COMPILER_NVCC )
-
-  #if defined(__CUDA_ARCH__ )
-    #define KOKKOS_HAVE_PRAGMA_UNROLL 1
-  #endif
-
-#endif
-
-/*--------------------------------------------------------------------------*/
-/* Select compiler dependent interface for atomics */
-
-/* Choose an atomics interface only when the build has not pre-defined one.
- *
- * The tests are joined with '&&': automatic selection runs only if NONE of
- * the selection macros is already defined.  Joining them with '||' would
- * make the condition true unless every macro were pre-defined, so a single
- * user pre-selection would be ignored and a second interface macro could be
- * defined on top of it.
- *
- * The OpenMP 3.1 selection is recognised under both spellings: this header
- * documents KOKKOS_ATOMICS_USE_OPENMP31 but the automatic selection below
- * defines KOKKOS_ATOMICS_USE_OMP31.  The emitted name is left unchanged.
- */
-#if ! defined( KOKKOS_ATOMICS_USE_CUDA ) && \
-    ! defined( KOKKOS_ATOMICS_USE_GNU ) && \
-    ! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
-    ! defined( KOKKOS_ATOMICS_USE_OPENMP31 ) && \
-    ! defined( KOKKOS_ATOMICS_USE_OMP31 )
-
-/* Atomic selection is not pre-defined, choose from language and compiler. */
-
-#if defined( __CUDA_ARCH__ ) && defined (KOKKOS_HAVE_CUDA)
-
-  /* Compiling device code with the Cuda backend enabled. */
-  #define KOKKOS_ATOMICS_USE_CUDA
-
-#elif defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG )
-
-  #define KOKKOS_ATOMICS_USE_GNU
-
-#elif defined( KOKKOS_COMPILER_INTEL ) || defined( KOKKOS_COMPILER_CRAYC )
-
-  #define KOKKOS_ATOMICS_USE_INTEL
-
-#elif defined( _OPENMP ) && ( 201107 <= _OPENMP )
-
-  /* _OPENMP is YYYYMM; 201107 is the first value accepted here. */
-  #define KOKKOS_ATOMICS_USE_OMP31
-
-#else
-
-  #error "Compiler does not support atomic operations"
-
-#endif
-
-#endif /* atomics interface not pre-defined */
-
-//----------------------------------------------------------------------------
-/** Define function marking macros if compiler specific macros are undefined: */
-
-#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
-#define KOKKOS_FORCEINLINE_FUNCTION  inline
-#endif
-
-#if ! defined( KOKKOS_INLINE_FUNCTION )
-#define KOKKOS_INLINE_FUNCTION  inline
-#endif
-
-#if ! defined( KOKKOS_FUNCTION )
-#define KOKKOS_FUNCTION /**/
-#endif
-
-//----------------------------------------------------------------------------
-/** Determine the default execution space for parallel dispatch.
- *  There is zero or one default execution space specified.
- */
-
-#if 1 < ( ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
-          ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
-          ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
-          ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
-
-#error "More than one KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* specified" ;
-
-#endif
-
-/** If default is not specified then choose from enabled execution spaces.
- *  Priority: CUDA, OPENMP, THREADS, SERIAL
- */
-#if   defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA )
-#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
-#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
-#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
-#elif defined ( KOKKOS_HAVE_CUDA )
-#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA
-#elif defined ( KOKKOS_HAVE_OPENMP )
-#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP
-#elif defined ( KOKKOS_HAVE_PTHREAD )
-#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS
-#else
-#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL
-#endif
-
-//----------------------------------------------------------------------------
-/** Determine for what space the code is being compiled: */
-
-#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined (KOKKOS_HAVE_CUDA)
-#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
-#else
-#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-#endif
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_MACROS_HPP */
-
--- a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
@ -1,118 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_MEMORYTRAITS_HPP
-#define KOKKOS_MEMORYTRAITS_HPP
-
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-/** \brief  Memory access traits for views, an extension point.
- *
- *  These traits should be orthogonal.  If there are dependencies then
- *  the MemoryTraits template must detect and enforce dependencies.
- *
- *  A zero value is the default for a View, indicating that none of
- *  these traits are present.
- */
-enum MemoryTraitsFlags {
-  Unmanaged    = 0x01 ,  // bit 0
-  RandomAccess = 0x02 ,  // bit 1
-  Atomic       = 0x04    // bit 2
-};
-
-template < unsigned T >
-struct MemoryTraits {
-  //! Self-referential typedef that tags this class as Kokkos memory traits.
-  typedef MemoryTraits<T> memory_traits ;
-
-  // Each member holds the matching Kokkos::MemoryTraitsFlags bit masked out
-  // of T: zero when the flag is absent, the flag's bit value otherwise.
-  // The Kokkos:: qualification is needed because the members reuse the
-  // flag names.
-  enum { Unmanaged    = T & static_cast<unsigned>( Kokkos::Unmanaged ) };
-  enum { RandomAccess = T & static_cast<unsigned>( Kokkos::RandomAccess ) };
-  enum { Atomic       = T & static_cast<unsigned>( Kokkos::Atomic ) };
-
-};
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-typedef Kokkos::MemoryTraits<0> MemoryManaged ;
-typedef Kokkos::MemoryTraits< Kokkos::Unmanaged > MemoryUnmanaged ;
-typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryRandomAccess ;
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief Memory alignment settings
- *
- *  Sets global value for memory alignment.  Must be a power of two!
- *  Enable compatibility of views from different devices with static stride.
- *  Use compiler flag to enable overwrites.
- */
-// MEMORY_ALIGNMENT is built from KOKKOS_MEMORY_ALIGNMENT when that macro is
-// defined and from 128 otherwise.  The value is written as
-// 1 << power_of_two<N>::value rather than N itself.
-// NOTE(review): power_of_two is declared outside this file; presumably it
-// yields log2(N) and rejects values that are not powers of two -- confirm.
-enum { MEMORY_ALIGNMENT =
-#if defined( KOKKOS_MEMORY_ALIGNMENT )
-    ( 1 << Kokkos::Impl::power_of_two< KOKKOS_MEMORY_ALIGNMENT >::value )
-#else
-    ( 1 << Kokkos::Impl::power_of_two< 128 >::value )
-#endif
-  // NOTE(review): how this threshold is applied is not visible here;
-  // check the users of MEMORY_ALIGNMENT_THRESHOLD.
-  , MEMORY_ALIGNMENT_THRESHOLD = 4 
-  };
-
-
-} //namespace Impl
-} // namespace Kokkos
-
-#endif /* #ifndef KOKKOS_MEMORYTRAITS_HPP */
-
--- a/lib/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
@ -1,176 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_OPENMP_HPP
-#define KOKKOS_OPENMP_HPP
-
-#include <Kokkos_Core_fwd.hpp>
-
-#if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP )
-
-#include <omp.h>
-
-#include <cstddef>
-#include <iosfwd>
-#include <Kokkos_HostSpace.hpp>
-#include <Kokkos_ScratchSpace.hpp>
-#include <Kokkos_Parallel.hpp>
-#include <Kokkos_Layout.hpp>
-#include <impl/Kokkos_Tags.hpp>
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-
-/// \class OpenMP
-/// \brief Kokkos device for multicore processors in the host memory space.
-class OpenMP {
-public:
-  //------------------------------------
-  //! \name Type declarations that all Kokkos devices must provide.
-  //@{
-
-  //! Self-referential typedef that tags this class as a Kokkos execution space.
-  typedef OpenMP                execution_space ;
-  //! Memory space used by this execution space.
-  typedef HostSpace             memory_space ;
-  //! Array layout associated with this execution space.
-  typedef LayoutRight           array_layout ;
-  //! Size type, taken from the memory space.
-  typedef HostSpace::size_type  size_type ;
-
-  //! Scratch memory space associated with this execution space.
-  typedef ScratchMemorySpace< OpenMP > scratch_memory_space ;
-
-  //! Retained for backward compatibility.
-  typedef OpenMP                device_type ;
-  //@}
-  //------------------------------------
-  //! \name Functions that all Kokkos devices must implement.
-  //@{
-
-  /** \brief  True when omp_in_parallel() reports a nonzero value. */
-  static inline bool in_parallel() { return omp_in_parallel() != 0 ; }
-
-  /** \brief  Put the device into a "sleep" state; documented as a noop for OpenMP. */
-  static bool sleep();
-
-  /** \brief  Wake the device from the "sleep" state; documented as a noop for OpenMP. */
-  static bool wake();
-
-  /** \brief  Wait until all dispatched functors complete; the body is empty for OpenMP. */
-  static void fence() {}
-
-  /// \brief Write configuration information to the given output stream.
-  static void print_configuration( std::ostream & , const bool detail = false );
-
-  /// \brief Release any resources held by the device.
-  static void finalize();
-
-  /** \brief  Initialize the device.
-   *
-   *  1) If the hardware locality library is enabled and OpenMP has not
-   *     already bound threads, bind the OpenMP threads to maximize core
-   *     utilization and to group them for memory hierarchy locality.
-   *
-   *  2) Allocate a HostThread for each OpenMP thread to hold its
-   *     topology and fan in/out data.
-   */
-  static void initialize( unsigned thread_count = 0 ,
-                          unsigned use_numa_count = 0 ,
-                          unsigned use_cores_per_numa = 0 );
-
-  /** \brief  Query whether the device has been initialized. */
-  static int is_initialized();
-  //@}
-  //------------------------------------
-  /** \brief  Size of this execution space's topological thread pool.
-   *
-   *  All threads within a pool share a memory space for which they are cache coherent.
-   *    depth = 0  number of threads in the whole pool.
-   *    depth = 1  number of threads in a NUMA region, typically sharing L3 cache.
-   *    depth = 2  number of threads at the finest granularity, typically sharing L1 cache.
-   */
-  static inline int thread_pool_size( int depth = 0 );
-
-  /** \brief  Rank of the executing thread within this thread pool. */
-  KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
-
-  //------------------------------------
-
-  /** \brief  Whole-pool size, i.e. thread_pool_size(0), converted to unsigned. */
-  static inline unsigned max_hardware_threads()
-    { return static_cast<unsigned>( thread_pool_size(0) ); }
-
-  /** \brief  The executing thread's pool rank, converted to unsigned. */
-  KOKKOS_INLINE_FUNCTION static
-  unsigned hardware_thread_id()
-    { return static_cast<unsigned>( thread_pool_rank() ); }
-};
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Specialization for the OpenMP memory space paired with the OpenMP
- *          scratch memory space: access is always permitted, so both
- *          verify() overloads do nothing.
- */
-template<>
-struct VerifyExecutionCanAccessMemorySpace
-  < Kokkos::OpenMP::memory_space , Kokkos::OpenMP::scratch_memory_space >
-{
-  enum { value = true };
-  static void verify() {}
-  static void verify( const void * ) {}
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-#include <OpenMP/Kokkos_OpenMPexec.hpp>
-#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
-
-/*--------------------------------------------------------------------------*/
-
-#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP ) */
-#endif /* #ifndef KOKKOS_OPENMP_HPP */
-
-
--- a/lib/kokkos/core/src/Kokkos_Pair.hpp
+++ b/lib/kokkos/core/src/Kokkos_Pair.hpp
@ -1,457 +0,0 @@
-/// \file Kokkos_Pair.hpp
-/// \brief Declaration and definition of Kokkos::pair.
-///
-/// This header file declares and defines Kokkos::pair and its related
-/// nonmember functions.
-
-#ifndef KOKKOS_PAIR_HPP
-#define KOKKOS_PAIR_HPP
-
-#include <Kokkos_Macros.hpp>
-#include <utility>
-
-namespace Kokkos {
-/// \struct pair
-/// \brief Replacement for std::pair that works on CUDA devices.
-///
-/// The instance methods of std::pair, including its constructors, are
-/// not marked as <tt>__device__</tt> functions.  Thus, they cannot be
-/// called on a CUDA device, such as an NVIDIA GPU.  This struct
-/// implements the same interface as std::pair, but can be used on a
-/// CUDA device as well as on the host.
-template <class T1, class T2>
-struct pair
-{
-  //! Type of \c first (the first template parameter).
-  typedef T1 first_type;
-  //! Type of \c second (the second template parameter).
-  typedef T2 second_type;
-
-  //! First element of the pair.
-  first_type  first;
-  //! Second element of the pair.
-  second_type second;
-
-  /// \brief Value-initialize both elements.
-  ///
-  /// Does not compile unless T1 and T2 have accessible default
-  /// constructors.
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair() : first() , second() {}
-
-  /// \brief Build a pair by copying \c f into \c first and \c s into
-  ///   \c second.
-  ///
-  /// Does not compile unless T1 and T2 have accessible copy
-  /// constructors.
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair( first_type const & f , second_type const & s )
-    : first( f ) , second( s ) {}
-
-  /// \brief Converting constructor from a Kokkos::pair with possibly
-  ///   different element types.
-  ///
-  /// Initializes \c first from \c p.first and \c second from
-  /// \c p.second.
-  template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair( const pair<U,V> & p )
-    : first( p.first ) , second( p.second ) {}
-
-  /// \brief Element-wise assignment from a Kokkos::pair with possibly
-  ///   different element types.
-  ///
-  /// Assigns \c p.first to \c first and \c p.second to \c second.
-  /// \return A reference to this pair.
-  template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair & operator=( const pair<U,V> & p )
-  {
-    first  = p.first ;
-    second = p.second ;
-    return *this ;
-  }
-
-  /// \brief Converting constructor from a std::pair.
-  ///
-  /// Unlike the constructors above, this one carries no KOKKOS_*
-  /// function marking.
-  template <class U, class V>
-  pair( const std::pair<U,V> & p )
-    : first( p.first ) , second( p.second ) {}
-
-  /// \brief Return the elements as a std::pair<T1,T2>.
-  ///
-  /// This is <i>not</i> a device function; it must not be called on a
-  /// CUDA device.  It exists for host code that wants an std::pair
-  /// instead of a Kokkos::pair.
-  ///
-  /// \note This is a named function rather than a conversion
-  ///   operator because a conversion operator made the relational
-  ///   operators ambiguous.
-  std::pair<T1,T2> to_std_pair() const
-  {
-    return std::make_pair( first , second );
-  }
-};
-
-template <class T1, class T2>
-struct pair<T1&, T2&>
-{
-  //! Type of \c first: a reference to the first template parameter.
-  typedef T1& first_type;
-  //! Type of \c second: a reference to the second template parameter.
-  typedef T2& second_type;
-
-  //! First element of the pair (a reference).
-  first_type  first;
-  //! Second element of the pair (a reference).
-  second_type second;
-
-  /// \brief Bind \c first to \c f and \c second to \c s.
-  ///
-  /// Both members are references, so nothing is copied here.
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair( first_type f , second_type s )
-    : first( f ) , second( s ) {}
-
-  /// \brief Converting constructor from a Kokkos::pair with possibly
-  ///   different element types.
-  ///
-  /// Binds \c first to \c p.first and \c second to \c p.second.
-  template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair( const pair<U,V> & p )
-    : first( p.first ) , second( p.second ) {}
-
-  /// \brief Converting constructor from a std::pair.
-  ///
-  /// Unlike the constructors above, this one carries no KOKKOS_*
-  /// function marking.
-  template <class U, class V>
-  pair( const std::pair<U,V> & p )
-    : first( p.first ) , second( p.second ) {}
-
-  /// \brief Element-wise assignment through the stored references.
-  ///
-  /// Assigns \c p.first to the object \c first refers to and
-  /// \c p.second to the object \c second refers to; the references
-  /// themselves are not rebound.
-  /// \return A reference to this pair.
-  ///
-  /// NOTE(review): being a template, this is not the copy assignment
-  /// operator; confirm whether assignment from a pair of exactly this
-  /// type is expected to work.
-  template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair & operator=( const pair<U,V> & p )
-  {
-    first  = p.first ;
-    second = p.second ;
-    return *this ;
-  }
-
-  /// \brief Return copies of the referenced elements as a
-  ///   std::pair<T1,T2>.
-  ///
-  /// This is <i>not</i> a device function; it must not be called on a
-  /// CUDA device.  It exists for host code that wants an std::pair
-  /// instead of a Kokkos::pair.
-  ///
-  /// \note This is a named function rather than a conversion
-  ///   operator because a conversion operator made the relational
-  ///   operators ambiguous.
-  std::pair<T1,T2> to_std_pair() const
-  {
-    return std::make_pair( first , second );
-  }
-};
-
-template <class T1, class T2>
-struct pair<T1, T2&>
-{
-  //! Type of \c first (the first template parameter, held by value).
-  typedef T1  first_type;
-  //! Type of \c second: a reference to the second template parameter.
-  typedef T2& second_type;
-
-  //! First element of the pair.
-  first_type  first;
-  //! Second element of the pair (a reference).
-  second_type second;
-
-  /// \brief Copy \c f into \c first and bind \c second to \c s.
-  ///
-  /// Does not compile unless T1 has an accessible copy constructor.
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair( first_type const & f , second_type s )
-    : first( f ) , second( s ) {}
-
-  /// \brief Converting constructor from a Kokkos::pair with possibly
-  ///   different element types.
-  ///
-  /// Initializes \c first from \c p.first and binds \c second to
-  /// \c p.second.
-  template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair( const pair<U,V> & p )
-    : first( p.first ) , second( p.second ) {}
-
-  /// \brief Converting constructor from a std::pair.
-  ///
-  /// Unlike the constructors above, this one carries no KOKKOS_*
-  /// function marking.
-  template <class U, class V>
-  pair( const std::pair<U,V> & p )
-    : first( p.first ) , second( p.second ) {}
-
-  /// \brief Element-wise assignment.
-  ///
-  /// Assigns \c p.first to \c first, and \c p.second to the object
-  /// \c second refers to; the reference itself is not rebound.
-  /// \return A reference to this pair.
-  ///
-  /// NOTE(review): being a template, this is not the copy assignment
-  /// operator; confirm whether assignment from a pair of exactly this
-  /// type is expected to work.
-  template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair & operator=( const pair<U,V> & p )
-  {
-    first  = p.first ;
-    second = p.second ;
-    return *this ;
-  }
-
-  /// \brief Return copies of the elements as a std::pair<T1,T2>.
-  ///
-  /// This is <i>not</i> a device function; it must not be called on a
-  /// CUDA device.  It exists for host code that wants an std::pair
-  /// instead of a Kokkos::pair.
-  ///
-  /// \note This is a named function rather than a conversion
-  ///   operator because a conversion operator made the relational
-  ///   operators ambiguous.
-  std::pair<T1,T2> to_std_pair() const
-  {
-    return std::make_pair( first , second );
-  }
-};
-
-/// \brief Partial specialization of Kokkos::pair whose first element
-///   is a reference (<tt>T1&</tt>).
-///
-/// \c first is a reference that is bound when the pair is
-/// constructed; \c second is held by value.
-template <class T1, class T2>
-struct pair<T1&, T2>
-{
-  //! Type of the first element (a reference to T1).
-  typedef T1& first_type;
-  //! Type of the second element (held by value).
-  typedef T2  second_type;
-
-  //! The first element of the pair (a reference).
-  first_type  first;
-  //! The second element of the pair.
-  second_type second;
-
-  /// \brief Construct from both elements.
-  ///
-  /// \c first is bound to the object that \c first_ref refers to and
-  /// \c second is copy-constructed from \c second_value.
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair( first_type first_ref , second_type const & second_value )
-    : first( first_ref )
-    , second( second_value )
-  {}
-
-  /// \brief Converting constructor from another Kokkos::pair.
-  ///
-  /// Initializes \c first from <tt>other.first</tt> and \c second from
-  /// <tt>other.second</tt>.  It won't compile unless both
-  /// initializations are well formed for U and V.
-  template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair( const pair<U,V> & other )
-    : first( other.first )
-    , second( other.second )
-  {}
-
-  /// \brief Converting constructor from an std::pair.
-  ///
-  /// Unlike the other constructors, this one is not marked
-  /// KOKKOS_FORCEINLINE_FUNCTION.
-  template <class U, class V>
-  pair( const std::pair<U,V> & other )
-    : first( other.first )
-    , second( other.second )
-  {}
-
-  /// \brief Element-wise assignment from another Kokkos::pair.
-  ///
-  /// Assigns <tt>other.first</tt> to \c first and <tt>other.second</tt>
-  /// to \c second.  Because \c first is a reference, the former
-  /// assigns to the referenced object.
-  ///
-  /// \return A reference to this pair.
-  template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair & operator=( const pair<U,V> & other )
-  {
-    first  = other.first;
-    second = other.second;
-    return *this;
-  }
-
-  /// \brief Return the std::pair version of this object.
-  ///
-  /// This is <i>not</i> a device function; you may not call it on a
-  /// CUDA device.  Call it on the host when an std::pair is needed
-  /// instead of a Kokkos::pair.
-  ///
-  /// \note This is a named member rather than a conversion operator,
-  ///   since defining a conversion operator made the relational
-  ///   operators have ambiguous definitions.
-  std::pair<T1,T2> to_std_pair() const
-  {
-    return std::make_pair( first , second );
-  }
-};
-
-//! Equality for Kokkos::pair: true when both \c first and \c second compare equal.
-template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
-bool operator== ( const pair<T1,T2> & a , const pair<T1,T2> & b )
-{
-  return a.first == b.first && a.second == b.second ;
-}
-
-//! Inequality for Kokkos::pair, defined as the negation of operator==.
-template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
-bool operator!= ( const pair<T1,T2> & a , const pair<T1,T2> & b )
-{
-  return ! ( a == b ) ;
-}
-
-//! Less-than for Kokkos::pair: compares \c first, and compares \c second
-//! only when neither \c first is less than the other.
-template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
-bool operator<  ( const pair<T1,T2> & a , const pair<T1,T2> & b )
-{
-  if ( a.first < b.first ) { return true ; }   // decided by first
-  if ( b.first < a.first ) { return false ; }  // decided by first
-  return a.second < b.second ;                 // tie on first
-}
-
-//! Less-than-or-equal-to for Kokkos::pair, defined as "not (b < a)".
-template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
-bool operator<= ( const pair<T1,T2> & a , const pair<T1,T2> & b )
-{
-  return ! ( b < a ) ;
-}
-
-//! Greater-than for Kokkos::pair, defined as "b < a".
-template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
-bool operator>  ( const pair<T1,T2> & a , const pair<T1,T2> & b )
-{
-  return b < a ;
-}
-
-//! Greater-than-or-equal-to for Kokkos::pair, defined as "not (a < b)".
-template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
-bool operator>= ( const pair<T1,T2> & a , const pair<T1,T2> & b )
-{
-  return ! ( a < b ) ;
-}
-
-/// \brief Build a Kokkos::pair from two values.
-///
-/// A "nonmember constructor" for Kokkos::pair that works just like
-/// std::make_pair: the element types are deduced from the arguments,
-/// which are taken by value.
-///
-/// \return <tt>pair<T1,T2></tt> constructed from \c x and \c y.
-template <class T1,class T2>
-KOKKOS_FORCEINLINE_FUNCTION
-pair<T1,T2> make_pair (T1 x, T2 y)
-{
-  return pair<T1,T2>( x , y );
-}
-
-/// \brief Return a pair of references to the input arguments.
-///
-/// This compares to std::tie (new in C++11).  You can use it to
-/// assign to two variables at once, from the result of a function
-/// that returns a pair.  For example (<tt>__device__</tt> and
-/// <tt>__host__</tt> attributes omitted for brevity):
-/// \code
-/// // Declaration of the function to call.
-/// // First return value: operation count.
-/// // Second return value: whether all operations succeeded.
-/// Kokkos::pair<int, bool> someFunction ();
-///
-/// // Code that uses Kokkos::tie.
-/// int myFunction () {
-///   int count = 0;
-///   bool success = false;
-///
-///   // This assigns to both count and success.
-///   Kokkos::tie (count, success) = someFunction ();
-///
-///   if (! success) {
-///     // ... Some operation failed;
-///     //     take corrective action ...
-///   }
-///   return count;
-/// }
-/// \endcode
-///
-/// The line that uses tie() could have been written like this:
-/// \code
-///   Kokkos::pair<int, bool> result = someFunction ();
-///   count = result.first;
-///   success = result.second;
-/// \endcode
-///
-/// Using tie() saves two lines of code and avoids a copy of each
-/// element of the pair.  The latter could be significant if one or
-/// both elements of the pair are more substantial objects than \c int
-/// or \c bool.
-template <class T1,class T2>
-KOKKOS_FORCEINLINE_FUNCTION
-pair<T1 &,T2 &> tie (T1 & x, T2 & y)
-{
-  // Both members of the returned pair are references to the arguments.
-  return pair<T1 &,T2 &>( x , y );
-}
-
-//
-// Partial specialization of Kokkos::pair for a \c void second type.
-// This is not actually a "pair": only \c first is stored, and
-// \c second is an enumerator whose value is zero.
-//
-template <class T1>
-struct pair<T1,void>
-{
-  typedef T1   first_type;
-  typedef void second_type;
-
-  //! The only stored element.
-  first_type  first;
-  //! Constant stand-in for the missing second element.
-  enum { second = 0 };
-
-  //! Default constructor; value-initializes \c first.
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair()
-    : first()
-  {}
-
-  //! Construct from the first element alone (usable as an implicit conversion).
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair( const first_type & first_value )
-    : first( first_value )
-  {}
-
-  //! Two-argument form; the \c int argument is ignored.
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair( const first_type & first_value , int )
-    : first( first_value )
-  {}
-
-  //! Converting constructor from another single-element pair.
-  template <class U>
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair( const pair<U,void> & other )
-    : first( other.first )
-  {}
-
-  //! Assign \c first from another single-element pair; returns *this.
-  template <class U>
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair & operator=( const pair<U,void> & other )
-  {
-    first = other.first;
-    return *this;
-  }
-};
-
-//
-// Specialization of relational operators for Kokkos::pair<T1,void>.
-//
-
-// Equality: only \c first takes part in the comparison.
-template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
-bool operator== ( const pair<T1,void> & a , const pair<T1,void> & b )
-{
-  return a.first == b.first ;
-}
-
-// Inequality: negation of operator==.
-template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
-bool operator!= ( const pair<T1,void> & a , const pair<T1,void> & b )
-{
-  return ! ( a == b ) ;
-}
-
-// Less-than: only \c first takes part in the comparison.
-template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
-bool operator<  ( const pair<T1,void> & a , const pair<T1,void> & b )
-{
-  return a.first < b.first ;
-}
-
-// Less-than-or-equal-to: defined as "not (b < a)".
-template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
-bool operator<= ( const pair<T1,void> & a , const pair<T1,void> & b )
-{
-  return ! ( b < a ) ;
-}
-
-// Greater-than: defined as "b < a".
-template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
-bool operator>  ( const pair<T1,void> & a , const pair<T1,void> & b )
-{
-  return b < a ;
-}
-
-// Greater-than-or-equal-to: defined as "not (a < b)".
-template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
-bool operator>= ( const pair<T1,void> & a , const pair<T1,void> & b )
-{
-  return ! ( a < b ) ;
-}
-
-} // namespace Kokkos
-
-
-#endif //KOKKOS_PAIR_HPP
--- a/lib/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp
@ -1,598 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-/// \file Kokkos_Parallel.hpp
-/// \brief Declaration of parallel operators
-
-#ifndef KOKKOS_PARALLEL_HPP
-#define KOKKOS_PARALLEL_HPP
-
-#include <cstddef>
-#include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_View.hpp>
-#include <Kokkos_ExecPolicy.hpp>
-
-#include <impl/Kokkos_Tags.hpp>
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-/** \brief  Given a Functor and Execution Policy query an execution space.
- *
- *  if       the Policy has an execution space use that
- *  else if  the Functor has an execution_space use that
- *  else if  the Functor has a device_type use that for backward compatibility
- *  else     use the default
- */
-// Primary template: used when none of the partial specializations
-// below is selected; the result is Kokkos::DefaultExecutionSpace.
-//
-// NOTE(review): enable_if_type is declared outside this section.  The
-// specializations presumably rely on enable_if_type<T>::type being
-// void whenever T names a valid type, so that they match the two
-// defaulted parameters here -- confirm against its definition.
-template< class Functor
-        , class Policy
-        , class EnableFunctor = void
-        , class EnablePolicy  = void
-        >
-struct FunctorPolicyExecutionSpace {
-  typedef Kokkos::DefaultExecutionSpace execution_space ;
-};
-
-// Names both Functor::device_type and Policy::execution_space;
-// the result is the Policy's execution space.
-template< class Functor , class Policy >
-struct FunctorPolicyExecutionSpace
-  < Functor , Policy
-  , typename enable_if_type< typename Functor::device_type     >::type
-  , typename enable_if_type< typename Policy ::execution_space >::type
-  >
-{
-  typedef typename Policy ::execution_space execution_space ;
-};
-
-// Names both Functor::execution_space and Policy::execution_space;
-// the result is the Policy's execution space.
-template< class Functor , class Policy >
-struct FunctorPolicyExecutionSpace
-  < Functor , Policy
-  , typename enable_if_type< typename Functor::execution_space >::type
-  , typename enable_if_type< typename Policy ::execution_space >::type
-  >
-{
-  typedef typename Policy ::execution_space execution_space ;
-};
-
-// Names only Policy::execution_space (the functor slot is left generic);
-// the result is the Policy's execution space.
-template< class Functor , class Policy , class EnableFunctor >
-struct FunctorPolicyExecutionSpace
-  < Functor , Policy
-  , EnableFunctor
-  , typename enable_if_type< typename Policy::execution_space >::type
-  >
-{
-  typedef typename Policy ::execution_space execution_space ;
-};
-
-// Names only Functor::device_type (the policy slot is left generic);
-// the result is Functor::device_type.
-template< class Functor , class Policy , class EnablePolicy >
-struct FunctorPolicyExecutionSpace
-  < Functor , Policy
-  , typename enable_if_type< typename Functor::device_type >::type
-  , EnablePolicy
-  >
-{
-  typedef typename Functor::device_type execution_space ;
-};
-
-// Names only Functor::execution_space (the policy slot is left generic);
-// the result is Functor::execution_space.
-template< class Functor , class Policy , class EnablePolicy >
-struct FunctorPolicyExecutionSpace
-  < Functor , Policy
-  , typename enable_if_type< typename Functor::execution_space >::type
-  , EnablePolicy
-  >
-{
-  typedef typename Functor::execution_space execution_space ;
-};
-
-//----------------------------------------------------------------------------
-/// \class ParallelFor
-/// \brief Implementation of the ParallelFor operator that has a
-///   partial specialization for the device.
-///
-/// This is an implementation detail of parallel_for.  Users should
-/// skip this and go directly to the nonmember function parallel_for.
-template< class FunctorType , class ExecPolicy > class ParallelFor ;
-
-/// \class ParallelReduce
-/// \brief Implementation detail of parallel_reduce.
-///
-/// This is an implementation detail of parallel_reduce.  Users should
-/// skip this and go directly to the nonmember function parallel_reduce.
-template< class FunctorType , class ExecPolicy > class ParallelReduce ;
-
-/// \class ParallelScan
-/// \brief Implementation detail of parallel_scan.
-///
-/// This is an implementation detail of parallel_scan.  Users should
-/// skip this and go directly to the documentation of the nonmember
-/// template function Kokkos::parallel_scan.
-template< class FunctorType , class ExecPolicy > class ParallelScan ;
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-/** \brief Execute \c functor in parallel according to the execution \c policy.
- *
- * A "functor" is a class containing the function to execute in parallel,
- * data needed for that execution, and an optional \c execution_space
- * typedef.  Here is an example functor for parallel_for:
- *
- * \code
- *  class FunctorType {
- *  public:
- *    typedef  ...  execution_space ;
- *    void operator() ( WorkType iwork ) const ;
- *  };
- * \endcode
- *
- * In the above example, \c WorkType is any integer type for which a
- * valid conversion from \c size_t to \c WorkType exists.  The functor's
- * <tt>operator()</tt> method defines the operation to parallelize,
- * over the range of integer indices <tt>iwork=[0,work_count-1]</tt>.
- * This compares to a single iteration \c iwork of a \c for loop.
- * If \c execution_space is not defined DefaultExecutionSpace will be used.
- */
-template< class ExecPolicy , class FunctorType >
-inline
-void parallel_for( const ExecPolicy  & policy
-                 , const FunctorType & functor
-                 , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
-                 )
-{
-  // The only action is to construct, and immediately discard, the
-  // Impl::ParallelFor object for this functor / policy combination.
-  typedef Impl::ParallelFor< FunctorType , ExecPolicy > closure_type ;
-
-  (void) closure_type( functor , policy );
-}
-
-// Integral work count: a RangePolicy is built from (0, work_count).
-template< class FunctorType >
-inline
-void parallel_for( const size_t        work_count ,
-                   const FunctorType & functor )
-{
-  // No policy type is available here, so the execution space is
-  // queried with 'void' in the policy slot.
-  typedef typename
-    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
-      functor_space ;
-
-  typedef RangePolicy< functor_space >                    range_policy ;
-  typedef Impl::ParallelFor< FunctorType , range_policy > closure_type ;
-
-  (void) closure_type( functor , range_policy( 0 , work_count ) );
-}
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-/** \brief  Parallel reduction
- *
- * Example of a parallel_reduce functor for a POD (plain old data) value type:
- * \code
- *  class FunctorType { // For POD value type
- *  public:
- *    typedef    ...     execution_space ;
- *    typedef <podType>  value_type ;
- *    void operator()( <intType> iwork , <podType> & update ) const ;
- *    void init( <podType> & update ) const ;
- *    void join( volatile       <podType> & update ,
- *               volatile const <podType> & input ) const ;
- *
- *    typedef true_type has_final ;
- *    void final( <podType> & update ) const ;
- *  };
- * \endcode
- *
- * Example of a parallel_reduce functor for an array of POD (plain old data) values:
- * \code
- *  class FunctorType { // For array of POD value
- *  public:
- *    typedef    ...     execution_space ;
- *    typedef <podType>  value_type[] ;
- *    void operator()( <intType> , <podType> update[] ) const ;
- *    void init( <podType> update[] ) const ;
- *    void join( volatile       <podType> update[] ,
- *               volatile const <podType> input[] ) const ;
- *
- *    typedef true_type has_final ;
- *    void final( <podType> update[] ) const ;
- *  };
- * \endcode
- */
-// General policy, no result output argument.
-template< class ExecPolicy , class FunctorType >
-inline
-void parallel_reduce( const ExecPolicy  & policy
-                    , const FunctorType & functor
-                    , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
-                    )
-{
-  // The only action is to construct, and immediately discard, the
-  // Impl::ParallelReduce object for this functor / policy combination.
-  typedef Impl::ParallelReduce< FunctorType , ExecPolicy > closure_type ;
-
-  (void) closure_type( functor , policy );
-}
-
-// Integral range policy, no result output argument.
-template< class FunctorType >
-inline
-void parallel_reduce( const size_t        work_count
-                    , const FunctorType & functor
-                    )
-{
-  typedef typename
-    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
-      functor_space ;
-
-  typedef RangePolicy< functor_space > range_policy ;
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  value_traits ;
-
-  // Element type of the result view: the value type when
-  // StaticValueSize is nonzero, otherwise the pointer type.
-  typedef typename Kokkos::Impl::if_c< (value_traits::StaticValueSize != 0)
-                                     , typename value_traits::value_type
-                                     , typename value_traits::pointer_type
-                                     >::type result_value_type ;
-
-  typedef Kokkos::View< result_value_type
-                      , HostSpace
-                      , Kokkos::MemoryUnmanaged
-                      > result_view_type ;
-
-  // Default-constructed view: the caller supplied no output location.
-  result_view_type result_view ;
-
-  (void) Impl::ParallelReduce< FunctorType , range_policy >( functor , range_policy( 0 , work_count ) , result_view );
-}
-
-// General policy and view output.
-template< class ExecPolicy , class FunctorType , class ViewType >
-inline
-void parallel_reduce( const ExecPolicy  & policy
-                    , const FunctorType & functor
-                    , const ViewType    & result_view
-                    , typename Impl::enable_if<
-                      ( Impl::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
-                      )>::type * = 0 )
-{
-  // The caller's view is handed to Impl::ParallelReduce unchanged.
-  typedef Impl::ParallelReduce< FunctorType , ExecPolicy > closure_type ;
-
-  (void) closure_type( functor , policy , result_view );
-}
-
-// General policy with the result written to a POD or array-of-POD reference.
-template< class ExecPolicy , class FunctorType >
-inline
-void parallel_reduce( const ExecPolicy  & policy
-                    , const FunctorType & functor
-                    , typename Impl::enable_if<
-                      ( ! Impl::is_integral< ExecPolicy >::value )
-                      , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type
-                      >::type result_ref )
-{
-  typedef typename ExecPolicy::work_tag  work_tag ;
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag >  value_traits ;
-  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , work_tag >  value_ops ;
-
-  // Element type of the result view: the value type when
-  // StaticValueSize is nonzero, otherwise the pointer type.
-  typedef typename Kokkos::Impl::if_c< (value_traits::StaticValueSize != 0)
-                                     , typename value_traits::value_type
-                                     , typename value_traits::pointer_type
-                                     >::type result_value_type ;
-
-  typedef Kokkos::View< result_value_type
-                      , HostSpace
-                      , Kokkos::MemoryUnmanaged
-                      > result_view_type ;
-
-  // Wrap the caller's output location in an unmanaged HostSpace view so
-  // the implementation is told the type and memory space of the result.
-  result_view_type result_view( value_ops::pointer( result_ref )
-                              , value_traits::value_count( functor )
-                              );
-
-  (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( functor , policy , result_view );
-}
-
-// Integral range policy and view output.
-template< class FunctorType , class ViewType >
-inline
-void parallel_reduce( const size_t        work_count
-                    , const FunctorType & functor
-                    , const ViewType    & result_view
-                    , typename Impl::enable_if<( Impl::is_view<ViewType>::value )>::type * = 0 )
-{
-  // No policy type is available here, so the execution space is
-  // queried with 'void' in the policy slot.
-  typedef typename
-    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
-      functor_space ;
-
-  typedef RangePolicy< functor_space >                       range_policy ;
-  typedef Impl::ParallelReduce< FunctorType, range_policy >  closure_type ;
-
-  (void) closure_type( functor , range_policy( 0 , work_count ) , result_view );
-}
-
-// Integral range policy with the result written to a POD or array-of-POD reference.
-template< class FunctorType >
-inline
-void parallel_reduce( const size_t        work_count ,
-                      const FunctorType & functor ,
-                      typename Kokkos::Impl::FunctorValueTraits< FunctorType , void >::reference_type result )
-{
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  value_traits ;
-  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , void >  value_ops ;
-
-  typedef typename
-    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
-      functor_space ;
-
-  typedef Kokkos::RangePolicy< functor_space > range_policy ;
-
-  // Element type of the result view: the value type when
-  // StaticValueSize is nonzero, otherwise the pointer type.
-  typedef typename Kokkos::Impl::if_c< (value_traits::StaticValueSize != 0)
-                                     , typename value_traits::value_type
-                                     , typename value_traits::pointer_type
-                                     >::type result_value_type ;
-
-  typedef Kokkos::View< result_value_type
-                      , HostSpace
-                      , Kokkos::MemoryUnmanaged
-                      > result_view_type ;
-
-  // Wrap the caller's output location in an unmanaged HostSpace view so
-  // the implementation is told the type and memory space of the result.
-  result_view_type result_view( value_ops::pointer( result )
-                              , value_traits::value_count( functor )
-                              );
-
-  (void) Impl::ParallelReduce< FunctorType , range_policy >( functor , range_policy( 0 , work_count ) , result_view );
-}
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-/// \fn parallel_scan
-/// \tparam ExecutionPolicy The execution policy type.
-/// \tparam FunctorType     The scan functor type.
-///
-/// \param policy  [in] The execution policy.
-/// \param functor [in] The scan functor.
-///
-/// This function implements a parallel scan pattern.  The scan can
-/// be either inclusive or exclusive, depending on how you implement
-/// the scan functor.
-///
-/// A scan functor looks almost exactly like a reduce functor, except
-/// that its operator() takes a third \c bool argument, \c final_pass,
-/// which indicates whether this is the last pass of the scan
-/// operation.  We will show below how to use the \c final_pass
-/// argument to control whether the scan is inclusive or exclusive.
-///
-/// Here is the minimum required interface of a scan functor for a POD
-/// (plain old data) value type \c PodType.  That is, the result is a
-/// View of zero or more PodType.  It is also possible for the result
-/// to be an array of (same-sized) arrays of PodType, but we do not
-/// show the required interface for that here.
-/// \code
-/// template< class ExecPolicy , class FunctorType >
-/// class ScanFunctor {
-/// public:
-///   // The Kokkos device type
-///   typedef ... execution_space;
-///   // Type of an entry of the array containing the result;
-///   // also the type of each of the entries combined using
-///   // operator() or join().
-///   typedef PodType value_type;
-///
-///   void operator () (const ExecPolicy::member_type & i, value_type& update, const bool final_pass) const;
-///   void init (value_type& update) const;
-///   void join (volatile value_type& update, volatile const value_type& input) const;
-/// };
-/// \endcode
-///
-/// Here is an example of a functor which computes an inclusive plus-scan
-/// of an array of \c int, in place.  If given an array [1, 2, 3, 4], this
-/// scan will overwrite that array with [1, 3, 6, 10].
-///
-/// \code
-/// template<class SpaceType>
-/// class InclScanFunctor {
-/// public:
-///   typedef SpaceType execution_space;
-///   typedef int value_type;
-///   typedef typename SpaceType::size_type size_type;
-///
-///   InclScanFunctor( Kokkos::View<value_type*, execution_space> x
-///                  , Kokkos::View<value_type*, execution_space> y ) : m_x(x), m_y(y) {}
-///
-///   void operator () (const size_type i, value_type& update, const bool final_pass) const {
-///     update += m_x(i);
-///     if (final_pass) {
-///       m_y(i) = update;
-///     }
-///   }
-///   void init (value_type& update) const {
-///     update = 0;
-///   }
-///   void join (volatile value_type& update, volatile const value_type& input) const {
-///     update += input;
-///   }
-///
-/// private:
-///   Kokkos::View<value_type*, execution_space> m_x;
-///   Kokkos::View<value_type*, execution_space> m_y;
-/// };
-/// \endcode
-///
-/// Here is an example of a functor which computes an <i>exclusive</i>
-/// scan of an array of \c int, in place.  In operator(), note both
-/// that the final_pass test and the update have switched places, and
-/// the use of a temporary.  If given an array [1, 2, 3, 4], this scan
-/// will overwrite that array with [0, 1, 3, 6].
-///
-/// \code
-/// template<class SpaceType>
-/// class ExclScanFunctor {
-/// public:
-///   typedef SpaceType execution_space;
-///   typedef int value_type;
-///   typedef typename SpaceType::size_type size_type;
-///
-///   ExclScanFunctor (Kokkos::View<value_type*, execution_space> x) : x_ (x) {}
-///
-///   void operator () (const size_type i, value_type& update, const bool final_pass) const {
-///     const value_type x_i = x_(i);
-///     if (final_pass) {
-///       x_(i) = update;
-///     }
-///     update += x_i;
-///   }
-///   void init (value_type& update) const {
-///     update = 0;
-///   }
-///   void join (volatile value_type& update, volatile const value_type& input) const {
-///     update += input;
-///   }
-///
-/// private:
-///   Kokkos::View<value_type*, execution_space> x_;
-/// };
-/// \endcode
-///
-/// Here is an example of a functor which builds on the above
-/// exclusive scan example, to compute an offsets array from a
-/// population count array, in place.  We assume that the pop count
-/// array has an extra entry at the end to store the final count.  If
-/// given an array [1, 2, 3, 4, 0], this scan will overwrite that
-/// array with [0, 1, 3, 6, 10].
-///
-/// \code
-/// template<class SpaceType>
-/// class OffsetScanFunctor {
-/// public:
-///   typedef SpaceType execution_space;
-///   typedef int value_type;
-///   typedef typename SpaceType::size_type size_type;
-///
-///   // last_index_ is the last valid index (zero-based) of x.
-///   // If x has length zero, then last_index_ won't be used anyway.
-///   OffsetScanFunctor( Kokkos::View<value_type*, execution_space> x
-///                    , Kokkos::View<value_type*, execution_space> y )
-///      : m_x(x), m_y(y), last_index_ (x.dimension_0 () == 0 ? 0 : x.dimension_0 () - 1)
-///   {}
-///
-///   void operator () (const size_type i, int& update, const bool final_pass) const {
-///     if (final_pass) {
-///       m_y(i) = update;
-///     }
-///     update += m_x(i);
-///     // The last entry of m_y gets the final sum.
-///     if (final_pass && i == last_index_) {
-///       m_y(i+1) = update;
-///     }
-///   }
-///   void init (value_type& update) const {
-///     update = 0;
-///   }
-///   void join (volatile value_type& update, volatile const value_type& input) const {
-///     update += input;
-///   }
-///
-/// private:
-///   Kokkos::View<value_type*, execution_space> m_x;
-///   Kokkos::View<value_type*, execution_space> m_y;
-///   const size_type last_index_;
-/// };
-/// \endcode
-///
-template< class ExecutionPolicy , class FunctorType >
-inline
-void parallel_scan( const ExecutionPolicy & policy
-                  , const FunctorType     & functor
-                  , typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0
-                  )
-{
-  // The only action is to construct the Impl::ParallelScan object for
-  // this functor / policy combination; it is discarded right away.
-  typedef Impl::ParallelScan< FunctorType , ExecutionPolicy > closure_type ;
-
-  (void) closure_type( functor , policy );
-}
-
-// Integral work count: a RangePolicy is built from (0, work_count).
-template< class FunctorType >
-inline
-void parallel_scan( const size_t        work_count ,
-                    const FunctorType & functor )
-{
-  // No policy type is available here, so the execution space is
-  // queried with 'void' in the policy slot.
-  typedef typename
-    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
-      functor_space ;
-
-  typedef Kokkos::RangePolicy< functor_space >             range_policy ;
-  typedef Impl::ParallelScan< FunctorType , range_policy > closure_type ;
-
-  (void) closure_type( functor , range_policy( 0 , work_count ) );
-}
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Primary template: reports a size of zero.  Both arguments of value()
-// are ignored.
-template< class FunctorType , class Enable = void >
-struct FunctorTeamShmemSize
-{
-  static inline size_t value( const FunctorType & , int ) { return 0 ; }
-};
-
-// Specialization whose value() forwards to f.team_shmem_size( team_size ).
-// The sizeof( & FunctorType::team_shmem_size ) expression is only well
-// formed when that member can be named this way.
-// NOTE(review): enable_if is declared outside this section; presumably
-// enable_if< nonzero >::type is void, so this matches the defaulted
-// Enable parameter of the primary template -- confirm.
-template< class FunctorType >
-struct FunctorTeamShmemSize< FunctorType , typename enable_if< sizeof( & FunctorType::team_shmem_size ) >::type >
-{
-  static inline size_t value( const FunctorType & f , int team_size ) { return f.team_shmem_size( team_size ) ; }
-};
-
-// Specialization whose value() forwards to f.shmem_size( team_size ),
-// detected the same way through sizeof( & FunctorType::shmem_size ).
-// NOTE(review): a functor that provides both team_shmem_size and
-// shmem_size would appear to match both specializations -- confirm
-// whether that case can occur.
-template< class FunctorType >
-struct FunctorTeamShmemSize< FunctorType , typename enable_if< sizeof( & FunctorType::shmem_size ) >::type >
-{
-  static inline size_t value( const FunctorType & f , int team_size ) { return f.shmem_size( team_size ) ; }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* KOKKOS_PARALLEL_HPP */
-
--- a/lib/kokkos/core/src/Kokkos_Qthread.hpp
+++ b/lib/kokkos/core/src/Kokkos_Qthread.hpp
@ -1,165 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_QTHREAD_HPP
-#define KOKKOS_QTHREAD_HPP
-
-#include <cstddef>
-#include <iosfwd>
-#include <Kokkos_Core.hpp>
-#include <Kokkos_Layout.hpp>
-#include <Kokkos_MemoryTraits.hpp>
-#include <Kokkos_HostSpace.hpp>
-#include <Kokkos_ExecPolicy.hpp>
-#include <impl/Kokkos_Tags.hpp>
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-class QthreadExec ;
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-
-/** \brief  Execution space supported by Qthread.
- *
- *  Only declarations appear here; the member functions are defined elsewhere.
- */
-class Qthread {
-public:
-  //! \name Type declarations that all Kokkos devices must provide.
-  //@{
-
-  //! Tag this class as an execution space
-  typedef Qthread                  execution_space ;
-  //! Memory space used by this execution space
-  typedef Kokkos::HostSpace        memory_space ;
-  //! Array layout used by this execution space
-  typedef Kokkos::LayoutRight      array_layout ;
-  //! Size type, taken from the memory space
-  typedef memory_space::size_type  size_type ;
-
-  //! Scratch memory space associated with this execution space
-  typedef ScratchMemorySpace< Qthread > scratch_memory_space ;
-
-  //! For backward compatibility:
-  typedef Qthread                  device_type ;
-
-  //@}
-  /*------------------------------------------------------------------------*/
-
-  /** \brief  Initialization will construct one or more instances.
-   *
-   *  The unnamed int argument defaults to 0; presumably it selects
-   *  which instance is returned -- TODO confirm against the definition.
-   */
-  static Qthread & instance( int = 0 );
-
-  /** \brief  Set the execution space to a "sleep" state.
-   *
-   * This function sets the "sleep" state in which it is not ready for work.
-   * This may consume fewer resources than the "ready" state,
-   * but it may also take time to transition to the "ready" state.
-   *
-   * NOTE(review): declared non-static although wake() is static;
-   * confirm whether the asymmetry is intentional.
-   *
-   * \return True if enters or is in the "sleep" state.
-   *         False if functions are currently executing.
-   */
-  bool sleep();
-
-  /** \brief  Wake from the sleep state.
-   *
-   *  \return True if enters or is in the "ready" state.
-   *          False if functions are currently executing.
-   */
-  static bool wake();
-
-  /** \brief  Wait for all dispatched functions to complete.
-   *
-   *  The parallel_for or parallel_reduce dispatch of a functor may
-   *  return asynchronously, before the functor completes.  This
-   *  method does not return until all dispatched functors on this
-   *  device have completed.
-   */
-  static void fence();
-
-  /*------------------------------------------------------------------------*/
-
-  /** \brief  Initialize the execution space with the requested thread count. */
-  static void initialize( int thread_count );
-  /** \brief  Counterpart of initialize(). */
-  static void finalize();
-
-  /** \brief Print configuration information to the given output stream. */
-  static void print_configuration( std::ostream & , const bool detail = false );
-
-  // Size queries; by their names these report the number of shepherds and
-  // the number of workers per shepherd -- TODO confirm against the definition.
-  int shepherd_size() const ;
-  int shepherd_worker_size() const ;
-};
-
-/*--------------------------------------------------------------------------*/
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Pairs Qthread's memory space with Qthread's scratch memory
- *          space: 'value' is true and both verify() overloads do nothing.
- */
-template<>
-struct VerifyExecutionCanAccessMemorySpace
-  < Kokkos::Qthread::memory_space , Kokkos::Qthread::scratch_memory_space >
-{
-  enum { value = true };
-
-  static inline void verify() {}
-  static inline void verify( const void * ) {}
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-#include <Kokkos_Parallel.hpp>
-#include <Qthread/Kokkos_QthreadExec.hpp>
-#include <Qthread/Kokkos_Qthread_Parallel.hpp>
-
-#endif /* #ifndef KOKKOS_QTHREAD_HPP */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
--- a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
@ -1,115 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_SCRATCHSPACE_HPP
-#define KOKKOS_SCRATCHSPACE_HPP
-
-#include <stdio.h>
-#include <Kokkos_Core_fwd.hpp>
-#include <impl/Kokkos_Tags.hpp>
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-
-/** \brief  Scratch memory space associated with an execution space.
- *
- *  Hands out chunks of a caller-supplied buffer [ptr, ptr+size) by
- *  advancing an internal cursor.  Chunks are never handed back, and a
- *  request that does not fit leaves the cursor untouched.
- */
-template< class ExecSpace >
-class ScratchMemorySpace {
-public:
-
-  // Granularity of the chunks returned by 'get_shmem': every request
-  // size is rounded up to a multiple of ALIGN.
-  // Must be a power of two.
-  enum { ALIGN = 8 };
-
-private:
-
-  mutable char * m_iter ;  // next free byte; mutable so the const get_shmem can advance it
-  char *         m_end ;   // one past the last byte of the buffer
-
-  // Declared private: no default construction, no assignment.
-  ScratchMemorySpace();
-  ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
-
-  enum { MASK = ALIGN - 1 }; // Alignment used by View::shmem_size
-
-public:
-
-  //! Tag this class as a memory space
-  typedef ScratchMemorySpace                memory_space ;
-  typedef ExecSpace                         execution_space ;
-  typedef typename ExecSpace::array_layout  array_layout ;
-  typedef typename ExecSpace::size_type     size_type ;
-
-  /** \brief  Round 'size' up to the next multiple of ALIGN. */
-  template< typename IntType >
-  KOKKOS_INLINE_FUNCTION static
-  IntType align( const IntType & size )
-    { return ( size + MASK ) & ~MASK ; }
-
-  /** \brief  Reserve 'size' bytes (rounded up to a multiple of ALIGN).
-   *
-   *  \return Pointer to the start of the reserved chunk, or 0 if the
-   *          request does not fit in the remaining capacity.  On failure
-   *          a diagnostic is printed and no memory is consumed.
-   */
-  template< typename IntType >
-  KOKKOS_INLINE_FUNCTION
-  void* get_shmem (const IntType& size) const {
-    const size_t request   = size_t (align (size));
-    const size_t remaining = size_t (m_end - m_iter);
-
-    // Check capacity BEFORE moving the cursor: advancing past m_end first
-    // forms an out-of-range pointer (undefined behaviour, may wrap).
-    // 'request < size' catches a rounding wrap-around; a negative size
-    // converts to a huge unsigned value and is rejected by one of the tests.
-    if (request < size_t (size) || remaining < request) {
-      printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate %ld byte(s); remaining capacity is %ld byte(s)\n", long(size), long(m_end-m_iter));
-      return 0;
-    }
-
-    void* const tmp = m_iter ;
-    m_iter += request ;
-    return tmp;
-  }
-
-  /** \brief  Manage the 'size' bytes starting at 'ptr'.
-   *
-   *  m_end is initialized from m_iter; this relies on m_iter being
-   *  declared (and therefore initialized) before m_end.
-   */
-  template< typename IntType >
-  KOKKOS_INLINE_FUNCTION
-  ScratchMemorySpace( void * ptr , const IntType & size )
-    : m_iter( (char *) ptr )
-    , m_end(  m_iter + size )
-    {}
-};
-
-} // namespace Kokkos
-
-#endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
--- a/lib/kokkos/core/src/Kokkos_Serial.hpp
+++ b/lib/kokkos/core/src/Kokkos_Serial.hpp
@ -1,879 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-/// \file Kokkos_Serial.hpp
-/// \brief Declaration and definition of Kokkos::Serial device.
-
-#ifndef KOKKOS_SERIAL_HPP
-#define KOKKOS_SERIAL_HPP
-
-#include <cstddef>
-#include <iosfwd>
-#include <Kokkos_Parallel.hpp>
-#include <Kokkos_Layout.hpp>
-#include <Kokkos_HostSpace.hpp>
-#include <Kokkos_ScratchSpace.hpp>
-#include <Kokkos_MemoryTraits.hpp>
-#include <impl/Kokkos_Tags.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
-
-#if defined( KOKKOS_HAVE_SERIAL )
-
-namespace Kokkos {
-
-/// \class Serial
-/// \brief Kokkos device that executes "parallel" kernels sequentially.
-///
-/// Every Kokkos "device" stands for a parallel execution model, i.e. it
-/// determines how the kernels of a parallel_for or parallel_reduce are
-/// run.  The Threads device relies on Pthreads or C++11 threads, the
-/// OpenMP device on the OpenMP language extensions, and the Cuda device
-/// on NVIDIA's CUDA programming model.  Serial runs the kernels one
-/// after another instead.  Pick it when threads are not wanted at all,
-/// or to experiment with different combinations of MPI and
-/// shared-memory parallel programming models.
-class Serial {
-public:
-  //! \name Type declarations that all Kokkos devices must provide.
-  //@{
-
-  //! Marks this class as an execution space.
-  typedef Serial                execution_space ;
-  //! Size type best suited for this device (taken from HostSpace).
-  typedef HostSpace::size_type  size_type ;
-  //! Preferred memory space of this device.
-  typedef HostSpace             memory_space ;
-  //! Preferred array layout of this device.
-  typedef LayoutRight           array_layout ;
-
-  //! Scratch memory space of this device.
-  typedef ScratchMemorySpace< Kokkos::Serial >  scratch_memory_space ;
-
-  //! Kept for backward compatibility.
-  typedef Serial                device_type ;
-
-  //@}
-
-  /// \brief Whether the caller is inside a thread-parallel function.
-  ///
-  /// Always zero (false) for Serial: its parallel_for and
-  /// parallel_reduce execute sequentially.
-  inline static int in_parallel() { return 0 ; }
-
-  /** \brief  Put the device into a "sleep" state.
-   *
-   * A sleeping device is not ready for work.  It may consume fewer
-   * resources than an "awake" device, but bringing it back to a
-   * ready state may take time.
-   *
-   * \return True if the device is in the "sleep" state; false if the
-   *   device is actively working and could not enter that state.
-   */
-  static bool sleep();
-
-  /// \brief Bring the device out of the "sleep" state, ready for work.
-  ///
-  /// \return True if the device is in the "ready" state; false if the
-  ///  device is actively working (and is therefore awake already).
-  static bool wake();
-
-  /// \brief Block until every dispatched functor has completed.
-  ///
-  /// Dispatching a functor through parallel_for or parallel_reduce may
-  /// return before the functor finishes; this call returns only once
-  /// all functors dispatched on this device are done.  Empty here.
-  static void fence() {}
-
-  //! Accepts the common initialization arguments and ignores all of them.
-  static void initialize( unsigned threads_count = 1 ,
-                          unsigned use_numa_count = 0 ,
-                          unsigned use_cores_per_numa = 0 ,
-                          bool allow_asynchronous_threadpool = false)
-  {
-    // Silence unused-parameter warnings.
-    (void) threads_count; (void) use_numa_count;
-    (void) use_cores_per_numa; (void) allow_asynchronous_threadpool;
-  }
-
-  //! Always reports the device as initialized.
-  static int is_initialized() { return 1 ; }
-
-  //! Release resources held by the device; nothing to do here.
-  static void finalize() {}
-
-  //! Write configuration information to the given output stream.
-  static void print_configuration( std::ostream & , const bool detail = false );
-
-  //--------------------------------------------------------------------------
-
-  //! The pool always holds exactly one thread ...
-  inline static int thread_pool_size( int = 0 ) { return 1 ; }
-  //! ... whose rank is zero.
-  KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
-
-  //--------------------------------------------------------------------------
-
-  //! Same value as thread_pool_rank().
-  KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id()
-    { return thread_pool_rank(); }
-
-  //! Same value as thread_pool_size().
-  inline static unsigned max_hardware_threads()
-    { return thread_pool_size(); }
-
-  //--------------------------------------------------------------------------
-
-  //! Request scratch memory for a reduction value and for team shared memory.
-  static void * scratch_memory_resize( unsigned reduce_size , unsigned shared_size );
-
-  //--------------------------------------------------------------------------
-};
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Pairs Serial's memory space with Serial's scratch memory
- *          space: 'value' is true and both verify() overloads do nothing.
- */
-template<>
-struct VerifyExecutionCanAccessMemorySpace
-  < Kokkos::Serial::memory_space , Kokkos::Serial::scratch_memory_space >
-{
-  enum { value = true };
-
-  static inline void verify() {}
-  static inline void verify( const void * ) {}
-};
-
-// Implementation details of the Serial back-end; declarations only,
-// the definitions live elsewhere.
-namespace SerialImpl {
-
-// Bookkeeping record reached through singleton().
-// Presumably it owns the buffer handed out by
-// Kokkos::Serial::scratch_memory_resize -- TODO confirm against the definition.
-struct Sentinel {
-
-  void *   m_scratch ;     // scratch buffer (ownership not visible here)
-  unsigned m_reduce_end ;  // NOTE(review): looks like the end offset of the reduce region -- confirm
-  unsigned m_shared_end ;  // NOTE(review): looks like the end offset of the shared region -- confirm
-
-  Sentinel();
-  ~Sentinel();
-  // Accessor for the single shared instance.
-  static Sentinel & singleton();
-};
-
-// NOTE(review): declared 'inline' but no definition is visible in this
-// header; every translation unit that uses it needs the definition.
-inline
-unsigned align( unsigned n );
-}
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Team-member handle of the Serial back-end.
- *
- *  A Serial team always consists of one thread (team_rank() == 0,
- *  team_size() == 1), so every team collective is trivial.
- */
-class SerialTeamMember {
-private:
-  typedef Kokkos::ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
-  const scratch_memory_space  m_space ;        // team scratch memory
-  const int                   m_league_rank ;  // index of this team within the league
-  const int                   m_league_size ;  // number of teams in the league
-
-  // Declared private: assignment is not available.
-  SerialTeamMember & operator = ( const SerialTeamMember & );
-
-public:
-
-  /** \brief  Scratch memory of this team. */
-  KOKKOS_INLINE_FUNCTION
-  const scratch_memory_space & team_shmem() const { return m_space ; }
-
-  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
-  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
-  KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
-  KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
-
-  /** \brief  No-op: there is only one thread in the team. */
-  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
-
-  /** \brief  No-op: the single thread already holds the value. */
-  template<class ValueType>
-  KOKKOS_INLINE_FUNCTION
-  void team_broadcast(const ValueType& , const int& ) const {}
-
-  /** \brief  Reduction over a team of one: returns 'value' unchanged;
-   *          the join operation is never invoked.
-   */
-  template< class ValueType, class JoinOp >
-  KOKKOS_INLINE_FUNCTION
-  ValueType team_reduce( const ValueType & value
-                                         , const JoinOp & ) const
-    {
-      return value ;
-    }
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
-   *          with intra-team non-deterministic ordering accumulation.
-   *
-   *  The global inter-team accumulation value will, at the end of the
-   *  league's parallel execution, be the scan's total.
-   *  Parallel execution ordering of the league's teams is non-deterministic.
-   *  As such the base value for each team's scan operation is similarly
-   *  non-deterministic.
-   *
-   *  \return The value of *global_accum before 'value' was added to it,
-   *          or Type(0) when global_accum is null.
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
-    {
-      const Type tmp = global_accum ? *global_accum : Type(0) ;
-      if ( global_accum ) { *global_accum += value ; }
-      return tmp ;
-    }
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
-   *
-   *  The highest rank thread can compute the reduction total as
-   *    reduction_total = dev.team_scan( value ) + value ;
-   *
-   *  With a single thread per team the exclusive prefix is Type(0).
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & ) const
-    { return Type(0); }
-
-#ifdef KOKKOS_HAVE_CXX11
-
-  /** \brief  Executes op(iType i) for each i=0..N-1.
-   *
-   * This functionality requires C++11 support.*/
-  template< typename iType, class Operation>
-  KOKKOS_INLINE_FUNCTION void team_par_for(const iType n, const Operation & op) const {
-    // The index has the same type as the bound: an 'int' index would
-    // overflow for bounds wider than int and mixes signedness in 'i<n'.
-    for(iType i=0; i<n ; ++i) {
-      op(i);
-    }
-  }
-
-#endif
-
-  //----------------------------------------
-  // Execution space specific:
-
-  SerialTeamMember( int arg_league_rank
-                  , int arg_league_size
-                  , int arg_shared_size
-                  );
-};
-
-} // namespace Impl
-
-
-/*
- * < Kokkos::Serial , WorkArgTag >
- * < WorkArgTag , Impl::enable_if< Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type >
- *
- */
-/** \brief  Team execution policy of the Serial back-end: an arbitrary
- *          number of teams (the league), each of size one.
- */
-template< class Arg0 , class Arg1 >
-class TeamPolicy< Arg0 , Arg1 , Kokkos::Serial >
-{
-private:
-
-  const int m_league_size ;
-
-public:
-
-  //! Marks this class as a Kokkos execution policy.
-  typedef TeamPolicy              execution_policy ;
-
-  //! Execution space this policy runs on.
-  typedef Kokkos::Serial          execution_space ;
-
-  //! Handle passed to the functor for each team.
-  typedef Impl::SerialTeamMember  member_type ;
-
-  //! Arg0 unless Arg0 is Kokkos::Serial, in which case Arg1.
-  typedef typename
-    Impl::if_c< ! Impl::is_same< Kokkos::Serial , Arg0 >::value , Arg0 , Arg1 >::type
-      work_tag ;
-
-  //----------------------------------------
-  // Team size limits: always one, whatever the functor.
-
-  template< class FunctorType >
-  static int team_size_max( const FunctorType & )
-    { return 1 ; }
-
-  template< class FunctorType >
-  static int team_size_recommended( const FunctorType & )
-    { return 1 ; }
-
-  //----------------------------------------
-
-  inline int team_size() const
-    { return 1 ; }
-
-  inline int league_size() const
-    { return m_league_size ; }
-
-  //----------------------------------------
-  // Both constructors honour only the league size request; the team
-  // size and vector length requests are ignored.
-
-  TeamPolicy( execution_space &
-            , int league_size_request
-            , int /* team_size_request */
-            , int vector_length_request = 1 )
-    : m_league_size( league_size_request )
-    {
-      (void) vector_length_request;
-    }
-
-  TeamPolicy( int league_size_request
-            , int /* team_size_request */
-            , int vector_length_request = 1 )
-    : m_league_size( league_size_request )
-    {
-      (void) vector_length_request;
-    }
-};
-
-} /* namespace Kokkos */
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  parallel_for over a RangePolicy on Serial: the constructor
- *          calls the functor once per index, in increasing order.
- */
-template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
-class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
-{
-private:
-
-  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
-
-public:
-
-  // Untagged policy (work_tag is void): functor( index )
-  template< class PType >
-  inline
-  ParallelFor( typename Impl::enable_if<
-                 ( Impl::is_same< PType , Policy >::value &&
-                   Impl::is_same< typename PType::work_tag , void >::value
-                 ), const FunctorType & >::type functor
-             , const PType & policy )
-    {
-      typedef typename PType::member_type index_type ;
-
-      const index_type stop  = policy.end();
-      index_type       iwork = policy.begin();
-
-      while ( iwork < stop ) {
-        functor( iwork );
-        ++iwork ;
-      }
-    }
-
-  // Tagged policy (work_tag is not void): functor( work_tag() , index )
-  template< class PType >
-  inline
-  ParallelFor( typename Impl::enable_if<
-                 ( Impl::is_same< PType , Policy >::value &&
-                   ! Impl::is_same< typename PType::work_tag , void >::value
-                 ), const FunctorType & >::type functor
-             , const PType & policy )
-    {
-      typedef typename PType::member_type index_type ;
-      typedef typename PType::work_tag    tag_type ;
-
-      const index_type stop  = policy.end();
-      index_type       iwork = policy.begin();
-
-      while ( iwork < stop ) {
-        functor( tag_type() , iwork );
-        ++iwork ;
-      }
-    }
-};
-
-/** \brief  parallel_reduce over a RangePolicy on Serial.
- *
- *  The constructor initializes the reduction value, calls the functor
- *  once per index in increasing order, then applies the functor's final
- *  step.  The value lives in the result view when that view has storage,
- *  otherwise in Serial scratch memory.
- */
-template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
-class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
-{
-public:
-  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
-  typedef typename Policy::work_tag                                  WorkTag ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-  // Untagged policy (work_tag is void): functor( index , update )
-  template< class ViewType , class PType >
-  ParallelReduce( typename Impl::enable_if<
-                    ( Impl::is_view< ViewType >::value &&
-                      Impl::is_same< typename ViewType::memory_space , HostSpace >::value &&
-                      Impl::is_same< PType , Policy >::value &&
-                      Impl::is_same< typename PType::work_tag , void >::value
-                    ), const FunctorType & >::type functor
-                , const PType     & policy
-                , const ViewType  & result
-                )
-    {
-      typedef typename PType::member_type index_type ;
-
-      pointer_type value_ptr = result.ptr_on_device();
-
-      // A view without storage: reduce into scratch memory instead.
-      if ( ! value_ptr ) {
-        value_ptr = (pointer_type)
-          Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
-      }
-
-      reference_type update = ValueInit::init( functor , value_ptr );
-
-      const index_type stop  = policy.end();
-      index_type       iwork = policy.begin();
-
-      while ( iwork < stop ) {
-        functor( iwork , update );
-        ++iwork ;
-      }
-
-      Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , value_ptr );
-    }
-
-  // Tagged policy (work_tag is not void): functor( work_tag() , index , update )
-  template< class ViewType , class PType >
-  ParallelReduce( typename Impl::enable_if<
-                    ( Impl::is_view< ViewType >::value &&
-                      Impl::is_same< typename ViewType::memory_space , HostSpace >::value &&
-                      Impl::is_same< PType , Policy >::value &&
-                      ! Impl::is_same< typename PType::work_tag , void >::value
-                    ), const FunctorType & >::type functor
-                , const PType     & policy
-                , const ViewType  & result
-                )
-    {
-      typedef typename PType::member_type index_type ;
-      typedef typename PType::work_tag    tag_type ;
-
-      pointer_type value_ptr = result.ptr_on_device();
-
-      // A view without storage: reduce into scratch memory instead.
-      if ( ! value_ptr ) {
-        value_ptr = (pointer_type)
-          Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
-      }
-
-      reference_type update = ValueInit::init( functor , value_ptr );
-
-      const index_type stop  = policy.end();
-      index_type       iwork = policy.begin();
-
-      while ( iwork < stop ) {
-        functor( tag_type() , iwork , update );
-        ++iwork ;
-      }
-
-      Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , value_ptr );
-    }
-};
-
-/** \brief  parallel_scan over a RangePolicy on Serial.
- *
- *  The constructor keeps the running value in Serial scratch memory,
- *  calls the functor once per index in increasing order with the final
- *  flag set to true, then applies the functor's final step.
- */
-template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
-class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
-{
-private:
-
-  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , typename Policy::work_tag > ValueInit ;
-
-public:
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-  // Untagged policy (work_tag is void): functor( index , update , true )
-  template< class PType >
-  inline
-  ParallelScan( typename Impl::enable_if<
-                 ( Impl::is_same< PType , Policy >::value &&
-                   Impl::is_same< typename PType::work_tag , void >::value
-                 ), const FunctorType & >::type functor
-             , const PType & policy )
-    {
-      typedef typename PType::member_type index_type ;
-
-      pointer_type value_ptr = (pointer_type)
-        Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
-
-      reference_type update = ValueInit::init( functor , value_ptr );
-
-      const index_type stop  = policy.end();
-      index_type       iwork = policy.begin();
-
-      while ( iwork < stop ) {
-        functor( iwork , update , true );
-        ++iwork ;
-      }
-
-      Kokkos::Impl::FunctorFinal<  FunctorType , typename Policy::work_tag >::final( functor , value_ptr );
-    }
-
-  // Tagged policy (work_tag is not void): functor( work_tag() , index , update , true )
-  template< class PType >
-  inline
-  ParallelScan( typename Impl::enable_if<
-                 ( Impl::is_same< PType , Policy >::value &&
-                   ! Impl::is_same< typename PType::work_tag , void >::value
-                 ), const FunctorType & >::type functor
-             , const PType & policy )
-    {
-      typedef typename PType::member_type index_type ;
-      typedef typename PType::work_tag    tag_type ;
-
-      pointer_type value_ptr = (pointer_type)
-        Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
-
-      reference_type update = ValueInit::init( functor , value_ptr );
-
-      const index_type stop  = policy.end();
-      index_type       iwork = policy.begin();
-
-      while ( iwork < stop ) {
-        functor( tag_type() , iwork , update , true );
-        ++iwork ;
-      }
-
-      Kokkos::Impl::FunctorFinal<  FunctorType , typename Policy::work_tag >::final( functor , value_ptr );
-    }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  parallel_for over a TeamPolicy on Serial: the constructor
- *          calls the functor once per league rank, in increasing order.
- */
-template< class FunctorType , class Arg0 , class Arg1 >
-class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > >
-{
-private:
-
-  typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ;
-
-  // Untagged dispatch: functor( member )
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
-                 const FunctorType & >::type functor
-             , const typename Policy::member_type & member )
-    {
-      functor( member );
-    }
-
-  // Tagged dispatch: functor( TagType() , member )
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
-                 const FunctorType & >::type functor
-             , const typename Policy::member_type & member )
-    {
-      functor( TagType() , member );
-    }
-
-public:
-
-  ParallelFor( const FunctorType & functor
-             , const Policy      & policy )
-    {
-      typedef typename Policy::member_type member_type ;
-      typedef typename Policy::work_tag    tag_type ;
-
-      // Team shared memory requested by the functor for this team size.
-      const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
-
-      Kokkos::Serial::scratch_memory_resize( 0 , shared_size );
-
-      // One team member handle per league rank.
-      int ileague = 0 ;
-      while ( ileague < policy.league_size() ) {
-        ParallelFor::template driver< tag_type >
-          ( functor , member_type( ileague , policy.league_size() , shared_size ) );
-        ++ileague ;
-      }
-    }
-};
-
-/** \brief  parallel_reduce over a TeamPolicy on Serial.
- *
- *  The constructor initializes the reduction value, calls the functor
- *  once per league rank in increasing order, then applies the functor's
- *  final step.  The value lives in the result view when that view has
- *  storage, otherwise in Serial scratch memory.
- */
-template< class FunctorType , class Arg0 , class Arg1 >
-class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > >
-{
-private:
-
-  typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , typename Policy::work_tag >  ValueInit ;
-
-public:
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-private:
-
-  // Untagged dispatch: functor( member , update )
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
-                 const FunctorType & >::type functor
-             , const typename Policy::member_type  & member
-             ,       reference_type                  update )
-    {
-      functor( member , update );
-    }
-
-  // Tagged dispatch: functor( TagType() , member , update )
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
-                 const FunctorType & >::type functor
-             , const typename Policy::member_type  & member
-             ,       reference_type                  update )
-    {
-      functor( TagType() , member , update );
-    }
-
-public:
-
-  template< class ViewType >
-  ParallelReduce( const FunctorType  & functor
-                , const Policy       & policy
-                , const ViewType     & result
-                )
-    {
-      typedef typename Policy::member_type member_type ;
-      typedef typename Policy::work_tag    tag_type ;
-
-      const int reduce_size = ValueTraits::value_size( functor );
-      const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
-
-      // Scratch for the reduction value and the team shared memory.
-      void * const scratch_reduce = Kokkos::Serial::scratch_memory_resize( reduce_size , shared_size );
-
-      // Reduce into the view when it has storage, else into scratch.
-      const pointer_type value_ptr =
-        result.ptr_on_device() ? result.ptr_on_device()
-                               : (pointer_type) scratch_reduce ;
-
-      reference_type update = ValueInit::init( functor , value_ptr );
-
-      // One team member handle per league rank.
-      int ileague = 0 ;
-      while ( ileague < policy.league_size() ) {
-        ParallelReduce::template driver< tag_type >
-          ( functor , member_type( ileague , policy.league_size() , shared_size ) , update );
-        ++ileague ;
-      }
-
-      Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , value_ptr );
-    }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-#ifdef KOKKOS_HAVE_CXX11
-
-namespace Kokkos {
-
-namespace Impl {
-  /** \brief  Bounds of a TeamThreadLoop for a SerialTeamMember: indices
-   *          start at 0, advance by 1 and stop before 'end'.
-   *
-   *  Holds a reference to the team member, so the member must outlive
-   *  this object.
-   */
-  template<typename iType>
-  struct TeamThreadLoopBoundariesStruct<iType,SerialTeamMember> {
-    typedef iType index_type;
-    enum {start = 0};
-    const iType end;
-    enum {increment = 1};
-    const SerialTeamMember& thread;
-
-    KOKKOS_INLINE_FUNCTION
-    TeamThreadLoopBoundariesStruct (const SerialTeamMember& team_member, const iType& n)
-      : end(n)
-      , thread(team_member)
-    {}
-  };
-
-  /** \brief  Bounds of a ThreadVectorLoop for a SerialTeamMember: indices
-   *          start at 0, advance by 1 and stop before 'end'.
-   *
-   *  The team member argument is accepted for interface symmetry only;
-   *  it is not stored.
-   */
-  template<typename iType>
-  struct ThreadVectorLoopBoundariesStruct<iType,SerialTeamMember> {
-    typedef iType index_type;
-    enum {start = 0};
-    const iType end;
-    enum {increment = 1};
-
-    KOKKOS_INLINE_FUNCTION
-    ThreadVectorLoopBoundariesStruct (const SerialTeamMember& , const iType& n)
-      : end( n )
-    {}
-  };
-} // namespace Impl
-
-/** \brief  Build the team-level loop bounds [0,count) for a Serial team member. */
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadLoopBoundariesStruct<iType,Impl::SerialTeamMember>
-  TeamThreadLoop(const Impl::SerialTeamMember& thread, const iType& count) {
-  typedef Impl::TeamThreadLoopBoundariesStruct<iType,Impl::SerialTeamMember> boundaries_type;
-  return boundaries_type(thread,count);
-}
-
-/** \brief  Build the vector-level loop bounds [0,count) for a Serial team member. */
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::SerialTeamMember >
-  ThreadVectorLoop(const Impl::SerialTeamMember& thread, const iType& count) {
-  typedef Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::SerialTeamMember > boundaries_type;
-  return boundaries_type(thread,count);
-}
-
-/** \brief  Wrap a Serial team member in the tag object consumed by the
- *          ThreadSingleStruct overloads of single(). */
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadSingleStruct<Impl::SerialTeamMember> PerTeam(const Impl::SerialTeamMember& thread) {
-  typedef Impl::ThreadSingleStruct<Impl::SerialTeamMember> single_type;
-  return single_type(thread);
-}
-
-/** \brief  Wrap a Serial team member in the tag object consumed by the
- *          VectorSingleStruct overloads of single(). */
-KOKKOS_INLINE_FUNCTION
-Impl::VectorSingleStruct<Impl::SerialTeamMember> PerThread(const Impl::SerialTeamMember& thread) {
-  typedef Impl::VectorSingleStruct<Impl::SerialTeamMember> single_type;
-  return single_type(thread);
-}
-
-} // namespace Kokkos
-
-namespace Kokkos {
-
-  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
-   *
-   * In this Serial specialization the calling thread visits every index in
-   * increasing order. This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::TeamThreadLoopBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries, const Lambda& lambda) {
-  const iType last = loop_boundaries.end;
-  iType i = loop_boundaries.start;
-  while( i < last ) {
-    lambda(i);
-    i += loop_boundaries.increment;
-  }
-}
-
-/** \brief  Inter-thread parallel_reduce (summation). Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * 'result' is first reset to ValueType(). Each call receives a fresh
- * ValueType() and its value is added to 'result' with operator+=. The sum is
- * finally passed through the team member's team_reduce with JoinAdd and
- * stored back into 'result'. This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadLoopBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
-                     const Lambda & lambda, ValueType& result) {
-
-  const iType last = loop_boundaries.end;
-
-  result = ValueType();
-
-  for( iType i = loop_boundaries.start; i < last; i += loop_boundaries.increment ) {
-    ValueType contribution = ValueType();
-    lambda(i,contribution);
-    result += contribution;
-  }
-
-  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
-}
-
-/** \brief  Inter-thread parallel_reduce with a user join. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The accumulator starts from the incoming value of init_result. Each call
- * of the lambda receives a fresh ValueType() (not a copy of init_result) and
- * the value it leaves there is folded in with join(accumulator, val). The
- * accumulator is then passed through the team member's team_reduce, wrapped
- * in a JoinLambdaAdapter, and written back to init_result.
- * NOTE(review): because temporaries are value-initialized, joins whose
- * neutral element differs from ValueType() (e.g. '1 for *') only work if the
- * lambda assigns rather than accumulates into val -- confirm the intent.
- * This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadLoopBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
-                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
-  typedef Impl::JoinLambdaAdapter<ValueType,JoinType> join_adapter;
-
-  const iType last = loop_boundaries.end;
-  ValueType accumulator = init_result;
-
-  for( iType i = loop_boundaries.start; i < last; i += loop_boundaries.increment ) {
-    ValueType contribution = ValueType();
-    lambda(i,contribution);
-    join(accumulator,contribution);
-  }
-
-  init_result = loop_boundaries.thread.team_reduce(accumulator,join_adapter(join));
-}
-
-} //namespace Kokkos
-
-namespace Kokkos {
-/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
- *
- * In this Serial specialization the calling thread runs a plain loop over
- * the indices; the loop is annotated with '#pragma ivdep' when
- * KOKKOS_HAVE_PRAGMA_IVDEP is defined.
- * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::SerialTeamMember >&
-    loop_boundaries, const Lambda& lambda) {
-  const iType last = loop_boundaries.end;
-#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < last; i += loop_boundaries.increment ) {
-    lambda(i);
-  }
-}
-
-/** \brief  Intra-thread vector parallel_reduce (summation). Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * 'result' is first reset to ValueType(). Each call receives a fresh
- * ValueType() and its value is added to 'result' with operator+=. The loop
- * is annotated with '#pragma ivdep' when KOKKOS_HAVE_PRAGMA_IVDEP is defined.
- * This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::SerialTeamMember >&
-      loop_boundaries, const Lambda & lambda, ValueType& result) {
-  const iType last = loop_boundaries.end;
-  result = ValueType();
-#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < last; i += loop_boundaries.increment ) {
-    ValueType contribution = ValueType();
-    lambda(i,contribution);
-    result += contribution;
-  }
-}
-
-/** \brief  Intra-thread vector parallel_reduce with a user join. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The accumulator starts from the incoming value of init_result. Each call
- * of the lambda receives a fresh ValueType() (not a copy of init_result) and
- * the value it leaves there is folded in with join(accumulator, val). The
- * final accumulator is written back to init_result.
- * NOTE(review): because temporaries are value-initialized, joins whose
- * neutral element differs from ValueType() (e.g. '1 for *') only work if the
- * lambda assigns rather than accumulates into val -- confirm the intent.
- * This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::SerialTeamMember >&
-      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
-  const iType last = loop_boundaries.end;
-  ValueType accumulator = init_result;
-#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < last; i += loop_boundaries.increment ) {
-    ValueType contribution = ValueType();
-    lambda(i,contribution);
-    join(accumulator,contribution);
-  }
-  init_result = accumulator;
-}
-
-/** \brief  Intra-thread vector parallel scan. Executes lambda(iType i, ValueType & val, bool final)
- *          for each i=0..N-1.
- *
- * In this Serial specialization the lambda is called exactly once per index,
- * in increasing order, always with final==true. 'val' is a single running
- * value that starts as value_type() and is carried from one call to the
- * next, so each call sees whatever the previous calls left in it; the lambda
- * is expected to add the contribution of 'i' to val. The running value is
- * local to this function and is not returned to the caller.
- * The value type is taken from FunctorValueTraits< FunctorType , void >.
- * This functionality requires C++11 support.*/
-template< typename iType, class FunctorType >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan(const Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::SerialTeamMember >&
-      loop_boundaries, const FunctorType & lambda) {
-
-  typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType , void >::value_type value_type ;
-
-  const iType last = loop_boundaries.end;
-  value_type running = value_type();
-
-#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < last; i += loop_boundaries.increment ) {
-    lambda(i,running,true);
-  }
-}
-
-} // namespace Kokkos
-
-namespace Kokkos {
-
-/** \brief  single() for a PerThread tag on a Serial team member: the lambda
- *          is simply invoked once by the caller; the tag argument is unused. */
-template<class FunctorType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
-  lambda();
-}
-
-/** \brief  single() for a PerTeam tag on a Serial team member: the lambda
- *          is simply invoked once by the caller; the tag argument is unused. */
-template<class FunctorType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
-  lambda();
-}
-
-/** \brief  single() for a PerThread tag with a value: invokes lambda(val)
- *          once, passing 'val' by reference; the tag argument is unused. */
-template<class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
-  lambda(val);
-}
-
-/** \brief  single() for a PerTeam tag with a value: invokes lambda(val)
- *          once, passing 'val' by reference; the tag argument is unused. */
-template<class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
-  lambda(val);
-}
-}
-#endif // KOKKOS_HAVE_CXX11
-
-#endif // defined( KOKKOS_HAVE_SERIAL )
-#endif /* #define KOKKOS_SERIAL_HPP */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
--- a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
@ -1,467 +0,0 @@
-
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-// Experimental unified task-data parallel manycore LDRD
-
-#ifndef KOKKOS_TASKPOLICY_HPP
-#define KOKKOS_TASKPOLICY_HPP
-
-#include <Kokkos_Core_fwd.hpp>
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
-#include <impl/Kokkos_StaticAssert.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Empty tag type. NOTE(review): the name suggests it marks an attempt to read
-// a value from a void-valued Future; no use is visible in this header -- confirm.
-struct FutureValueTypeIsVoidError {};
-
-// Forward declarations only; none of these templates is defined in this header.
-template < class ExecSpace , class ResultType , class FunctorType >
-class TaskMember ;
-
-template< class ExecPolicy , class ResultType , class FunctorType >
-class TaskForEach ;
-
-template< class ExecPolicy , class ResultType , class FunctorType >
-class TaskReduce ;
-
-template< class ExecPolicy , class ResultType , class FunctorType >
-struct TaskScan ;
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-/**\brief  States of a task
- *
- *  The non-null enumerators are distinct powers of two.
- *  NOTE(review): this looks intended to allow bitwise combination or
- *  masking of states -- confirm against the code that tests them.
- */
-enum TaskState
-  { TASK_STATE_NULL         = 0  ///<  Does not exist
-  , TASK_STATE_CONSTRUCTING = 1  ///<  Is under construction
-  , TASK_STATE_WAITING      = 2  ///<  Is waiting for execution
-  , TASK_STATE_EXECUTING    = 4  ///<  Is executing
-  , TASK_STATE_COMPLETE     = 8  ///<  Execution is complete
-  };
-
-template< class Arg1 = void , class Arg2 = void >
-class FutureArray ;
-
-/**
- *  \brief  Handle to a task, optionally typed by the task's result value.
- *
- *  Accepted argument forms:
- *
- *  Future< space >  // value_type == void
- *  Future< value >  // space == Default
- *  Future< value , space >
- *
- *  Any other combination fails the StaticAssert below. All changes to the
- *  held task pointer go through TaskRoot::assign.
- *  NOTE(review): assign presumably maintains a reference count on the
- *  task -- confirm in Impl::TaskMember.
- */
-template< class Arg1 = void , class Arg2 = void >
-class Future {
-private:
-
-  template< class , class , class > friend class Impl::TaskMember ;
-  template< class > friend class TaskPolicy ;
-  template< class , class > friend class Future ;
-  template< class , class > friend class FutureArray ;
-
-  // Argument #2, if not void, must be the space.
-  enum { Arg1_is_space  = Impl::is_execution_space< Arg1 >::value };
-  enum { Arg2_is_space  = Impl::is_execution_space< Arg2 >::value };
-  enum { Arg2_is_void   = Impl::is_same< Arg2 , void >::value };
-
-  struct ErrorNoExecutionSpace {};
-
-  // Opt1: Future< space >, Opt2: Future< value >, Opt3: Future< value , space >.
-  // OptOK forces a compile-time check that one of the three forms matched.
-  enum { Opt1  =   Arg1_is_space && Arg2_is_void
-       , Opt2  = ! Arg1_is_space && Arg2_is_void
-       , Opt3  = ! Arg1_is_space && Arg2_is_space
-       , OptOK = Impl::StaticAssert< Opt1 || Opt2 || Opt3 , ErrorNoExecutionSpace >::value
-       };
-
-  // Value type: Arg1 when Arg1 is not an execution space, otherwise void.
-  typedef typename
-    Impl::if_c< Opt2 || Opt3 , Arg1 , void >::type
-      ValueType ;
-
-  // Execution space: Arg1 (Opt1), the default space (Opt2) or Arg2 (Opt3).
-  typedef typename
-    Impl::if_c< Opt1 , Arg1 , typename
-    Impl::if_c< Opt2 , Kokkos::DefaultExecutionSpace , typename
-    Impl::if_c< Opt3 , Arg2 , void
-    >::type >::type >::type
-      ExecutionSpace ;
-
-  // TaskRoot is the value-independent task type stored by every Future of
-  // this space; TaskValue is the task type used to read a ValueType result.
-  typedef Impl::TaskMember< ExecutionSpace , void , void >       TaskRoot ;
-  typedef Impl::TaskMember< ExecutionSpace , ValueType , void >  TaskValue ;
-
-  // The referenced task, or null when this Future refers to no task.
-  TaskRoot * m_task ;
-
-public:
-
-  typedef ValueType       value_type;
-  typedef ExecutionSpace  execution_space ;
-
-  //----------------------------------------
-
-  // State of the referenced task, or TASK_STATE_NULL when no task is held.
-  KOKKOS_INLINE_FUNCTION
-  TaskState get_task_state() const
-    { return 0 != m_task ? m_task->get_state() : TASK_STATE_NULL ; }
-
-  //----------------------------------------
-
-  // Adopt a raw task pointer after passing it through verify_type for
-  // this Future's value_type.
-  explicit
-  Future( TaskRoot * task )
-    : m_task(0)
-    { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( task ) ); }
-
-  //----------------------------------------
-
-  // Release the held task by assigning null, requesting the no-throw path.
-  KOKKOS_INLINE_FUNCTION
-  ~Future() { TaskRoot::assign( & m_task , 0 , true /* no_throw */ ); }
-
-  //----------------------------------------
-
-  // A default-constructed Future refers to no task.
-  KOKKOS_INLINE_FUNCTION
-  Future() : m_task(0) {}
-
-  // Copy construction and assignment share the task held by 'rhs'.
-  KOKKOS_INLINE_FUNCTION
-  Future( const Future & rhs )
-    : m_task(0)
-    { TaskRoot::assign( & m_task , rhs.m_task ); }
-
-  KOKKOS_INLINE_FUNCTION
-  Future & operator = ( const Future & rhs )
-    { TaskRoot::assign( & m_task , rhs.m_task ); return *this ; }
-
-  //----------------------------------------
-
-  // Converting copy and assignment from a Future with different template
-  // arguments; the source task is passed through verify_type first.
-  template< class A1 , class A2 >
-  KOKKOS_INLINE_FUNCTION
-  Future( const Future<A1,A2> & rhs )
-    : m_task(0)
-    { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); }
-
-  template< class A1 , class A2 >
-  KOKKOS_INLINE_FUNCTION
-  Future & operator = ( const Future<A1,A2> & rhs )
-    { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); return *this ; }
-
-  //----------------------------------------
-
-  typedef typename TaskValue::get_result_type get_result_type ;
-
-  // Result of the referenced task. No null check is made on m_task here,
-  // so the Future must refer to a task when this is called.
-  KOKKOS_INLINE_FUNCTION
-  get_result_type get() const
-    { return static_cast<TaskValue*>( m_task )->get(); }
-};
-
-namespace Impl {
-
-// Type trait: derives from bool_< true > for any Kokkos::Future<Arg0,Arg1>
-// and from bool_< false > for every other type.
-template< class T >
-struct is_future : public Kokkos::Impl::bool_< false > {};
-
-template< class Arg0 , class Arg1 >
-struct is_future< Kokkos::Future<Arg0,Arg1> > : public Kokkos::Impl::bool_< true > {};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-/** \brief  Shared array of Future<Arg1,Arg2>.
- *
- *  Storage holds n+1 futures. Slot 0 is a header: size() reads its m_task
- *  pointer back as the element count. User element i lives in slot i+1
- *  (see operator[]). Copies share one allocation; in host code the sharing
- *  is tracked through MemorySpace::increment / MemorySpace::decrement.
- *
- *  NOTE(review): the sizing constructor zeroes slot 0 along with the
- *  elements and never stores 'n' there, so size() appears to return 0 unless
- *  the count is written elsewhere -- confirm.
- */
-template< class Arg1 , class Arg2 >
-class FutureArray {
-private:
-
-  typedef Future<Arg1,Arg2> future_type ;
-
-  typedef typename future_type::execution_space  ExecutionSpace ;
-  typedef typename ExecutionSpace::memory_space  MemorySpace ;
-
-  typedef Impl::TaskMember< ExecutionSpace , void , void >  TaskRoot ;
-
-  // Header slot followed by the elements, or null for an empty array.
-  future_type * m_future ;
-
-  //----------------------------------------
-
-public:
-
-  typedef ExecutionSpace  execution_space ;
-  typedef future_type     value_type ;
-
-  //----------------------------------------
-
-  // Element count as recorded in the header slot; 0 for an empty array.
-  KOKKOS_INLINE_FUNCTION
-  size_t size() const
-    { return m_future ? reinterpret_cast<size_t>(m_future->m_task) : size_t(0) ; }
-
-  // Element i; the +1 skips the header slot. No bounds check is made.
-  KOKKOS_INLINE_FUNCTION
-  value_type & operator[]( const int i ) const
-    { return m_future[i+1]; }
-
-  //----------------------------------------
-
-  // Host only: release the task held by each element, clear the header and
-  // drop this handle's reference on the allocation.
-  KOKKOS_INLINE_FUNCTION
-  ~FutureArray()
-    {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      if ( m_future ) {
-        const size_t n = size();
-        for ( size_t i = 1 ; i <= n ; ++i ) {
-          TaskRoot::assign( & m_future[i].m_task , 0 );
-        }
-        m_future[0].m_task = 0 ;
-        MemorySpace::decrement( m_future );
-      }
-#endif
-    }
-
-  KOKKOS_INLINE_FUNCTION
-  FutureArray() : m_future(0) {}
-
-  // Allocate room for n elements plus the header and null every task pointer.
-  // The storage is used as raw memory: no Future constructor is run.
-  inline
-  FutureArray( const size_t n )
-    : m_future(0)
-    {
-      if ( n ) {
-        m_future = (future_type *) MemorySpace::allocate( "FutureArray" , sizeof(future_type) * ( n + 1 ) );
-        for ( size_t i = 0 ; i <= n ; ++i ) m_future[i].m_task = 0 ;
-      }
-    }
-
-  // Share the storage of 'rhs'.
-  KOKKOS_INLINE_FUNCTION
-  FutureArray( const FutureArray & rhs )
-    : m_future( rhs.m_future )
-    {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      MemorySpace::increment( m_future );
-#endif
-    }
-
-  // Share the storage of 'rhs' in place of the current storage.
-  KOKKOS_INLINE_FUNCTION
-  FutureArray & operator = ( const FutureArray & rhs )
-    {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      // Acquire the new reference before releasing the old one: when both
-      // handles already refer to the same allocation (including plain
-      // self-assignment) releasing first could drop the last reference
-      // before it is re-acquired.
-      MemorySpace::increment( rhs.m_future );
-      MemorySpace::decrement( m_future );
-#endif
-      m_future = rhs.m_future ;
-      return *this ;
-    }
-};
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-/** \brief  If the argument is an execution space then a serial task in that space
- *
- *  This primary template only declares the interface; no member is defined
- *  in this header.
- *  NOTE(review): the '= default' markers on dependence_capacity are commented
- *  out, so as declared here the argument is mandatory, and the name
- *  add_dependence used by the free functions below is not declared here --
- *  presumably per-space specializations supply both; confirm.
- */
-template< class Arg0 = Kokkos::DefaultExecutionSpace >
-class TaskPolicy {
-public:
-
-  typedef typename Arg0::execution_space  execution_space ;
-
-  //----------------------------------------
-  /** \brief  Create a serial task with storage for dependences.
-   *
-   *  Postcondition: Task is in the 'constructing' state.
-   */
-  template< class FunctorType >
-  Future< typename FunctorType::value_type , execution_space >
-  create( const FunctorType & functor
-        , const unsigned      dependence_capacity /* = default */ ) const ;
-
-  /** \brief  Create a foreach task with storage for dependences. */
-  template< class ExecPolicy , class FunctorType >
-  Future< typename FunctorType::value_type , execution_space >
-  create_foreach( const ExecPolicy  & policy
-                , const FunctorType & functor
-                , const unsigned      dependence_capacity /* = default */ ) const ;
-
-  /** \brief  Create a reduce task with storage for dependences. */
-  template< class ExecPolicy , class FunctorType >
-  Future< typename FunctorType::value_type , execution_space >
-  create_reduce( const ExecPolicy  & policy
-               , const FunctorType & functor
-               , const unsigned      dependence_capacity /* = default */ ) const ;
-
-  /** \brief  Create a scan task with storage for dependences. */
-  template< class ExecPolicy , class FunctorType >
-  Future< typename FunctorType::value_type , execution_space >
-  create_scan( const ExecPolicy  & policy
-             , const FunctorType & functor
-             , const unsigned      dependence_capacity /* = default */ ) const ;
-
-  /** \brief  Set dependence that 'after' cannot start execution
-   *          until 'before' has completed.
-   *
-   *  Precondition: The 'after' task must be in the 'Constructing' state.
-   */
-  template< class TA , class TB >
-  void set_dependence( const Future<TA,execution_space> & after
-                     , const Future<TB,execution_space> & before ) const ;
-
-  /** \brief  Spawn a task in the 'Constructing' state
-   *
-   *  Precondition:  Task is in the 'constructing' state.
-   *  Postcondition: Task is waiting, executing, or complete.
-   */
-  template< class T >
-  const Future<T,execution_space> &
-  spawn( const Future<T,execution_space> & ) const ;
-
-  //----------------------------------------
-  /** \brief  Query dependence of an executing task */
-
-  template< class FunctorType >
-  Future< execution_space >
-  get_dependence( FunctorType * , const int ) const ;
-
-  //----------------------------------------
-  /** \brief  Clear current dependences of an executing task
-   *          in preparation for setting new dependences and
-   *          respawning.
-   *
-   * Precondition: The functor must be a task in the executing state.
-   */
-  template< class FunctorType >
-  void clear_dependence( FunctorType * ) const ;
-
-  /** \brief  Set dependence that 'after' cannot start execution
-   *          until 'before' has completed.
-   *
-   *  The 'after' functor must be in the executing state
-   */
-  template< class FunctorType , class TB >
-  void set_dependence( FunctorType * after
-                     , const Future<TB,execution_space> & before ) const ;
-
-  /** \brief  Respawn (reschedule) an executing task to be called again
-   *          after all dependences have completed.
-   */
-  template< class FunctorType >
-  void respawn( FunctorType * ) const ;
-};
-
-//----------------------------------------------------------------------------
-/** \brief  Create and spawn a single-thread task
- *
- *  Returns a copy of the future produced by policy.spawn().
- *  NOTE(review): create() is called with one argument, while the primary
- *  TaskPolicy template declares it with a mandatory dependence_capacity --
- *  presumably the policy specialization in use provides a default; confirm.
- */
-template< class ExecSpace , class FunctorType >
-inline
-Future< typename FunctorType::value_type , ExecSpace >
-spawn( TaskPolicy<ExecSpace> & policy , const FunctorType & functor )
-{ return policy.spawn( policy.create( functor ) ); }
-
-/** \brief  Create and spawn a single-thread task with dependences
- *
- *  The task is created with room for two dependences, both 'before'
- *  futures are registered, and only then is the task spawned. Both
- *  'before' futures must have the same Future<Arg0,Arg1> type.
- *  NOTE(review): this calls policy.add_dependence, whereas the primary
- *  TaskPolicy template declares set_dependence -- presumably the policy
- *  specialization in use provides add_dependence; confirm.
- */
-template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 >
-inline
-Future< typename FunctorType::value_type , ExecSpace >
-spawn( TaskPolicy<ExecSpace>   & policy
-     , const FunctorType       & functor
-     , const Future<Arg0,Arg1> & before_0
-     , const Future<Arg0,Arg1> & before_1 )
-{
-  Future< typename FunctorType::value_type , ExecSpace > f ;
-  f = policy.create( functor , 2 );
-  policy.add_dependence( f , before_0 );
-  policy.add_dependence( f , before_1 );
-  policy.spawn( f );
-  return f ;
-}
-
-//----------------------------------------------------------------------------
-/** \brief  Create and spawn a parallel_for task
- *
- *  NOTE(review): create_foreach() is called without a dependence_capacity,
- *  which the primary TaskPolicy template declares as mandatory -- presumably
- *  the policy specialization in use provides a default; confirm.
- */
-template< class ExecSpace , class ParallelPolicyType , class FunctorType >
-inline
-Future< typename FunctorType::value_type , ExecSpace >
-spawn_foreach( TaskPolicy<ExecSpace>     & task_policy
-             , const ParallelPolicyType  & parallel_policy
-             , const FunctorType         & functor )
-{ return task_policy.spawn( task_policy.create_foreach( parallel_policy , functor ) ); }
-
-/** \brief  Create and spawn a parallel_reduce task
- *
- *  NOTE(review): create_reduce() is called without a dependence_capacity,
- *  which the primary TaskPolicy template declares as mandatory -- presumably
- *  the policy specialization in use provides a default; confirm.
- */
-template< class ExecSpace , class ParallelPolicyType , class FunctorType >
-inline
-Future< typename FunctorType::value_type , ExecSpace >
-spawn_reduce( TaskPolicy<ExecSpace>     & task_policy
-            , const ParallelPolicyType  & parallel_policy
-            , const FunctorType         & functor )
-{ return task_policy.spawn( task_policy.create_reduce( parallel_policy , functor ) ); }
-
-//----------------------------------------------------------------------------
-/** \brief  Respawn a task functor with dependences
- *
- *  Existing dependences of 'functor' are cleared first, then the two
- *  'before' futures are registered and the task is respawned.
- *  NOTE(review): this calls policy.add_dependence, whereas the primary
- *  TaskPolicy template declares set_dependence -- presumably the policy
- *  specialization in use provides add_dependence; confirm.
- */
-template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 >
-inline
-void respawn( TaskPolicy<ExecSpace>   & policy
-            , FunctorType *             functor
-            , const Future<Arg0,Arg1> & before_0
-            , const Future<Arg0,Arg1> & before_1
-            )
-{
-  policy.clear_dependence( functor );
-  policy.add_dependence( functor , before_0 );
-  policy.add_dependence( functor , before_1 );
-  policy.respawn( functor );
-}
-
-//----------------------------------------------------------------------------
-
-template< class ExecSpace >
-void wait( TaskPolicy< ExecSpace > & );
-
-/** \brief  Wait on a future of any value type by converting it to a
- *          void-valued future of the same execution space and waiting on that.
- *
- *  NOTE(review): the inner call only terminates if a more specific overload
- *  of wait() for Future< void , space > is visible at instantiation;
- *  otherwise it resolves back to this template. No such overload is declared
- *  in this header -- confirm it is provided per execution space.
- */
-template< class A0 , class A1 >
-inline
-void wait( const Future<A0,A1> & future )
-{
-  wait( Future< void , typename Future<A0,A1>::execution_space >( future ) );
-}
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #define KOKKOS_TASKPOLICY_HPP */
-
--- a/lib/kokkos/core/src/Kokkos_Threads.hpp
+++ b/lib/kokkos/core/src/Kokkos_Threads.hpp
@ -1,214 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_THREADS_HPP
-#define KOKKOS_THREADS_HPP
-
-#include <Kokkos_Core_fwd.hpp>
-
-#if defined( KOKKOS_HAVE_PTHREAD )
-
-#include <cstddef>
-#include <iosfwd>
-#include <Kokkos_HostSpace.hpp>
-#include <Kokkos_ScratchSpace.hpp>
-#include <Kokkos_Layout.hpp>
-#include <Kokkos_MemoryTraits.hpp>
-#include <impl/Kokkos_Tags.hpp>
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-class ThreadsExec ;
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-
-/** \brief  Execution space for a pool of Pthreads or C11 threads on a CPU.
- *
- *  Every member function is static; only declarations appear here, apart
- *  from the small inline forwarding functions at the end of the class.
- */
-class Threads {
-public:
-  //! \name Type declarations that all Kokkos devices must provide.
-  //@{
-  //! Tag this class as a kokkos execution space
-  typedef Threads                  execution_space ;
-  typedef Kokkos::HostSpace        memory_space ;
-  typedef Kokkos::LayoutRight      array_layout ;
-  typedef memory_space::size_type  size_type ;
-
-  typedef ScratchMemorySpace< Threads >  scratch_memory_space ;
-
-  //! For backward compatibility
-  typedef Threads                  device_type ;
-
-  //@}
-  /*------------------------------------------------------------------------*/
-  //! \name Static functions that all Kokkos devices must implement.
-  //@{
-
-  /// \brief True if and only if this method is being called in a
-  ///   thread-parallel function.
-  static int in_parallel();
-
-  /** \brief  Set the device in a "sleep" state.
-   *
-   * This function sets the device in a "sleep" state in which it is
-   * not ready for work.  This may consume less resources than if the
-   * device were in an "awake" state, but it may also take time to
-   * bring the device from a sleep state to be ready for work.
-   *
-   * \return True if the device is in the "sleep" state, else false if
-   *   the device is actively working and could not enter the "sleep"
-   *   state.
-   */
-  static bool sleep();
-
-  /// \brief Wake the device from the 'sleep' state so it is ready for work.
-  ///
-  /// \return True if the device is in the "ready" state, else "false"
-  ///  if the device is actively working (which also means that it's
-  ///  awake).
-  static bool wake();
-
-  /// \brief Wait until all dispatched functors complete.
-  ///
-  /// The parallel_for or parallel_reduce dispatch of a functor may
-  /// return asynchronously, before the functor completes.  This
-  /// method does not return until all dispatched functors on this
-  /// device have completed.
-  static void fence();
-
-  /// \brief Free any resources being consumed by the device.
-  ///
-  /// For the Threads device, this terminates spawned worker threads.
-  static void finalize();
-
-  /// \brief Print configuration information to the given output stream.
-  static void print_configuration( std::ostream & , const bool detail = false );
-
-  //@}
-  /*------------------------------------------------------------------------*/
-  /*------------------------------------------------------------------------*/
-  //! \name Space-specific functions
-  //@{
-
-  /** \brief Initialize the device in the "ready to work" state.
-   *
-   *  The device is initialized in a "ready to work" or "awake" state.
-   *  This state reduces latency and thus improves performance when
-   *  dispatching work.  However, the "awake" state consumes resources
-   *  even when no work is being done.  You may call sleep() to put
-   *  the device in a "sleeping" state that does not consume as many
-   *  resources, but it will take time (latency) to awaken the device
-   *  again (via the wake()) method so that it is ready for work.
-   *
-   *  Teams of threads are distributed as evenly as possible across
-   *  the requested number of numa regions and cores per numa region.
-   *  A team will not be split across a numa region.
-   *
-   *  If the 'use_' arguments are not supplied the hwloc is queried
-   *  to use all available cores.
-   */
-  static void initialize( unsigned threads_count = 1 ,
-                          unsigned use_numa_count = 0 ,
-                          unsigned use_cores_per_numa = 0 ,
-                          bool allow_asynchronous_threadpool = false );
-
-  /// \brief Query the initialization state.
-  /// NOTE(review): presumably non-zero once initialize() has run -- confirm.
-  static int is_initialized();
-
-  /// \brief Access an instance of this space; the meaning of the integer
-  ///   argument is not documented in this header.
-  static Threads & instance( int = 0 );
-
-  //----------------------------------------
-
-  /// \brief Size of the thread pool for the given depth.
-  /// NOTE(review): the meaning of 'depth' is not shown here -- confirm.
-  static int thread_pool_size( int depth = 0 );
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-  /// \brief Rank of the calling thread within the pool (host code).
-  static int thread_pool_rank();
-#else
-  // When not compiling for the host memory space the rank is always 0.
-  KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
-#endif
-
-  // Convenience forwards to thread_pool_size(0) and thread_pool_rank().
-  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
-  KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
-
-  //@}
-  //----------------------------------------
-};
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-// Specialization for the pair ( Threads::memory_space ,
-// Threads::scratch_memory_space ): access is always allowed, so 'value'
-// is true and both verify() overloads are no-ops.
-template<>
-struct VerifyExecutionCanAccessMemorySpace
-  < Kokkos::Threads::memory_space
-  , Kokkos::Threads::scratch_memory_space
-  >
-{
-  enum { value = true };
-  inline static void verify( void ) { }
-  inline static void verify( const void * ) { }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-
-#include <Kokkos_ExecPolicy.hpp>
-#include <Kokkos_Parallel.hpp>
-#include <Threads/Kokkos_ThreadsExec.hpp>
-#include <Threads/Kokkos_Threads_Parallel.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
-#endif /* #define KOKKOS_THREADS_HPP */
-
-
--- a/lib/kokkos/core/src/Kokkos_Vectorization.hpp
+++ b/lib/kokkos/core/src/Kokkos_Vectorization.hpp
@ -1,100 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-/// \file Kokkos_Vectorization.hpp
-/// \brief Declaration and definition of Kokkos::Vectorization interface.
-#ifndef KOKKOS_VECTORIZATION_HPP
-#define KOKKOS_VECTORIZATION_HPP
-
-#include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_ExecPolicy.hpp>
-
-namespace Kokkos {
-template<class Space, int N>
-struct Vectorization {
-  typedef Kokkos::TeamPolicy< Space >        team_policy ;
-  typedef typename team_policy::member_type  team_member ;
-
-  enum {increment = 1};
-
-  KOKKOS_FORCEINLINE_FUNCTION
-  static int begin() { return 0;}
-
-  KOKKOS_FORCEINLINE_FUNCTION
-  static int thread_rank(const team_member &dev) {
-    return dev.team_rank();
-  }
-
-  KOKKOS_FORCEINLINE_FUNCTION
-  static int team_rank(const team_member &dev) {
-    return dev.team_rank()/increment;
-  }
-
-  KOKKOS_FORCEINLINE_FUNCTION
-  static int team_size(const team_member &dev) {
-    return dev.team_size()/increment;
-  }
-
-  KOKKOS_FORCEINLINE_FUNCTION
-  static int global_thread_rank(const team_member &dev) {
-    return (dev.league_rank()*dev.team_size()+dev.team_rank());
-  }
-
-  KOKKOS_FORCEINLINE_FUNCTION
-  static bool is_lane_0(const team_member &dev) {
-    return true;
-  }
-
-  template<class Scalar>
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar reduce(const Scalar& val) {
-    return val;
-  }
-};
-}
-
-#if defined( KOKKOS_HAVE_CUDA )
-#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
-#endif
-
-#endif
--- a/lib/kokkos/core/src/Kokkos_View.hpp
+++ b/lib/kokkos/core/src/Kokkos_View.hpp
--- a/lib/kokkos/core/src/Kokkos_hwloc.hpp
+++ b/lib/kokkos/core/src/Kokkos_hwloc.hpp
@ -1,140 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_HWLOC_HPP
-#define KOKKOS_HWLOC_HPP
-
-#include <utility>
-
-namespace Kokkos {
-
-/** \brief  Minimal subset of logical 'hwloc' functionality available
- *          from http://www.open-mpi.org/projects/hwloc/.
- *
- *  The calls are NOT thread safe in order to avoid mutexes,
- *  memory allocations, or other actions which could give the
- *  runtime system an opportunity to migrate the threads or
- *  touch allocated memory during the function calls.
- *
- *  All calls to these functions should be performed by a thread
- *  when it has guaranteed exclusive access; e.g., for OpenMP
- *  within a 'critical' region.
- */
-namespace hwloc {
-
-/** \brief  Query if hwloc is available */
-bool available();
-
-/** \brief  Query number of available NUMA regions.
- *          This will be less than the hardware capacity
- *          if the MPI process is pinned to a NUMA region.
- */
-unsigned get_available_numa_count();
-
-/** \brief  Query number of available cores per NUMA regions.
- *          This will be less than the hardware capacity
- *          if the MPI process is pinned to a set of cores.
- */
-unsigned get_available_cores_per_numa();
-
-/** \brief  Query number of available "hard" threads per core; i.e., hyperthreads */
-unsigned get_available_threads_per_core();
-
-} /* namespace hwloc */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-// Internal functions for binding persistent spawned threads.
-
-namespace Kokkos {
-namespace hwloc {
-
-/** \brief  Recommend mapping of threads onto cores.
- *
- * If thread_count == 0 then choose and set a value.
- * If use_numa_count == 0 then choose and set a value.
- * If use_cores_per_numa == 0 then choose and set a value.
- *
- * Return 0 if asynchronous,
- * Return 1 if synchronous and threads_coord[0] is process core
- */
-unsigned thread_mapping( const char * const label ,
-                         const bool allow_async ,
-                         unsigned & thread_count ,
-                         unsigned & use_numa_count ,
-                         unsigned & use_cores_per_numa ,
-                         std::pair<unsigned,unsigned> threads_coord[] );
-
-/** \brief  Query core-coordinate of the current thread
- *          with respect to the core_topology.
- *
- *  As long as the thread is running within the 
- *  process binding the following condition holds.
- *
- *  core_coordinate.first  < core_topology.first
- *  core_coordinate.second < core_topology.second
- */
-std::pair<unsigned,unsigned> get_this_thread_coordinate();
-
-/** \brief  Bind the current thread to a core. */
-bool bind_this_thread( const std::pair<unsigned,unsigned> );
-
-/** \brief  Bind the current thread to one of the cores in the list.
- *          Set that entry to (~0,~0) and return the index.
- *          If binding fails return ~0.
- */
-unsigned bind_this_thread( const unsigned               coordinate_count ,
-                           std::pair<unsigned,unsigned> coordinate[] );
-
-/** \brief  Unbind the current thread back to the original process binding */
-bool unbind_this_thread();
-
-} /* namespace hwloc */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #define KOKKOS_HWLOC_HPP */
-
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
@ -1,496 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_OPENMP_PARALLEL_HPP
-#define KOKKOS_OPENMP_PARALLEL_HPP
-
-#include <omp.h>
-
-#include <Kokkos_Parallel.hpp>
-#include <OpenMP/Kokkos_OpenMPexec.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
-class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
-{
-private:
-
-  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
-
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
-                 const FunctorType & >::type functor
-             , const PType & range )
-    {
-      const typename PType::member_type work_end = range.end();
-      for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
-        functor( iwork );
-      }
-    }
-
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
-                 const FunctorType & >::type functor
-             , const PType & range )
-    {
-      const typename PType::member_type work_end = range.end();
-      for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
-        functor( typename PType::work_tag() , iwork );
-      }
-    }
-
-public:
-
-  inline
-  ParallelFor( const FunctorType & functor
-             , const Policy      & policy )
-    {
-      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
-      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
-
-#pragma omp parallel
-      {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-        driver( functor , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() ) );
-      }
-/* END #pragma omp parallel */
-    }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
-class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
-{
-private:
-
-  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
-  typedef typename Policy::work_tag                                  WorkTag ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
-  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , WorkTag >  ValueJoin ;
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
-                 const FunctorType & >::type functor
-             , reference_type update
-             , const PType & range )
-    {
-      const typename PType::member_type work_end = range.end();
-      for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
-        functor( iwork , update );
-      }
-    }
-
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
-                 const FunctorType & >::type functor
-             , reference_type update
-             , const PType & range )
-    {
-      const typename PType::member_type work_end = range.end();
-      for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
-        functor( typename PType::work_tag() , iwork , update );
-      }
-    }
-
-public:
-
-  //----------------------------------------
-
-  template< class ViewType >
-  inline
-  ParallelReduce( typename Impl::enable_if<
-                    ( Impl::is_view< ViewType >::value &&
-                      Impl::is_same< typename ViewType::memory_space , HostSpace >::value
-                    ), const FunctorType & >::type functor
-                , const Policy    & policy
-                , const ViewType  & result_view )
-  {
-    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
-    OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
-
-    OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , 0 );
-
-#pragma omp parallel
-    {
-      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-
-      driver( functor
-            , ValueInit::init( functor , exec.scratch_reduce() )
-            , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
-            );
-    }
-/* END #pragma omp parallel */
-
-    {
-      const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
-
-      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-        ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
-      }
-
-      Kokkos::Impl::FunctorFinal<  FunctorType , WorkTag >::final( functor , ptr );
-
-      if ( result_view.ptr_on_device() ) {
-        const int n = ValueTraits::value_count( functor );
-
-        for ( int j = 0 ; j < n ; ++j ) { result_view.ptr_on_device()[j] = ptr[j] ; }
-      }
-    }
-  }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
-class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
-{
-private:
-
-  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
-  typedef typename Policy::work_tag                                  WorkTag ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
-  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , WorkTag >  ValueJoin ;
-  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , WorkTag >  ValueOps ;
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
-                 const FunctorType & >::type functor
-             , reference_type update
-             , const PType & range
-             , const bool    final )
-    {
-      const typename PType::member_type work_end = range.end();
-      for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
-        functor( iwork , update , final );
-      }
-    }
-
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
-                 const FunctorType & >::type functor
-             , reference_type update
-             , const PType & range
-             , const bool    final )
-    {
-      const typename PType::member_type work_end = range.end();
-      for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
-        functor( typename PType::work_tag() , iwork , update , final );
-      }
-    }
-
-public:
-
-  //----------------------------------------
-
-  inline
-  ParallelScan( const FunctorType & functor
-              , const Policy      & policy )
-  {
-    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
-    OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
-
-    OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( functor ) , 0 );
-
-#pragma omp parallel
-    {
-      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-
-      driver( functor
-            , ValueInit::init( functor , pointer_type( exec.scratch_reduce() ) + ValueTraits::value_count( functor ) )
-            , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
-            , false );
-    }
-/* END #pragma omp parallel */
-
-    {
-      const unsigned thread_count = OpenMPexec::pool_size();
-      const unsigned value_count  = ValueTraits::value_count( functor );
-
-      pointer_type ptr_prev = 0 ;
-
-      for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
-
-        pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
-
-        if ( ptr_prev ) {
-          for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
-          ValueJoin::join( functor , ptr + value_count , ptr );
-        }
-        else {
-          ValueInit::init( functor , ptr );
-        }
-
-        ptr_prev = ptr ;
-      }
-    }
-
-#pragma omp parallel
-    {
-      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-
-      driver( functor
-            , ValueOps::reference( pointer_type( exec.scratch_reduce() ) )
-            , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
-            , true );
-    }
-/* END #pragma omp parallel */
-
-  }
-
-  //----------------------------------------
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template< class FunctorType , class Arg0 , class Arg1 >
-class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > >
-{
-private:
-
-  typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > Policy ;
-
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
-                 const FunctorType & >::type functor
-             , const typename Policy::member_type  & member )
-    { functor( member ); }
-
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
-                 const FunctorType & >::type functor
-             , const typename Policy::member_type  & member )
-    { functor( TagType() , member ); }
-
-public:
-
-  inline
-  ParallelFor( const FunctorType & functor ,
-               const Policy      & policy )
-  {
-    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
-    OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
-
-    const size_t team_reduce_size = Policy::member_type::team_reduce_size();
-    const size_t team_shmem_size  = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
-
-    OpenMPexec::resize_scratch( 0 , team_reduce_size + team_shmem_size );
-
-#pragma omp parallel
-    {
-      typename Policy::member_type member( * OpenMPexec::get_thread_omp() , policy , team_shmem_size );
-
-      for ( ; member.valid() ; member.next() ) {
-        ParallelFor::template driver< typename Policy::work_tag >( functor , member );
-      }
-    }
-/* END #pragma omp parallel */
-  }
-
-  void wait() {}
-};
-
-
-template< class FunctorType , class Arg0 , class Arg1 >
-class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > >
-{
-private:
-
-  typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP >         Policy ;
-  typedef typename Policy::work_tag                                  WorkTag ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
-  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , WorkTag >  ValueJoin ;
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
-                 const FunctorType & >::type functor
-             , const typename PType::member_type  & member
-             ,       reference_type update )
-    { functor( member , update ); }
-
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
-                 const FunctorType & >::type functor
-             , const typename PType::member_type  & member
-             ,       reference_type update )
-    { functor( typename PType::work_tag() , member , update ); }
-
-public:
-
-  inline
-  ParallelReduce( const FunctorType  & functor ,
-                  const Policy       & policy )
-  {
-    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
-
-    const size_t team_reduce_size = Policy::member_type::team_reduce_size();
-    const size_t team_shmem_size  = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
-
-    OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size );
-
-#pragma omp parallel
-    {
-      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-
-      reference_type update = ValueInit::init( functor , exec.scratch_reduce() );
-
-      for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
-        ParallelReduce::template driver< Policy >( functor , member , update );
-      }
-    }
-/* END #pragma omp parallel */
-
-    {
-      typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag , reference_type >  Join ;
-
-      const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
-
-      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-        Join::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
-      }
-
-      Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
-    }
-  }
-
-  template< class ViewType >
-  inline
-  ParallelReduce( const FunctorType  & functor ,
-                  const Policy       & policy ,
-                  const ViewType     & result )
-  {
-    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
-
-    const size_t team_reduce_size = Policy::member_type::team_reduce_size();
-    const size_t team_shmem_size  = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
-
-    OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size );
-
-#pragma omp parallel
-    {
-      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-
-      reference_type update = ValueInit::init( functor , exec.scratch_reduce() );
-
-      for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
-        ParallelReduce::template driver< Policy >( functor , member , update );
-      }
-    }
-/* END #pragma omp parallel */
-
-    {
-      const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
-
-      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-        ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
-      }
-
-      Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
-
-      const int n = ValueTraits::value_count( functor );
-
-      for ( int j = 0 ; j < n ; ++j ) { result.ptr_on_device()[j] = ptr[j] ; }
-    }
-  }
-
-  void wait() {}
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* KOKKOS_OPENMP_PARALLEL_HPP */
-
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
@ -1,365 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <stdio.h>
-#include <limits>
-#include <iostream>
-#include <vector>
-#include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Error.hpp>
-#include <iostream>
-
-#ifdef KOKKOS_HAVE_OPENMP
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-// Forward declaration so the callback can be registered by the initializer below.
-KOKKOS_INLINE_FUNCTION
-int kokkos_omp_in_parallel();
-
-// Flag raised (set to 1) by code in this file while it executes inside an
-// 'omp critical' section.  Its initializer is a comma expression: it first
-// calls HostSpace::register_in_parallel( kokkos_omp_in_parallel ) and then
-// yields 0 as the initial value of the flag.
-int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
-
-// Returns non-zero when the caller is inside an OpenMP parallel region and
-// kokkos_omp_in_critical_region is not raised.  When __CUDA_ARCH__ is
-// defined the function always returns 0.
-KOKKOS_INLINE_FUNCTION
-int kokkos_omp_in_parallel()
-{
-#ifdef __CUDA_ARCH__
-  return 0;
-#else
-  if ( kokkos_omp_in_critical_region ) { return 0 ; }
-  return omp_in_parallel() ? 1 : 0 ;
-#endif
-}
-
-// True when OpenMP::initialize decided to bind threads through hwloc;
-// consulted again by finalize and print_configuration.
-bool s_using_hwloc = false;
-
-} // namespace
-} // namespace Impl
-} // namespace Kokkos
-
-
-namespace Kokkos {
-namespace Impl {
-
-// Maps omp_get_thread_num() to the index used for m_pool.
-int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
-
-// Thread pool topology; entries [0], [1] and [2] are filled in by OpenMP::initialize.
-int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
-
-// Per-thread execution records; an entry is null until resize_scratch allocates it.
-OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
-
-// Throws a runtime exception whose message starts with 'label' when called
-// from inside an OpenMP parallel region; otherwise returns normally.
-void OpenMPexec::verify_is_process( const char * const label )
-{
-  if ( ! omp_in_parallel() ) { return ; }
-
-  std::string message( label );
-  message += " ERROR: in parallel" ;
-  Kokkos::Impl::throw_runtime_exception( message );
-}
-
-// Throws a runtime exception whose message starts with 'label' when the
-// first thread pool entry has not been allocated; otherwise returns normally.
-void OpenMPexec::verify_initialized( const char * const label )
-{
-  if ( 0 != m_pool[0] ) { return ; }
-
-  std::string message( label );
-  message += " ERROR: not initialized" ;
-  Kokkos::Impl::throw_runtime_exception( message );
-}
-
-// Destroys and releases the execution record of every thread in the pool and
-// resets the corresponding m_pool entries to null.
-//
-// Each OpenMP thread handles its own entry (selected through m_map_rank)
-// inside a critical section.  Entries that are still null are skipped:
-// resize_scratch calls this function during the very first allocation, when
-// no record exists yet, and invoking the destructor through a null pointer
-// is undefined behavior.
-void OpenMPexec::clear_scratch()
-{
-#pragma omp parallel
-  {
-    const int rank_rev = m_map_rank[ omp_get_thread_num() ];
-
-#pragma omp critical
-    {
-      // Raise the flag so kokkos_omp_in_parallel() reports 'not in parallel'
-      // while the memory space is being modified.
-      kokkos_omp_in_critical_region = 1 ;
-
-      if ( 0 != m_pool[ rank_rev ] ) {
-        m_pool[ rank_rev ]->~OpenMPexec();
-        HostSpace::decrement( m_pool[ rank_rev ] );
-        m_pool[ rank_rev ] = 0 ;
-      }
-
-      kokkos_omp_in_critical_region = 0 ;
-    }
-/* END #pragma omp critical */
-  }
-/* END #pragma omp parallel */
-}
-
-// Ensures every thread's scratch allocation provides at least 'reduce_size'
-// bytes of reduction scratch and 'thread_size' bytes of thread scratch.
-//
-// Both requests are rounded up to Kokkos::Impl::MEMORY_ALIGNMENT.  When either
-// request exceeds the current capacity all records are released and
-// re-allocated (one record per OpenMP thread) using the larger of the old and
-// the requested size for each region; otherwise nothing happens.
-void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
-{
-  enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
-  enum { ALLOC_EXEC = ( sizeof(OpenMPexec) + ALIGN_MASK ) & ~ALIGN_MASK };
-
-  // The m_scratch_*_end members are cumulative offsets from the start of the
-  // record (exec header, then reduce scratch, then thread scratch).  The
-  // capacity of a region is therefore the difference of two consecutive
-  // offsets; using m_scratch_reduce_end alone would overstate the reduce
-  // capacity by the size of the exec header.
-  const size_t old_reduce_size = m_pool[0] ? m_pool[0]->m_scratch_reduce_end - m_pool[0]->m_scratch_exec_end : 0 ;
-  const size_t old_thread_size = m_pool[0] ? m_pool[0]->m_scratch_thread_end - m_pool[0]->m_scratch_reduce_end : 0 ;
-
-  reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
-  thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
-
-  // Requesting allocation and old allocation is too small:
-
-  const bool allocate = ( old_reduce_size < reduce_size ) ||
-                        ( old_thread_size < thread_size );
-
-  if ( allocate ) {
-    // Never shrink a region that is already large enough.
-    if ( reduce_size < old_reduce_size ) { reduce_size = old_reduce_size ; }
-    if ( thread_size < old_thread_size ) { thread_size = old_thread_size ; }
-  }
-
-  const size_t alloc_size = allocate ? ALLOC_EXEC + reduce_size + thread_size : 0 ;
-  const int    pool_size  = m_pool_topo[0] ;
-
-  if ( allocate ) {
-
-    clear_scratch();
-
-#pragma omp parallel
-    {
-      const int rank_rev = m_map_rank[ omp_get_thread_num() ];
-      const int rank     = pool_size - ( rank_rev + 1 );
-
-#pragma omp critical
-      {
-        // Raise the flag so kokkos_omp_in_parallel() reports 'not in parallel'
-        // while the memory space is being modified.
-        kokkos_omp_in_critical_region = 1 ;
-
-        // Each thread allocates its own record and constructs it in place.
-        m_pool[ rank_rev ] =
-          (OpenMPexec *) HostSpace::allocate( "openmp_scratch" , alloc_size );
-        new( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
-
-        kokkos_omp_in_critical_region = 0 ;
-      }
-/* END #pragma omp critical */
-    }
-/* END #pragma omp parallel */
-  }
-}
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-// Non-zero once the first thread pool entry has been allocated.
-int OpenMP::is_initialized()
-{
-  const bool have_pool = ( Impl::OpenMPexec::m_pool[0] != 0 );
-  return have_pool ;
-}
-
-// Initializes the OpenMP thread pool.
-//
-//   thread_count        number of threads to use; 0 selects the hwloc-derived
-//                       maximum when hwloc binding is used, otherwise the
-//                       OpenMP maximum recorded on the first call.
-//   use_numa_count      forwarded to hwloc::thread_mapping and used to derive
-//                       the pool topology when hwloc binding is used.
-//   use_cores_per_numa  likewise.
-//
-// Throws a runtime exception when the pool is already initialized, when the
-// requested thread count exceeds OpenMPexec::MAX_THREAD_COUNT, or when the
-// OpenMP runtime does not provide the requested number of threads.
-void OpenMP::initialize( unsigned thread_count ,
-                         unsigned use_numa_count ,
-                         unsigned use_cores_per_numa )
-{
-  // Before any other call to OMP query the maximum number of threads
-  // and save the value for re-initialization unit testing.
-  static int omp_max_threads = omp_get_max_threads();
-
-  const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
-
-  bool thread_spawn_failed = false ;
-
-  if ( ! is_initialized ) {
-
-    // Use hwloc thread pinning if concerned with locality.
-    // If spreading threads across multiple NUMA regions.
-    // If hyperthreading is enabled.
-    Impl::s_using_hwloc = hwloc::available() && (
-                            ( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
-                            ( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );
-
-    std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ];
-
-    // If hwloc is available then use its maximum value.
-
-    if ( thread_count == 0 ) {
-      thread_count = Impl::s_using_hwloc
-      ? Kokkos::hwloc::get_available_numa_count() *
-        Kokkos::hwloc::get_available_cores_per_numa() *
-        Kokkos::hwloc::get_available_threads_per_core()
-      : omp_max_threads ;
-    }
-
-    // threads_coord, m_map_rank and m_pool all hold MAX_THREAD_COUNT entries
-    // and are indexed by thread id, so a larger pool would write out of bounds.
-    if ( unsigned(Impl::OpenMPexec::MAX_THREAD_COUNT) < thread_count ) {
-      Kokkos::Impl::throw_runtime_exception(
-        std::string("Kokkos::OpenMP::initialize ERROR : thread count exceeds OpenMPexec::MAX_THREAD_COUNT") );
-    }
-
-    if(Impl::s_using_hwloc)
-      hwloc::thread_mapping( "Kokkos::OpenMP::initialize" ,
-                           false /* do not allow asynchronous */ ,
-                           thread_count ,
-                           use_numa_count ,
-                           use_cores_per_numa ,
-                           threads_coord );
-
-    // Spawn threads:
-
-    omp_set_num_threads( thread_count );
-
-    // Verify OMP interaction:
-    if ( int(thread_count) != omp_get_max_threads() ) {
-      thread_spawn_failed = true ;
-    }
-
-    // Verify spawning and bind threads:
-#pragma omp parallel
-    {
-#pragma omp critical
-      {
-        if ( int(thread_count) != omp_get_num_threads() ) {
-          thread_spawn_failed = true ;
-        }
-
-        // Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
-        // Call to 'new' may not be thread safe as well.
-
-        // Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
-
-        const unsigned omp_rank    = omp_get_thread_num();
-        const unsigned thread_r    = Impl::s_using_hwloc ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord ) : omp_rank ;
-
-        Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ;
-      }
-/* END #pragma omp critical */
-    }
-/* END #pragma omp parallel */
-
-    if ( ! thread_spawn_failed ) {
-      // [0] total threads, [1] threads per NUMA region, [2] threads per core.
-      Impl::OpenMPexec::m_pool_topo[0] = thread_count ;
-      Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
-      Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
-
-      // Allocating the scratch records is what marks the pool as initialized.
-      Impl::OpenMPexec::resize_scratch( 1024 , 1024 );
-    }
-  }
-
-  if ( is_initialized || thread_spawn_failed ) {
-    std::string msg("Kokkos::OpenMP::initialize ERROR");
-
-    if ( is_initialized ) { msg.append(" : already initialized"); }
-    if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); }
-
-    Kokkos::Impl::throw_runtime_exception(msg);
-  }
-}
-
-//----------------------------------------------------------------------------
-
-// Shuts down the OpenMP thread pool: releases every thread's scratch record,
-// clears the recorded topology, and removes the hwloc binding of the calling
-// thread when binding was used.
-//
-// Throws a runtime exception when the pool is not initialized or when called
-// from inside a parallel region.
-void OpenMP::finalize()
-{
-  Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
-  Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
-
-  Impl::OpenMPexec::clear_scratch();
-
-  Impl::OpenMPexec::m_pool_topo[0] = 0 ;
-  Impl::OpenMPexec::m_pool_topo[1] = 0 ;
-  Impl::OpenMPexec::m_pool_topo[2] = 0 ;
-
-  // omp_set_num_threads requires a positive argument; passing 0 is
-  // implementation defined.  Fall back to a single thread instead.
-  omp_set_num_threads(1);
-
-  if ( Impl::s_using_hwloc ) {
-    hwloc::unbind_this_thread();
-  }
-}
-
-//----------------------------------------------------------------------------
-
-// Writes a one-line description of the OpenMP backend configuration to 's'.
-// When the pool is initialized the thread pool topology is appended, and with
-// 'detail' set one additional line per thread lists its OpenMP rank, its
-// Kokkos rank and its hwloc coordinate.
-// Throws (through verify_is_process) when called inside a parallel region.
-void OpenMP::print_configuration( std::ostream & s , const bool detail )
-{
-  Impl::OpenMPexec::verify_is_process( "OpenMP::print_configuration" );
-
-  s << "Kokkos::OpenMP" ;
-
-#if defined( KOKKOS_HAVE_OPENMP )
-  s << " KOKKOS_HAVE_OPENMP" ;
-#endif
-#if defined( KOKKOS_HAVE_HWLOC )
-
-  // Hardware topology as reported by hwloc (independent of the pool in use).
-  const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
-  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
-  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
-
-  s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]"
-    << " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
-    ;
-#endif
-
-  const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
-
-  if ( is_initialized ) {
-    // Topology of the pool actually in use, derived from the cumulative
-    // counts stored in m_pool_topo.  These locals shadow the hwloc values above.
-    const int numa_count      = Kokkos::Impl::OpenMPexec::m_pool_topo[0] / Kokkos::Impl::OpenMPexec::m_pool_topo[1] ;
-    const int core_per_numa   = Kokkos::Impl::OpenMPexec::m_pool_topo[1] / Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
-    const int thread_per_core = Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
-
-    s << " thread_pool_topology[ " << numa_count
-      << " x " << core_per_numa
-      << " x " << thread_per_core
-      << " ]"
-      << std::endl ;
-
-    if ( detail ) {
-      // One slot per pool thread, filled in by the thread itself.
-      std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPexec::m_pool_topo[0] );
-
-#pragma omp parallel
-      {
-#pragma omp critical
-        {
-          coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
-        }
-/* END #pragma omp critical */
-      }
-/* END #pragma omp parallel */
-
-      for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
-        s << "  thread omp_rank[" << i << "]"
-          << " kokkos_rank[" << Impl::OpenMPexec::m_map_rank[ i ] << "]"
-          << " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
-          << std::endl ;
-      }
-    }
-  }
-  else {
-    s << " not initialized" << std::endl ;
-  }
-}
-
-} // namespace Kokkos
-
-#endif //KOKKOS_HAVE_OPENMP
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
@ -1,758 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_OPENMPEXEC_HPP
-#define KOKKOS_OPENMPEXEC_HPP
-
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_spinwait.hpp>
-
-#include <Kokkos_Atomic.hpp>
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-/** \brief  Data for OpenMP thread execution
- *
- *  One record exists per pool thread.  The record is the header of a larger
- *  allocation: the reduction scratch and the thread scratch follow it in
- *  memory and are addressed through byte offsets from 'this'.
- */
-
-class OpenMPexec {
-public:
-
-  enum { MAX_THREAD_COUNT = 4096 };
-
-private:
-
-  static int          m_pool_topo[ 4 ];
-  static int          m_map_rank[ MAX_THREAD_COUNT ];
-  static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
-
-  friend class Kokkos::OpenMP ;
-
-  int const  m_pool_rank ;
-  int const  m_pool_rank_rev ;
-  // Cumulative byte offsets from 'this' marking the end of the record header,
-  // of the reduce scratch and of the thread scratch respectively.
-  int const  m_scratch_exec_end ;
-  int const  m_scratch_reduce_end ;
-  int const  m_scratch_thread_end ;
-
-  // Synchronization flag accessed through state_wait() / state_set().
-  int volatile  m_barrier_state ;
-
-  // Declared private: default construction, copy and assignment are not available to users.
-  OpenMPexec();
-  OpenMPexec( const OpenMPexec & );
-  OpenMPexec & operator = ( const OpenMPexec & );
-
-  static void clear_scratch();
-
-public:
-
-  // Topology of a cache coherent thread pool:
-  //   TOTAL = NUMA x GRAIN
-  //   pool_size( depth = 0 )
-  //   pool_size(0) = total number of threads
-  //   pool_size(1) = number of threads per NUMA
-  //   pool_size(2) = number of threads sharing finest grain memory hierarchy
-
-  // Note: 'depth' is used as an index without a range check.
-  inline static
-  int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
-
-  // Record of the thread with the given reversed pool rank (no range check).
-  inline static
-  OpenMPexec * pool_rev( int pool_rank_rev ) { return m_pool[ pool_rank_rev ]; }
-
-  inline int pool_rank() const { return m_pool_rank ; }
-  inline int pool_rank_rev() const { return m_pool_rank_rev ; }
-
-  // Start of the reduce scratch (directly after the record header) and of
-  // the thread scratch (directly after the reduce scratch).
-  inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
-  inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
-
-  // Forwards to Impl::spinwait on the barrier flag.
-  inline
-  void state_wait( int state )
-    { Impl::spinwait( m_barrier_state , state ); }
-
-  inline
-  void state_set( int state ) { m_barrier_state = state ; }
-
-  ~OpenMPexec() {}
-
-  // The three size arguments are byte counts; they are accumulated into the
-  // cumulative end offsets.  The reversed rank is derived from pool_size(),
-  // so the pool topology must be set before a record is constructed.
-  OpenMPexec( const int poolRank 
-            , const int scratch_exec_size
-            , const int scratch_reduce_size
-            , const int scratch_thread_size )
-    : m_pool_rank( poolRank )
-    , m_pool_rank_rev( pool_size() - ( poolRank + 1 ) )
-    , m_scratch_exec_end( scratch_exec_size )
-    , m_scratch_reduce_end( m_scratch_exec_end   + scratch_reduce_size )
-    , m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size )
-    , m_barrier_state(0)
-    {}
-
-  static void finalize();
-
-  static void initialize( const unsigned  team_count ,
-                          const unsigned threads_per_team ,
-                          const unsigned numa_count ,
-                          const unsigned cores_per_numa );
-
-  // Both verify_* functions take a label that prefixes the error message.
-  static void verify_is_process( const char * const );
-  static void verify_initialized( const char * const );
-
-  static void resize_scratch( size_t reduce_size , size_t thread_size );
-
-  // Record of the calling OpenMP thread.
-  inline static
-  OpenMPexec * get_thread_omp() { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Handle given to a functor for one thread of a team under the
- *          OpenMP TeamPolicy.
- *
- *  Provides the league/team coordinates, the team shared scratch memory and
- *  the intra-team collectives (barrier, broadcast, reduce, scan).  The
- *  collectives communicate through the first TEAM_REDUCE_SIZE bytes of each
- *  member's thread scratch; the team shared memory starts right after that
- *  area in the scratch of the team's base member.
- */
-class OpenMPexecTeamMember {
-private:
-
-  enum { TEAM_REDUCE_SIZE = 512 };
-
-  /** \brief  Thread states for team synchronization */
-  enum { Active = 0 , Rendezvous = 1 };
-
-  typedef Kokkos::OpenMP                         execution_space ;
-  typedef execution_space::scratch_memory_space  scratch_memory_space ;
-
-  Impl::OpenMPexec    & m_exec ;
-  scratch_memory_space  m_team_shared ;
-  int                   m_team_shmem ;
-  int                   m_team_base_rev ;
-  int                   m_team_rank_rev ;
-  int                   m_team_rank ;
-  int                   m_team_size ;
-  int                   m_league_rank ;
-  int                   m_league_end ;
-  int                   m_league_size ;
-
-  // Fan-in team threads, root of the fan-in which does not block returns true
-  inline
-  bool team_fan_in() const
-    {
-      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
-        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
-      }
-
-      if ( m_team_rank_rev ) {
-        m_exec.state_set( Rendezvous );
-        m_exec.state_wait( Rendezvous );
-      }
-
-      return 0 == m_team_rank_rev ;
-    }
-
-  // Counterpart of team_fan_in(): sets the members this thread waited on back to Active.
-  inline
-  void team_fan_out() const
-    {
-      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
-        m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
-      }
-    }
-
-public:
-
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space & team_shmem() const
-    { return m_team_shared ; }
-
-  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
-  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
-  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
-  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
-
-  // Barrier across the team; a no-op for single-member teams and when not
-  // compiling for the host memory space.
-  KOKKOS_INLINE_FUNCTION void team_barrier() const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    {}
-#else
-    {
-      if ( 1 < m_team_size ) {
-        team_fan_in();
-        team_fan_out();
-      }
-    }
-#endif
-
-  /** \brief  Broadcast 'value' of the member with team_rank() == thread_id
-   *          to every member of the team.
-   *
-   *  Must be called by all members of the team.  On return every member's
-   *  'value' holds the broadcasting member's input value.
-   */
-  template<class ValueType>
-  KOKKOS_INLINE_FUNCTION
-  void team_broadcast(ValueType& value, const int& thread_id) const
-  {
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { }
-#else
-    // Make sure there is enough scratch space:
-    typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
-                         , ValueType , void >::type type ;
-
-    // Exchange through the scratch of the team's base member: every member
-    // must read the location the broadcasting member wrote to.  Using each
-    // member's own scratch would leave all other members reading their own
-    // unrelated data.
-    type * const shared_value = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
-    if(team_rank() == thread_id)
-      *shared_value = value;
-    memory_fence();
-    team_barrier(); // Wait for the broadcasting member to write
-    value = *shared_value;
-    team_barrier(); // Wait for all members to read before the scratch is reused
-#endif
-  }
-
-  /** \brief  Intra-team reduction of 'value' using the given join operation.
-   *
-   *  Must be called by all members of the team; the value read back from the
-   *  member's scratch after the team-wide join is returned.
-   */
-#ifdef KOKKOS_HAVE_CXX11
-  template< class ValueType, class JoinOp >
-  KOKKOS_INLINE_FUNCTION ValueType
-    team_reduce( const ValueType & value
-               , const JoinOp & op_in ) const
-  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return ValueType(); }
-  #else
-    {
-      typedef ValueType value_type;
-      const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
-  #endif
-#else // KOKKOS_HAVE_CXX11
-  template< class JoinOp >
-  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
-    team_reduce( const typename JoinOp::value_type & value
-               , const JoinOp & op ) const
-  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return typename JoinOp::value_type(); }
-  #else
-    {
-      typedef typename JoinOp::value_type value_type;
-  #endif
-#endif // KOKKOS_HAVE_CXX11
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      // Make sure there is enough scratch space:
-      typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
-                           , value_type , void >::type type ;
-
-      type * const local_value = ((type*) m_exec.scratch_thread());
-
-      // Set this thread's contribution
-      *local_value = value ;
-
-      // Fence to make sure the base team member has access:
-      memory_fence();
-
-      if ( team_fan_in() ) {
-        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
-        type * const team_value  = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
-
-        // Join to the team value:
-        for ( int i = 1 ; i < m_team_size ; ++i ) {
-          op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) );
-        }
-
-        // The base team member may "lap" the other team members,
-        // copy to their local value before proceeding.
-        for ( int i = 1 ; i < m_team_size ; ++i ) {
-          *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) = *team_value ;
-        }
-
-        // Fence to make sure all team members have access
-        memory_fence();
-      }
-
-      team_fan_out();
-
-      return *((type volatile const *)local_value);
-    }
-#endif
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
-   *          with intra-team non-deterministic ordering accumulation.
-   *
-   *  The global inter-team accumulation value will, at the end of the
-   *  league's parallel execution, be the scan's total.
-   *  Parallel execution ordering of the league's teams is non-deterministic.
-   *  As such the base value for each team's scan operation is similarly
-   *  non-deterministic.
-   */
-  template< typename ArgType >
-  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return ArgType(); }
-#else
-    {
-      // Make sure there is enough scratch space:
-      typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
-
-      volatile type * const work_value  = ((type*) m_exec.scratch_thread());
-
-      *work_value = value ;
-
-      memory_fence();
-
-      if ( team_fan_in() ) {
-        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
-        // m_team_base[0]                 == highest ranking team member
-        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
-        //
-        // 1) copy from lower to higher rank, initialize lowest rank to zero
-        // 2) prefix sum from lowest to highest rank, skipping lowest rank
-
-        type accum = 0 ;
-
-        if ( global_accum ) {
-          // Team total, then claim a range of the global accumulation;
-          // 'accum' becomes the team's starting offset.
-          for ( int i = m_team_size ; i-- ; ) {
-            type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
-            accum += val ;
-          }
-          accum = atomic_fetch_add( global_accum , accum );
-        }
-
-        // Exclusive scan: each member's scratch receives the sum of the
-        // contributions that precede it.
-        for ( int i = m_team_size ; i-- ; ) {
-          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
-          const type offset = accum ;
-          accum += val ;
-          val = offset ;
-        }
-
-        memory_fence();
-      }
-
-      team_fan_out();
-
-      return *work_value ;
-    }
-#endif
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
-   *
-   *  The highest rank thread can compute the reduction total as
-   *    reduction_total = dev.team_scan( value ) + value ;
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
-    { return this-> template team_scan<Type>( value , 0 ); }
-
-#ifdef KOKKOS_HAVE_CXX11
-
-  /** \brief  Inter-thread parallel for. Executes op(iType i) for each i=0..N-1.
-   *
-   * The range i=0..N-1 is mapped to all threads of the calling thread team.
-   * This functionality requires C++11 support.*/
-  template< typename iType, class Operation>
-  KOKKOS_INLINE_FUNCTION void team_par_for(const iType n, const Operation & op) const {
-    // Contiguous chunks of ceil( n / team_size ) iterations per member.
-    const int chunk = ((n+m_team_size-1)/m_team_size);
-    const int start = chunk*m_team_rank;
-    const int end = start+chunk<n?start+chunk:n;
-    for(int i=start; i<end ; i++) {
-      op(i);
-    }
-  }
-#endif
-
-  //----------------------------------------
-  // Private for the driver
-
-private:
-
-  typedef execution_space::scratch_memory_space space ;
-
-public:
-
-  // Places the calling pool thread into the team/league layout described by
-  // 'team'.  Threads that fall outside the active team size, or whose league
-  // range is empty, keep m_league_rank == m_league_end == 0 and are therefore
-  // never valid().
-  template< class Arg0 , class Arg1 >
-  inline
-  OpenMPexecTeamMember( Impl::OpenMPexec & exec
-                      , const TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > & team
-                      , const int shmem_size
-                      )
-    : m_exec( exec )
-    , m_team_shared(0,0)
-    , m_team_shmem( shmem_size )
-    , m_team_base_rev(0)
-    , m_team_rank_rev(0)
-    , m_team_rank(0)
-    , m_team_size( team.team_size() )
-    , m_league_rank(0)
-    , m_league_end(0)
-    , m_league_size( team.league_size() )
-    {
-      const int pool_rank_rev        = m_exec.pool_rank_rev();
-      const int pool_team_rank_rev   = pool_rank_rev % team.team_alloc();
-      const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
-      const int league_iter_end      = team.league_size() - pool_league_rank_rev * team.team_iter();
-
-      if ( pool_team_rank_rev < m_team_size && 0 < league_iter_end ) {
-        m_team_base_rev  = team.team_alloc() * pool_league_rank_rev ;
-        m_team_rank_rev  = pool_team_rank_rev ;
-        m_team_rank      = m_team_size - ( m_team_rank_rev + 1 );
-        m_league_end     = league_iter_end ;
-        m_league_rank    = league_iter_end > team.team_iter() ? league_iter_end - team.team_iter() : 0 ;
-        // Team shared memory lives in the base member's thread scratch,
-        // after the area reserved for the team collectives.
-        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
-      }
-    }
-
-  // True while this member still has a league rank to execute.
-  bool valid() const
-    { return m_league_rank < m_league_end ; }
-
-  // Advances to the next league rank; when one remains the team synchronizes
-  // and the team shared memory handle is re-created.
-  void next()
-    {
-      if ( ++m_league_rank < m_league_end ) {
-        team_barrier();
-        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
-      }
-    }
-
-  static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
-};
-
-
-
-} // namespace Impl
-
-/** \brief  Team execution policy for the OpenMP execution space.
- *
- *  Stores the league size and the (possibly reduced) team size together with
- *  the derived layout of teams over the thread pool.
- */
-template< class Arg0 , class Arg1 >
-class TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP >
-{
-public:
-
-  //! Tag this class as a kokkos execution policy
-  typedef TeamPolicy      execution_policy ;
-
-  //! Execution space of this execution policy.
-  typedef Kokkos::OpenMP  execution_space ;
-
-  //! Work tag: whichever of Arg0 / Arg1 is not the execution space.
-  typedef typename
-    Impl::if_c< ! Impl::is_same< Kokkos::OpenMP , Arg0 >::value , Arg0 , Arg1 >::type
-      work_tag ;
-
-  //----------------------------------------
-
-  //! Largest usable team size: the number of threads per NUMA region.
-  template< class FunctorType >
-  inline static
-  int team_size_max( const FunctorType & )
-    { return execution_space::thread_pool_size(1); }
-
-  //! Recommended team size: the number of threads sharing the finest grain.
-  template< class FunctorType >
-  inline static
-  int team_size_recommended( const FunctorType & )
-    { return execution_space::thread_pool_size(2); }
-
-  //----------------------------------------
-
-private:
-
-  int m_league_size ;
-  int m_team_size ;
-  int m_team_alloc ;
-  int m_team_iter ;
-
-  // Derives the team layout from the requests and the current pool topology.
-  // NOTE(review): the divisions below assume an initialized pool and a
-  // positive team size request; neither is checked here.
-  inline void init( const int league_size_request
-                  , const int team_size_request )
-    {
-      const int pool_size  = execution_space::thread_pool_size(0);
-      const int team_max   = execution_space::thread_pool_size(1);
-      const int team_grain = execution_space::thread_pool_size(2);
-
-      m_league_size = league_size_request ;
-
-      // Clamp the requested team size to the maximum.
-      m_team_size = team_size_request < team_max ?
-                    team_size_request : team_max ;
-
-      // Round team size up to a multiple of 'team_grain'
-      const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
-      const int team_count      = pool_size / team_size_grain ;
-
-      // Constraint : pool_size = m_team_alloc * team_count
-      m_team_alloc = pool_size / team_count ;
-
-      // Maximum number of iterations each team will take:
-      m_team_iter  = ( m_league_size + team_count - 1 ) / team_count ;
-    }
-
-public:
-
-  inline int team_size()   const { return m_team_size ; }
-  inline int league_size() const { return m_league_size ; }
-
-  /** \brief  Specify league size, request team size */
-  TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1)
-    { init( league_size_request , team_size_request ); (void) vector_length_request; }
-
-  /** \brief  Specify league size, request team size; the vector length request is ignored */
-  TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
-    { init( league_size_request , team_size_request ); (void) vector_length_request; }
-
-  //! Number of pool threads allotted to each team.
-  inline int team_alloc() const { return m_team_alloc ; }
-  //! Maximum number of league iterations taken by each team.
-  inline int team_iter()  const { return m_team_iter ; }
-
-  typedef Impl::OpenMPexecTeamMember member_type ;
-};
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-// Thread pool size at the given topology depth (see OpenMPexec::pool_size).
-inline
-int OpenMP::thread_pool_size( int depth )
-{
-  const int size_at_depth = Impl::OpenMPexec::pool_size( depth );
-  return size_at_depth ;
-}
-
-// Kokkos rank of the calling OpenMP thread; -1 when not compiling for the
-// host memory space.
-KOKKOS_INLINE_FUNCTION
-int OpenMP::thread_pool_rank()
-{
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-  return -1 ;
-#else
-  const int omp_rank = omp_get_thread_num();
-  return Impl::OpenMPexec::m_map_rank[ omp_rank ];
-#endif
-}
-
-} // namespace Kokkos
-
-
-#ifdef KOKKOS_HAVE_CXX11
-
-namespace Kokkos {
-
-// Builds the loop boundaries object for a team-thread loop over 'count' iterations.
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadLoopBoundariesStruct<iType,Impl::OpenMPexecTeamMember>
-  TeamThreadLoop(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
-  typedef Impl::TeamThreadLoopBoundariesStruct<iType,Impl::OpenMPexecTeamMember> boundaries_type ;
-  return boundaries_type( thread , count );
-}
-
-// Builds the loop boundaries object for a thread-vector loop over 'count' iterations.
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::OpenMPexecTeamMember >
-  ThreadVectorLoop(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
-  typedef Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::OpenMPexecTeamMember > boundaries_type ;
-  return boundaries_type( thread , count );
-}
-
-// Wraps the team member in the tag type used for 'single per team' dispatch.
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember> PerTeam(const Impl::OpenMPexecTeamMember& thread) {
-  typedef Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember> single_type ;
-  return single_type( thread );
-}
-
-// Wraps the team member in the tag type used for 'single per thread' dispatch.
-KOKKOS_INLINE_FUNCTION
-Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember> PerThread(const Impl::OpenMPexecTeamMember& thread) {
-  typedef Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember> single_type ;
-  return single_type( thread );
-}
-} // namespace Kokkos
-
-namespace Kokkos {
-
-  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
-   *
-   * The range i=0..N-1 is mapped to all threads of the calling thread team;
-   * this call runs the iterations described by loop_boundaries (start, end, increment).
-   * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::TeamThreadLoopBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, const Lambda& lambda) {
-  iType i = loop_boundaries.start;
-  while ( i < loop_boundaries.end ) {
-    lambda(i);
-    i += loop_boundaries.increment;
-  }
-}
-
-/** \brief  Inter-thread parallel_reduce (summation). Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the calling thread team. Each thread sums the
- * contributions of its own indices into result, then the per-thread sums are combined with
- * team_reduce using Impl::JoinAdd and the combined value is stored in result.
- * This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadLoopBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
-                     const Lambda & lambda, ValueType& result) {
-
-  result = ValueType();
-
-  iType index = loop_boundaries.start ;
-  while ( index < loop_boundaries.end ) {
-    // Every index contributes through a freshly value-initialized temporary.
-    ValueType contribution = ValueType();
-    lambda( index , contribution );
-    result += contribution ;
-    index += loop_boundaries.increment ;
-  }
-
-  result = loop_boundaries.thread.team_reduce( result , Impl::JoinAdd<ValueType>() );
-}
-
-/** \brief  Inter-thread parallel_reduce with a user supplied join. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
- * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
- * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
- * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
- * '1 for *'). The per-thread results are combined with team_reduce using the same join.
- * This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadLoopBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
-                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
-  // Read the caller supplied neutral element exactly once, before the loop.
-  const ValueType neutral = init_result;
-
-  ValueType result = neutral;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    // Seed the temporary with the neutral element as documented above.
-    // A value-initialized temporary (e.g. 0) is only neutral for '+' and
-    // silently breaks joins such as '*', min or max.
-    ValueType tmp = neutral;
-    lambda(i,tmp);
-    join(result,tmp);
-  }
-
-  init_result = loop_boundaries.thread.team_reduce(result,join);
-}
-
-} //namespace Kokkos
-
-
-namespace Kokkos {
-/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
- * In this implementation the range is walked by a plain loop from loop_boundaries.start
- * up to (excluding) loop_boundaries.end in steps of loop_boundaries.increment; when
- * KOKKOS_HAVE_PRAGMA_IVDEP is defined the loop is annotated with '#pragma ivdep'.
- * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
-    loop_boundaries, const Lambda& lambda) {
-  #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
-  #pragma ivdep
-  #endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i);
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
- * val is performed and put into result. Any value held by result on entry is discarded:
- * result is reset to ValueType() before the loop, and every index contributes through a
- * value-initialized temporary that is added with operator+=.
- * This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
-      loop_boundaries, const Lambda & lambda, ValueType& result) {
-  result = ValueType();
-#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    result+=tmp;
-  }
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
- * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
- * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
- * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
- * '1 for *'). This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
-      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
-  // Read the caller supplied neutral element exactly once, before the loop.
-  const ValueType neutral = init_result;
-
-  ValueType result = neutral;
-#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    // Seed the temporary with the neutral element as documented above.
-    // A value-initialized temporary (e.g. 0) is only neutral for '+' and
-    // silently breaks joins such as '*', min or max.
-    ValueType tmp = neutral;
-    lambda(i,tmp);
-    join(result,tmp);
-  }
-  init_result = result;
-}
-
-/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
- *          for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
- * Depending on the target execution space the operator might be called twice: once with final=false
- * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
- * "i" needs to be added to val no matter whether final==true or not.
- * This implementation calls the operator exactly once per index, with final==true, threading a single
- * running value through all calls. That running value starts as value_type() and is local to this
- * function: the total over all indices is not handed back to the caller.
- * The value type is taken from Kokkos::Impl::FunctorValueTraits< FunctorType , void >.
- * This functionality requires C++11 support.*/
-template< typename iType, class FunctorType >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan(const Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
-      loop_boundaries, const FunctorType & lambda) {
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
-  typedef typename ValueTraits::value_type value_type ;
-
-  value_type scan_val = value_type();
-
-#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    lambda(i,scan_val,true);
-  }
-}
-
-} // namespace Kokkos
-
-namespace Kokkos {
-
-/** \brief  Once-per-thread execution: the calling thread always runs lambda(). */
-template<class FunctorType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
-  (void) single_struct ; // only used for overload selection
-  lambda();
-}
-
-/** \brief  Once-per-team execution: only the thread with team_rank() == 0 runs lambda(). */
-template<class FunctorType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
-  const bool is_team_root = ( 0 == single_struct.team_member.team_rank() );
-  if ( is_team_root ) {
-    lambda();
-  }
-}
-
-/** \brief  Once-per-thread execution with a value: the calling thread always runs lambda(val). */
-template<class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
-  (void) single_struct ; // only used for overload selection
-  lambda( val );
-}
-
-/** \brief  Once-per-team execution with a value.
- *
- *  Only the thread with team_rank() == 0 runs lambda(val); afterwards every calling
- *  thread takes part in team_broadcast(val,0).
- */
-template<class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
-  const bool is_team_root = ( 0 == single_struct.team_member.team_rank() );
-  if ( is_team_root ) {
-    lambda( val );
-  }
-  // Executed by every thread, not just the root.
-  single_struct.team_member.team_broadcast( val , 0 );
-}
-}
-
-#endif // KOKKOS_HAVE_CXX11
-
-#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
-
--- a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
@ -1,380 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core_fwd.hpp>
-
-#if defined( KOKKOS_HAVE_QTHREAD )
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <iostream>
-#include <sstream>
-#include <utility>
-#include <Kokkos_Qthread.hpp>
-#include <Kokkos_Atomic.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-#define QTHREAD_LOCAL_PRIORITY
-
-#include <qthread/qthread.h>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-enum { MAXIMUM_QTHREAD_WORKERS = 1024 }; // capacity of the s_exec table below
-
-/** s_exec is indexed by the reverse rank of the workers
- *  for faster fan-in / fan-out lookups
- *  [ n - 1 , n - 2 , ... , 0 ]
- *  so s_exec[0] belongs to the highest ranking worker.
- */
-QthreadExec * s_exec[ MAXIMUM_QTHREAD_WORKERS ];
-
-int  s_number_shepherds            = 0 ; // set by Qthread::initialize , zeroed by Qthread::finalize
-int  s_number_workers_per_shepherd = 0 ; // set by Qthread::initialize , zeroed by Qthread::finalize
-int  s_number_workers              = 0 ; // s_number_shepherds * s_number_workers_per_shepherd
-
-/** Address of the s_exec slot that belongs to the calling worker.
- *
- *  The worker rank is qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL);
- *  the slot index is the reverse of that rank: s_number_workers - ( rank + 1 ).
- */
-inline
-QthreadExec ** worker_exec()
-{
-  return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL) + 1 );
-}
-
-// Aligned size of the QthreadExec object itself; each worker's scratch memory starts
-// at this offset from the beginning of its allocation.
-const int s_base_size = QthreadExec::align_alloc( sizeof(QthreadExec) );
-
-// Offsets into each worker's scratch memory, maintained by QthreadExec::resize_worker_scratch.
-int s_worker_reduce_end   = 0 ; /* End of worker reduction memory    */
-int s_worker_shared_end   = 0 ; /* Total of worker scratch memory    */
-int s_worker_shared_begin = 0 ; /* Beginning of worker shared memory */
-
-// Function and argument being dispatched by QthreadExec::exec_all; both are zero
-// while no exec_all is in progress.
-QthreadExecFunctionPointer s_active_function = 0 ;
-const void               * s_active_function_arg = 0 ;
-
-} /* namespace */
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-/** Initialize the Qthread library for exactly 'thread_count' workers.
- *
- *  The requested parallelism is handed to the Qthread library through the
- *  QTHREAD_HWPAR environment variable. After qthread_initialize() the resulting
- *  shepherd / worker layout is verified: the total worker count has to match
- *  'thread_count' and every shepherd has to own the same number of workers.
- *  On success the worker scratch memory is allocated with an initial size.
- *
- *  Throws via Kokkos::Impl::throw_runtime_exception when 'thread_count' exceeds the
- *  capacity of the worker table or when the Qthread layout does not match the request.
- */
-void Qthread::initialize( int thread_count )
-{
-  // Environment variable: QTHREAD_NUM_SHEPHERDS
-  // Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
-  // Environment variable: QTHREAD_HWPAR
-
-  // The worker table is a fixed size array, refuse requests that cannot fit
-  // before touching the Qthread library.
-  if ( int( Impl::MAXIMUM_QTHREAD_WORKERS ) < thread_count ) {
-    std::ostringstream msg ;
-    msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
-    msg << " : exceeds maximum number of workers = " << int( Impl::MAXIMUM_QTHREAD_WORKERS );
-    Kokkos::Impl::throw_runtime_exception( msg.str() );
-  }
-
-  {
-    // setenv copies its arguments, so the local buffer may go out of scope.
-    // putenv would keep a pointer to this stack buffer inside the environment.
-    char buffer[64];
-    snprintf(buffer,sizeof(buffer),"%d",thread_count);
-    setenv("QTHREAD_HWPAR",buffer,1);
-  }
-
-  const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
-                       ( thread_count    == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) &&
-                       ( thread_count    == qthread_num_workers() );
-
-  bool ok_symmetry = true ;
-
-  if ( ok_init ) {
-    Impl::s_number_shepherds            = qthread_num_shepherds();
-    Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
-    Impl::s_number_workers              = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ;
-
-    // Every shepherd must own the same number of workers.
-    for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) {
-      ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) );
-    }
-  }
-
-  if ( ! ok_init || ! ok_symmetry ) {
-    std::ostringstream msg ;
-
-    msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
-    msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
-    msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD);
-    msg << " : qthread_num_workers = " << qthread_num_workers();
-
-    if ( ! ok_symmetry ) {
-      msg << " : qthread_num_workers_local = {" ;
-      for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) {
-        msg << " " << qthread_num_workers_local(i) ;
-      }
-      msg << " }" ;
-    }
-
-    // Leave the bookkeeping in the 'not initialized' state before throwing.
-    Impl::s_number_workers   = 0 ;
-    Impl::s_number_shepherds = 0 ;
-    Impl::s_number_workers_per_shepherd = 0 ;
-
-    if ( ok_init ) { qthread_finalize(); }
-
-    Kokkos::Impl::throw_runtime_exception( msg.str() );
-  }
-
-  Impl::QthreadExec::resize_worker_scratch( 256 , 256 );
-}
-
-/** Release the worker memory, shut down the Qthread library if it was initialized
- *  and return all bookkeeping to the 'not initialized' state.
- */
-void Qthread::finalize()
-{
-  Impl::QthreadExec::clear_workers();
-
-  if ( Impl::s_number_workers ) {
-    qthread_finalize();
-  }
-
-  Impl::s_number_workers    = 0 ;
-  Impl::s_number_shepherds  = 0 ;
-  Impl::s_number_workers_per_shepherd = 0 ;
-
-  // The worker memory is gone, so the recorded scratch extents are stale.
-  // Without this reset a later initialize() would find the requested sizes
-  // already satisfied in resize_worker_scratch and never reallocate the workers.
-  Impl::s_worker_reduce_end   = 0 ;
-  Impl::s_worker_shared_begin = 0 ;
-  Impl::s_worker_shared_end   = 0 ;
-}
-
-/** Write the shepherd count and the workers-per-shepherd count to 's'.
- *  The 'detail' flag is currently not consulted.
- */
-void Qthread::print_configuration( std::ostream & s , const bool detail )
-{
-  s << "Kokkos::Qthread {" ;
-  s << " num_shepherds(" << Impl::s_number_shepherds << ")" ;
-  s << " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")" ;
-  s << " }" << std::endl ;
-}
-
-/** Access the single Qthread object; the integer argument is ignored. */
-Qthread & Qthread::instance( int )
-{
-  static Qthread q ; // function-local static, the same object is returned by every call
-  return q ;
-}
-
-/** No operation in this implementation. */
-void Qthread::fence()
-{
-}
-
-int Qthread::shepherd_size() const { return Impl::s_number_shepherds ; } // number of shepherds recorded by initialize()
-int Qthread::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd ; } // workers per shepherd recorded by initialize()
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-/** Task body used by QthreadExec::exec_all.
- *
- *  Calls the currently active function with the calling worker's QthreadExec
- *  and the currently active argument. The 'arg' parameter is not used.
- *  Always returns 0.
- */
-aligned_t driver_exec_all( void * arg )
-{
-  (*s_active_function)( ** worker_exec() , s_active_function_arg );
-
-  return 0 ;
-}
-
-/** Task body used by QthreadExec::resize_worker_scratch.
- *
- *  Allocates and constructs the calling worker's QthreadExec (followed by its
- *  scratch memory) if the worker's slot is still empty, then waits until
- *  s_number_workers calls of this function have reached the end so that
- *  every worker has executed it. The 'arg' parameter is not used.
- *  A failed allocation leaves the slot empty; the caller detects empty slots.
- *  Always returns 0.
- */
-aligned_t driver_resize_worker_scratch( void * arg )
-{
-  static volatile int lock_begin = 0 ;
-  static volatile int lock_end   = 0 ;
-
-  QthreadExec ** const exec = worker_exec();
-
-  //----------------------------------------
-  // Serialize allocation for thread safety
-
-  while ( ! atomic_compare_exchange_strong( & lock_begin , 0 , 1 ) ); // Spin wait to claim lock
-
-  const bool need_alloc = 0 == *exec ;
-
-  if ( need_alloc ) { *exec = (QthreadExec *) malloc( s_base_size + s_worker_shared_end ); }
-
-  lock_begin = 0 ; // release lock
-
-  // Construct outside of the lock, and only when malloc actually returned memory:
-  // placement new on a null pointer would write through address zero.
-  if ( need_alloc && 0 != *exec ) { new( *exec ) QthreadExec(); }
-
-  //----------------------------------------
-  // Wait for all calls to complete to ensure that each worker has executed.
-  // The last arrival resets the counter, which releases everybody.
-
-  if ( s_number_workers == 1 + atomic_fetch_add( & lock_end , 1 ) ) { lock_end = 0 ; }
-
-  while ( lock_end );
-
-  //----------------------------------------
-
-  return 0 ;
-}
-
-/** Throw unless called from worker 0 of shepherd 0.
- *
- *  When 'not_active' is true the call additionally fails if an active function
- *  or active function argument is currently set. The exception message starts
- *  with 'label'.
- */
-void verify_is_process( const char * const label , bool not_active = false )
-{
-  const bool on_main_worker = ( 0 == qthread_shep() ) && ( 0 == qthread_worker_local(NULL) );
-  const bool not_process    = ! on_main_worker ;
-  const bool is_active      = not_active && ( s_active_function || s_active_function_arg );
-
-  if ( ! not_process && ! is_active ) { return ; }
-
-  std::string msg( label );
-  msg += " : FAILED" ;
-  if ( not_process ) { msg += " : not called by main process" ; }
-  if ( is_active )   { msg += " : parallel execution in progress" ; }
-
-  Kokkos::Impl::throw_runtime_exception( msg );
-}
-
-}
-
-/** Record the identity of the calling worker and where its scratch memory lives.
- *
- *  Must be constructed by the worker it describes: shepherd and local worker rank
- *  are queried from the Qthread library at construction. The scratch memory is
- *  taken to start s_base_size bytes past the object itself.
- */
-QthreadExec::QthreadExec()
-{
-  const int my_shepherd = qthread_shep();
-  const int my_local    = qthread_worker_local(NULL);
-
-  // Rank / size bookkeeping
-  m_shepherd_rank        = my_shepherd ;
-  m_shepherd_size        = s_number_shepherds ;
-  m_shepherd_worker_rank = my_local ;
-  m_shepherd_worker_size = s_number_workers_per_shepherd ;
-  m_worker_rank          = my_shepherd * s_number_workers_per_shepherd + my_local ;
-  m_worker_size          = s_number_workers ;
-
-  // Views into the reverse-rank ordered worker table:
-  // all workers, and the workers of this worker's shepherd.
-  m_worker_base   = s_exec ;
-  m_shepherd_base = s_exec + s_number_workers_per_shepherd * ( s_number_shepherds - ( my_shepherd + 1 ) );
-
-  // Scratch memory directly follows the (aligned) object.
-  m_scratch_alloc = ( (unsigned char *) this ) + s_base_size ;
-  m_reduce_end    = s_worker_reduce_end ;
-
-  m_worker_state  = QthreadExec::Active ;
-}
-
-/** Free the memory of the first s_number_workers worker slots and null the slots. */
-void QthreadExec::clear_workers()
-{
-  QthreadExec ** const slots_end = s_exec + s_number_workers ;
-
-  for ( QthreadExec ** slot = s_exec ; slot < slots_end ; ++slot ) {
-    free( *slot );
-    *slot = 0 ;
-  }
-}
-
-/** Re-construct 'space' in place so that it covers the shared portion
- *  [ s_worker_shared_begin , s_worker_shared_end ) of the scratch memory owned by
- *  the first worker in this worker's shepherd view.
- */
-void QthreadExec::shared_reset( Qthread::scratch_memory_space & space )
-{
-  unsigned char * const shepherd_scratch =
-    (unsigned char *) (**m_shepherd_base).m_scratch_alloc ;
-
-  new( & space )
-    Qthread::scratch_memory_space( shepherd_scratch + s_worker_shared_begin
-                                 , s_worker_shared_end - s_worker_shared_begin );
-}
-
-/** Grow the per-worker scratch memory so that it can hold at least
- *  'reduce_size' bytes of reduction memory and 'shared_size' bytes of shared memory.
- *
- *  Nothing happens when the current allocation is already large enough.
- *  Otherwise all worker memory is released and every worker allocates its own
- *  new memory (for proper first-touch). Throws via
- *  Kokkos::Impl::throw_runtime_exception if any worker slot is left unallocated.
- */
-void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
-{
-  const int exec_all_reduce_alloc = align_alloc( reduce_size );
-  const int shepherd_scan_alloc   = align_alloc( 8 );
-  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
-
-  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
-       s_worker_shared_end < shepherd_shared_end ) {
-
-    // Clear current worker memory before allocating new worker memory
-    clear_workers();
-
-    // Increase the buffers to an aligned allocation
-    s_worker_reduce_end   = exec_all_reduce_alloc ;
-    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
-    s_worker_shared_end   = shepherd_shared_end ;
-
-    // Query which shepherd this main 'process' is running on, as exec_all does,
-    // instead of assuming shepherd 0: the calling worker handles its own slot
-    // directly below, so its shepherd needs one forked task less.
-    const int main_shep = qthread_shep();
-
-    // Have each worker resize its memory for proper first-touch
-    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
-    for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
-
-      // Unit tests hang with this call:
-      //
-      // qthread_fork_to_local_priority( driver_resize_worker_scratch , NULL , NULL , jshep );
-      //
-
-      qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
-    }}
-
-    driver_resize_worker_scratch( NULL );
-
-    // Verify all workers allocated
-
-    bool ok = true ;
-    for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }
-
-    if ( ! ok ) {
-      std::ostringstream msg ;
-      msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
-      for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
-         // s_exec is in reverse rank order, report the worker rank.
-         if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
-      }
-      msg << " }" ;
-      Kokkos::Impl::throw_runtime_exception( msg.str() );
-    }
-  }
-}
-
-/** Execute 'func( worker , arg )' once on every Qthread worker.
- *
- *  Must be called by the main process while no other exec_all is in progress
- *  (checked by verify_is_process, which throws otherwise). One task is forked for
- *  every worker except the caller, and the caller runs the function itself.
- *  This function does not wait for the forked tasks after its own call returns.
- */
-void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
-{
-  verify_is_process("QthreadExec::exec_all(...)",true);
-
-  s_active_function     = func ;
-  s_active_function_arg = arg ;
-
-  // Need to query which shepherd this main 'process' is running...
-
-  const int main_shep = qthread_shep();
-
-  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
-  // The main shepherd gets one task less: the caller takes that share below.
-  for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
-
-    // Unit tests hang with this call:
-    //
-    // qthread_fork_to_local_priority( driver_exec_all , NULL , NULL , jshep );
-    //
-
-    qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
-  }}
-
-  driver_exec_all( NULL );
-
-  s_active_function     = 0 ;
-  s_active_function_arg = 0 ;
-}
-
-/** Scratch memory of the worker stored in the first slot of the worker table. */
-void * QthreadExec::exec_all_reduce_result()
-{
-  QthreadExec * const first_slot = s_exec[0] ;
-  return first_slot->m_scratch_alloc ;
-}
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */
-
--- a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
@ -1,580 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_QTHREADEXEC_HPP
-#define KOKKOS_QTHREADEXEC_HPP
-
-#include <impl/Kokkos_spinwait.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-
-class QthreadExec ; // forward declaration, needed by the function pointer type below
-
-// Signature of the functions dispatched by QthreadExec::exec_all:
-// called with a worker's QthreadExec and the opaque argument given to exec_all.
-typedef void (*QthreadExecFunctionPointer)( QthreadExec & , const void * );
-
-class QthreadExec {
-private:
-
-  enum { Inactive = 0 , Active = 1 };
-
-  const QthreadExec * const * m_worker_base ;
-  const QthreadExec * const * m_shepherd_base ;
-
-  void  * m_scratch_alloc ;  ///< Scratch memory [ reduce , team , shared ]
-  int     m_reduce_end ;     ///< End of scratch reduction memory
-
-  int     m_shepherd_rank ;
-  int     m_shepherd_size ;
-
-  int     m_shepherd_worker_rank ;
-  int     m_shepherd_worker_size ;
-
-  /*
-   *  m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
-   *  m_worker_size = m_shepherd_size * m_shepherd_worker_size
-   */
-  int     m_worker_rank ;
-  int     m_worker_size ;
-
-  int mutable volatile m_worker_state ;
-
-
-  friend class Kokkos::Qthread ;
-
-  ~QthreadExec();
-  QthreadExec( const QthreadExec & );
-  QthreadExec & operator = ( const QthreadExec & );
-
-public:
-
-  QthreadExec();
-
-  /** Execute the input function on all available Qthread workers */
-  static void exec_all( Qthread & , QthreadExecFunctionPointer , const void * );
-
-  //----------------------------------------
-  /** Barrier across all workers participating in the 'exec_all'.
-   *
-   *  Tree barrier over the reverse rank ( m_worker_size - ( m_worker_rank + 1 ) ):
-   *  a worker first waits on the workers at rev_rank + 1 , + 2 , + 4 , ... (stopping at
-   *  the first bit set in its own rev_rank), then marks itself Inactive and waits to be
-   *  re-activated, and finally re-activates the workers it waited on.
-   *  The worker with rev_rank == 0 never marks itself Inactive.
-   *  NOTE(review): presumably Impl::spinwait( flag , value ) spins while flag == value - confirm.
-   */
-  void exec_all_barrier() const
-    {
-      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
-
-      int n , j ;
-
-      // Fan-in: wait for the workers below this one in the tree
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      // Signal arrival and wait for release (all workers except rev_rank == 0)
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-    
-      // Fan-out: release the workers that were waited on above
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-    }
-
-  /** Barrier across workers within the shepherd with rank < team_size.
-   *
-   *  Workers whose m_shepherd_worker_rank is not below team_size return immediately.
-   *  The participating workers run the same fan-in / wait / fan-out scheme as
-   *  exec_all_barrier, using the reverse rank within the team and m_shepherd_base.
-   */
-  void shepherd_barrier( const int team_size ) const
-    {
-      if ( m_shepherd_worker_rank < team_size ) {
-
-        const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
-
-        int n , j ;
-
-        // Fan-in: wait for the team members below this one in the tree
-        for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-          Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
-        }
-
-        // Signal arrival and wait for release (all members except rev_rank == 0)
-        if ( rev_rank ) {
-          m_worker_state = QthreadExec::Inactive ;
-          Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-        }
-    
-        // Fan-out: release the team members that were waited on above
-        for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-          m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
-        }
-      }
-    }
-
-  //----------------------------------------
-  /** Reduce across all workers participating in the 'exec_all'.
-   *
-   *  Same tree scheme as exec_all_barrier. During the fan-in each worker joins the
-   *  scratch value ( m_scratch_alloc ) of every worker it waits on into its own
-   *  scratch value, using FunctorValueJoin< FunctorType , ArgTag >::join.
-   *  The ValueOps typedef is not used by this function.
-   */
-  template< class FunctorType , class ArgTag >
-  inline
-  void exec_all_reduce( const FunctorType & func ) const
-    {
-      typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
-      typedef Kokkos::Impl::FunctorValueOps<  FunctorType , ArgTag > ValueOps ;
-
-      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
-
-      int n , j ;
-
-      // Fan-in: wait for each worker below this one, then fold its value into ours
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        const QthreadExec & fan = *m_worker_base[j];
-
-        Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
-
-        ValueJoin::join( func , m_scratch_alloc , fan.m_scratch_alloc );
-      }
-
-      // Signal arrival and wait for release (all workers except rev_rank == 0)
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-    
-      // Fan-out: release the workers that were waited on above
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-    }
-
-  //----------------------------------------
-  /** Scan across all workers participating in the 'exec_all'.
-   *
-   *  Same tree scheme as exec_all_barrier. Once every worker has arrived, the worker
-   *  with rev_rank == 0 rewrites all scratch values ( m_scratch_alloc ) before the
-   *  workers are released: each worker ends up with the join of the values of all
-   *  lower ranking workers, and the lowest ranking worker with the value produced by
-   *  FunctorValueInit< FunctorType , ArgTag >::init.
-   */
-  template< class FunctorType , class ArgTag >
-  inline
-  void exec_all_scan( const FunctorType & func ) const
-    {
-      typedef Kokkos::Impl::FunctorValueInit<   FunctorType , ArgTag > ValueInit ;
-      typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , ArgTag > ValueJoin ;
-      typedef Kokkos::Impl::FunctorValueOps<    FunctorType , ArgTag > ValueOps ;
-
-      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
-
-      int n , j ;
-
-      // Fan-in: wait for the workers below this one in the tree
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-      else {
-        // Root thread scans across values before releasing threads
-        // Worker data is in reverse order, so m_worker_base[0] is the 
-        // highest ranking thread.
-
-        // The fan-in loop leaves 'n' at the smallest power of two that is
-        // >= m_worker_size, so the loops below are bounded by the worker count
-        // itself to stay inside m_worker_base for non power of two counts.
-        const int count = m_worker_size ;
-
-        // Copy from lower ranking to higher ranking worker.
-        for ( int i = 1 ; i < count ; ++i ) {
-          ValueOps::copy( func , m_worker_base[i-1]->m_scratch_alloc
-                           , m_worker_base[i]->m_scratch_alloc );
-        }
-
-        ValueInit::init( func , m_worker_base[count-1]->m_scratch_alloc );
-
-        // Join from lower ranking to higher ranking worker.
-        // Value at m_worker_base[count-1] is zero so skip adding it to m_worker_base[count-2].
-        // Written with an explicit lower bound so that a single worker performs no join.
-        for ( int i = count - 2 ; 0 < i ; --i ) {
-          ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
-        }
-      }
-    
-      // Fan-out: release the workers that were waited on above
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-    }
-
-  //----------------------------------------
-
-  /** This worker's scratch memory at byte offset m_reduce_end, viewed as a volatile 'Type'. */
-  template< class Type>
-  inline
-  volatile Type * shepherd_team_scratch_value() const
-    { return (volatile Type*)(((unsigned char *) m_scratch_alloc) + m_reduce_end); }
-
-  /** Sum 'value' over the first 'team_size' entries of this worker's shepherd view.
-   *
-   *  Every caller publishes 'value' in its team scratch slot and enters the same tree
-   *  barrier as shepherd_barrier. The worker with rev_rank == 0 adds up the slots,
-   *  writes the total back to every slot and then releases the others.
-   *  Returns the content of the caller's own slot after the barrier.
-   *  NOTE(review): indexing m_shepherd_base by the team-relative reverse rank looks
-   *  like it assumes team_size == m_shepherd_worker_size - confirm.
-   */
-  template< class Type >
-  inline
-  Type shepherd_reduce( const int team_size , const Type & value ) const
-    {
-      *shepherd_team_scratch_value<Type>() = value ;
-
-      memory_fence();
-
-      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
-
-      int n , j ;
-
-      // Fan-in: wait for the team members below this one in the tree
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-      else {
-        // The slot is accessed through a pointer to volatile, so the reference
-        // has to be volatile as well (as in the JoinOp overload).
-        volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
-
-        // Bounded by team_size: the fan-in loop leaves 'n' at the smallest power
-        // of two >= team_size, which overshoots for non power of two team sizes.
-        for ( int i = 1 ; i < team_size ; ++i ) {
-          accum += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
-        }
-        for ( int i = 1 ; i < team_size ; ++i ) {
-          * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
-        }
-
-        memory_fence();
-      }
-
-      // Fan-out: release the team members that were waited on above
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-
-      return *shepherd_team_scratch_value<Type>();
-    }
-
-  /** Reduce 'value' with 'op.join' over the first 'team_size' entries of this
-   *  worker's shepherd view.
-   *
-   *  Every caller publishes 'value' in its team scratch slot and enters the same tree
-   *  barrier as shepherd_barrier. The worker with rev_rank == 0 joins the slots,
-   *  writes the result back to every slot and then releases the others.
-   *  Returns the content of the caller's own slot after the barrier.
-   */
-  template< class JoinOp >
-  inline
-  typename JoinOp::value_type
-    shepherd_reduce( const int team_size
-                   , const typename JoinOp::value_type & value
-                   , const JoinOp & op ) const
-    {
-      typedef typename JoinOp::value_type Type ;
-
-      *shepherd_team_scratch_value<Type>() = value ;
-
-      memory_fence();
-
-      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
-
-      int n , j ;
-
-      // Fan-in: wait for the team members below this one in the tree
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-      else {
-        volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
-
-        // Bounded by team_size: the fan-in loop leaves 'n' at the smallest power
-        // of two >= team_size, which overshoots for non power of two team sizes.
-        for ( int i = 1 ; i < team_size ; ++i ) {
-          op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
-        }
-        for ( int i = 1 ; i < team_size ; ++i ) {
-          * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
-        }
-
-        memory_fence();
-      }
-
-      // Fan-out: release the team members that were waited on above
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-
-      return *shepherd_team_scratch_value<Type>();
-    }
-
-  /** Exclusive prefix sum of 'value' over the first 'team_size' entries of this
-   *  worker's shepherd view.
-   *
-   *  Every caller publishes 'value' in its team scratch slot and enters the same tree
-   *  barrier as shepherd_barrier. The worker with rev_rank == 0 rewrites the slots so
-   *  that each holds the sum of the values of the lower ranking entries, offset by the
-   *  previous content of '*global_value' when that pointer is given; in that case
-   *  '*global_value' is atomically increased by the team total. Without 'global_value'
-   *  the offset is 0. Returns the content of the caller's own slot after the barrier.
-   */
-  template< class Type >
-  inline
-  Type shepherd_scan( const int team_size
-                    , const Type & value
-                    ,       Type * const global_value = 0 ) const
-    {
-      *shepherd_team_scratch_value<Type>() = value ;
-
-      memory_fence();
-
-      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
-
-      int n , j ;
-
-      // Fan-in: wait for the team members below this one in the tree
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-      else {
-        // Root thread scans across values before releasing threads
-        // Worker data is in reverse order, so m_shepherd_base[0] is the 
-        // highest ranking thread.
-
-        // All loops are bounded by team_size: the fan-in loop leaves 'n' at the
-        // smallest power of two >= team_size, which overshoots for non power of
-        // two team sizes.
-
-        // Copy from lower ranking to higher ranking worker.
-
-        Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
-        for ( int i = 1 ; i < team_size ; ++i ) {
-          const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
-          accum += tmp ;
-          * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ;
-        }
-
-        // Lowest ranking entry starts at the global offset (or zero).
-        * m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
-          global_value ? atomic_fetch_add( global_value , accum ) : 0 ;
-
-        // Join from lower ranking to higher ranking worker.
-        for ( int i = team_size - 1 ; 0 < i ; --i ) {
-          * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
-        }
-
-        memory_fence();
-      }
-    
-      // Fan-out: release the team members that were waited on above
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-
-      return *shepherd_team_scratch_value<Type>();
-    }
-
-  //----------------------------------------
-
-  /** \brief  Round 'size' up to the next multiple of the 64 byte allocation grain. */
-  static inline
-  int align_alloc( int size )
-    {
-      enum { GRAIN_BYTES = 64 /* must stay a power of two for the mask below */ };
-      enum { GRAIN_LOW_BITS = GRAIN_BYTES - 1 };
-      const int bumped = size + GRAIN_LOW_BITS ;
-      return bumped & ~GRAIN_LOW_BITS ;
-    }
-
-  // Defined elsewhere.  QthreadTeamPolicyMember calls this with its team
-  // scratch space on construction and again after every next_team().
-  void shared_reset( Qthread::scratch_memory_space & );
-
-  // Returns m_scratch_alloc.  The parallel reduce and scan drivers pass this
-  // pointer to ValueInit::init to obtain the worker's local reduction value.
-  void * exec_all_reduce_value() const { return m_scratch_alloc ; }
-
-  // Defined elsewhere.  The parallel reduce constructors read the final
-  // reduction data through this pointer after exec_all has returned.
-  static void * exec_all_reduce_result();
-
-  // Defined elsewhere.  Called before exec_all with the reduction value size
-  // and the team shared memory size (either may be zero).
-  static void resize_worker_scratch( const int reduce_size , const int shared_size );
-  // Defined elsewhere; no caller is visible in this part of the file.
-  static void clear_workers();
-
-  //----------------------------------------
-
-  // Read-only accessors for this worker's position; each returns the
-  // correspondingly named member unchanged.
-
-  // Rank and count over all workers.
-  int worker_rank() const
-    { return m_worker_rank ; }
-  int worker_size() const
-    { return m_worker_size ; }
-
-  // Rank and count of workers within this worker's shepherd.
-  int shepherd_worker_rank() const
-    { return m_shepherd_worker_rank ; }
-  int shepherd_worker_size() const
-    { return m_shepherd_worker_size ; }
-
-  // Rank and count of shepherds.
-  int shepherd_rank() const
-    { return m_shepherd_rank ; }
-  int shepherd_size() const
-    { return m_shepherd_size ; }
-};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Handle given to a team functor: identifies the calling thread
- *          within its team and league, and exposes the team collectives.
- *
- *  The collectives forward to QthreadExec when compiled for the host
- *  execution space; otherwise they are empty stubs returning Type().
- */
-class QthreadTeamPolicyMember {
-private:
-
-  typedef Kokkos::Qthread                        execution_space ;
-  typedef execution_space::scratch_memory_space  scratch_memory_space ;
-
-  // Declaration order is significant: the constructor computes
-  // m_league_rank from the already initialized m_league_end.
-
-        Impl::QthreadExec   & m_exec ;
-  scratch_memory_space        m_team_shared ;
-  const int                   m_team_size ;
-  const int                   m_team_rank ;
-  const int                   m_league_size ;
-  const int                   m_league_end ;
-        int                   m_league_rank ;
-
-public:
-
-  /** \brief  Team shared scratch memory. */
-  KOKKOS_INLINE_FUNCTION
-  const scratch_memory_space & team_shmem() const
-    { return m_team_shared ; }
-
-  KOKKOS_INLINE_FUNCTION
-  int league_rank() const
-    { return m_league_rank ; }
-
-  KOKKOS_INLINE_FUNCTION
-  int league_size() const
-    { return m_league_size ; }
-
-  KOKKOS_INLINE_FUNCTION
-  int team_rank() const
-    { return m_team_rank ; }
-
-  KOKKOS_INLINE_FUNCTION
-  int team_size() const
-    { return m_team_size ; }
-
-  /** \brief  Barrier across the team's threads. */
-  KOKKOS_INLINE_FUNCTION void team_barrier() const
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { m_exec.shepherd_barrier( m_team_size ); }
-#else
-    {}
-#endif
-
-  /** \brief  Intra-team reduction of 'value'. */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return m_exec.template shepherd_reduce<Type>( m_team_size , value ); }
-#else
-    { return Type(); }
-#endif
-
-  /** \brief  Intra-team reduction of 'value' using the join operator 'op'. */
-  template< typename JoinOp >
-  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
-    team_reduce( const typename JoinOp::value_type & value
-               , const JoinOp & op ) const
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return m_exec.template shepherd_reduce<JoinOp>( m_team_size , value , op ); }
-#else
-    { return typename JoinOp::value_type(); }
-#endif
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
-   *
-   *  The highest rank thread can obtain the reduction total with
-   *    reduction_total = dev.team_scan( value ) + value ;
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return m_exec.template shepherd_scan<Type>( m_team_size , value ); }
-#else
-    { return Type(); }
-#endif
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering,
-   *          based on a global accumulator shared between teams.
-   *
-   *  When the league's parallel execution has finished, the global
-   *  inter-team accumulation value holds the scan's total.
-   *  The order in which the league's teams execute is non-deterministic,
-   *  hence the base value of each team's scan is non-deterministic too.
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return m_exec.template shepherd_scan<Type>( m_team_size , value , global_accum ); }
-#else
-    { return Type(); }
-#endif
-
-  //----------------------------------------
-  // The remaining members exist for the parallel driver, which iterates as
-  //   for ( member_type i(exec,team) ; i ; i.next_team() ) { ... }
-
-  /** \brief  Start iteration: select this shepherd's range of league ranks
-   *          and reset the team shared scratch memory.
-   */
-  template< class Arg0 , class Arg1 >
-  QthreadTeamPolicyMember( Impl::QthreadExec & exec , const TeamPolicy<Arg0,Arg1,Qthread> & team )
-    : m_exec( exec )
-    , m_team_shared(0,0)
-    , m_team_size(   team.m_team_size )
-    , m_team_rank(   exec.shepherd_worker_rank() )
-    , m_league_size( team.m_league_size )
-    , m_league_end(  team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
-    , m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
-  {
-    m_exec.shared_reset( m_team_shared );
-  }
-
-  /** \brief  True while league ranks remain in this shepherd's range. */
-  operator bool () const
-    { return m_league_rank < m_league_end ; }
-
-  /** \brief  Advance to the next league rank and reset the team shared memory. */
-  void next_team()
-    {
-      ++m_league_rank ;
-      m_exec.shared_reset( m_team_shared );
-    }
-};
-
-} // namespace Impl
-
-/** \brief  Team execution policy for the Qthread execution space.
- *
- *  Stores the league size, the team size clamped to the number of workers
- *  per shepherd, and the number of league iterations per shepherd
- *  ( league_size divided by the shepherd count, rounded up ).
- */
-template< class Arg0 , class Arg1 >
-class TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread >
-{
-private:
-
-  const int m_league_size ;
-  const int m_team_size ;
-  const int m_shepherd_iter ;
-
-public:
-
-  //! Tag this class as a kokkos execution policy
-  typedef TeamPolicy  execution_policy ;
-  typedef Qthread     execution_space ;
-
-  // Arg0 is the work tag unless it names the execution space, in which case Arg1 is.
-  typedef typename
-    Impl::if_c< ! Impl::is_same< Kokkos::Qthread , Arg0 >::value , Arg0 , Arg1 >::type
-      work_tag ;
-
-  //----------------------------------------
-
-  /** \brief  Largest team size: the number of workers per shepherd. */
-  template< class FunctorType >
-  inline static
-  int team_size_max( const FunctorType & )
-    {
-      return Qthread::instance().shepherd_worker_size();
-    }
-
-  /** \brief  Recommended team size: identical to team_size_max. */
-  template< class FunctorType >
-  static int team_size_recommended( const FunctorType & f )
-    {
-      return team_size_max( f );
-    }
-
-  //----------------------------------------
-
-  inline int team_size() const
-    { return m_team_size ; }
-
-  inline int league_size() const
-    { return m_league_size ; }
-
-  /** \brief  One active team per shepherd, sized from the given Qthread instance. */
-  TeamPolicy( Kokkos::Qthread & q
-            , const int league_size
-            , const int team_size
-            )
-    : m_league_size( league_size )
-    , m_team_size( q.shepherd_worker_size() <= team_size
-                 ? q.shepherd_worker_size() : team_size )
-    , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
-    {}
-
-  /** \brief  One active team per shepherd, sized from Qthread::instance(). */
-  TeamPolicy( const int league_size
-            , const int team_size
-            )
-    : m_league_size( league_size )
-    , m_team_size( Qthread::instance().shepherd_worker_size() <= team_size
-                 ? Qthread::instance().shepherd_worker_size() : team_size )
-    , m_shepherd_iter( ( league_size + Qthread::instance().shepherd_size() - 1 ) / Qthread::instance().shepherd_size() )
-    {}
-
-  typedef Impl::QthreadTeamPolicyMember member_type ;
-
-  // The member reads the private sizes above when it is constructed.
-  friend class Impl::QthreadTeamPolicyMember ;
-};
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #define KOKKOS_QTHREADEXEC_HPP */
-
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
@ -1,418 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_QTHREAD_PARALLEL_HPP
-#define KOKKOS_QTHREAD_PARALLEL_HPP
-
-#include <vector>
-
-#include <Kokkos_Parallel.hpp>
-
-#include <impl/Kokkos_StaticAssert.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
-
-#include <Qthread/Kokkos_QthreadExec.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-
-/** \brief  parallel_for over a RangePolicy on the Qthread execution space.
- *
- *  Construction copies the functor and policy and immediately runs
- *  'execute' through QthreadExec::exec_all.
- */
-template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
-class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
-{
-private:
-
-  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >  Policy ;
-
-  const FunctorType  m_func ;
-  const Policy       m_policy ;
-
-  // Untagged functor: call functor( i ) for every index of 'range'.
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if<
-                 ( Impl::is_same< typename PType::work_tag , void >::value )
-                 , const FunctorType & >::type functor
-             , const PType & range )
-    {
-      const typename PType::member_type last = range.end();
-      typename PType::member_type iwork = range.begin();
-      while ( iwork < last ) {
-        functor( iwork );
-        ++iwork ;
-      }
-    }
-
-  // Tagged functor: call functor( work_tag() , i ) for every index of 'range'.
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if<
-                 ( ! Impl::is_same< typename PType::work_tag , void >::value )
-                 , const FunctorType & >::type functor
-             , const PType & range )
-    {
-      const typename PType::member_type last = range.end();
-      typename PType::member_type iwork = range.begin();
-      while ( iwork < last ) {
-        functor( typename PType::work_tag() , iwork );
-        ++iwork ;
-      }
-    }
-
-  // Function is called once by every concurrent thread.
-  // 'arg' is the ParallelFor instance handed to exec_all by the constructor.
-  static void execute( QthreadExec & exec , const void * arg )
-  {
-    const ParallelFor & self = * static_cast< const ParallelFor * >( arg );
-
-    // This worker's share of the policy's range.
-    const typename Policy::WorkRange my_range( self.m_policy , exec.worker_rank() , exec.worker_size() );
-
-    driver( self.m_func , my_range );
-
-    // All threads wait for completion.
-    exec.exec_all_barrier();
-  }
-
-public:
-
-  ParallelFor( const FunctorType & functor
-             , const Policy      & policy
-             )
-    : m_func( functor )
-    , m_policy( policy )
-    {
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
-    }
-};
-
-//----------------------------------------------------------------------------
-
-/** \brief  parallel_reduce over a RangePolicy on the Qthread execution space.
- *
- *  Construction sizes the per-worker reduction scratch, runs 'execute'
- *  through QthreadExec::exec_all, applies the functor's final step and
- *  copies the result into 'result_view' when that view has storage.
- */
-template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
-class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
-{
-private:
-
-  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >  Policy ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , typename Policy::work_tag > ValueInit ;
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-  const FunctorType  m_func ;
-  const Policy       m_policy ;
-
-  // Untagged functor: call functor( i , update ) for every index of 'range'.
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if<
-                 ( Impl::is_same< typename PType::work_tag , void >::value )
-                 , const FunctorType & >::type functor
-             , reference_type update
-             , const PType & range )
-    {
-      const typename PType::member_type last = range.end();
-      typename PType::member_type iwork = range.begin();
-      while ( iwork < last ) {
-        functor( iwork , update );
-        ++iwork ;
-      }
-    }
-
-  // Tagged functor: call functor( work_tag() , i , update ) for every index of 'range'.
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if<
-                 ( ! Impl::is_same< typename PType::work_tag , void >::value )
-                 , const FunctorType & >::type functor
-             , reference_type update
-             , const PType & range )
-    {
-      const typename PType::member_type last = range.end();
-      typename PType::member_type iwork = range.begin();
-      while ( iwork < last ) {
-        functor( typename PType::work_tag() , iwork , update );
-        ++iwork ;
-      }
-    }
-
-  // Run by exec_all; 'arg' is the ParallelReduce instance passed by the constructor.
-  static void execute( QthreadExec & exec , const void * arg )
-  {
-    const ParallelReduce & self = * static_cast< const ParallelReduce * >( arg );
-
-    // Initialize the worker-local value inside the worker's reduction scratch.
-    reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
-
-    // This worker's share of the policy's range.
-    const typename Policy::WorkRange my_range( self.m_policy , exec.worker_rank() , exec.worker_size() );
-
-    driver( self.m_func , update , my_range );
-
-    exec.template exec_all_reduce<FunctorType, typename Policy::work_tag >( self.m_func );
-  }
-
-public:
-
-  template< class HostViewType >
-  ParallelReduce( const FunctorType  & functor
-                , const Policy       & policy
-                , const HostViewType & result_view )
-    : m_func( functor )
-    , m_policy( policy )
-    {
-      // Reduction scratch only; no team shared memory is needed.
-      QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 );
-
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
-
-      const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
-
-      Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data );
-
-      // Nothing to copy when the result view has no storage.
-      if ( ! result_view.ptr_on_device() ) { return ; }
-
-      const unsigned count = ValueTraits::value_count( m_func );
-      for ( unsigned k = 0 ; k < count ; ++k ) {
-        result_view.ptr_on_device()[k] = data[k];
-      }
-    }
-};
-
-//----------------------------------------------------------------------------
-
-/** \brief  parallel_for over a TeamPolicy on the Qthread execution space.
- *
- *  Construction sizes the team shared scratch memory and immediately runs
- *  'execute' through QthreadExec::exec_all.
- */
-template< class FunctorType , class Arg0 , class Arg1 >
-class ParallelFor< FunctorType , TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > >
-{
-private:
-
-  typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread >  Policy ;
-
-  const FunctorType  m_func ;
-  const Policy       m_team ;
-
-  // Untagged functor call.
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION
-  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
-                 const typename Policy::member_type & >::type member ) const
-    {
-      m_func( member );
-    }
-
-  // Tagged functor call.
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION
-  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
-                 const typename Policy::member_type & >::type member ) const
-    {
-      m_func( TagType() , member );
-    }
-
-  // Run by exec_all; 'arg' is the ParallelFor instance passed by the constructor.
-  static void execute( QthreadExec & exec , const void * arg )
-  {
-    const ParallelFor & self = * static_cast< const ParallelFor * >( arg );
-
-    // Visit every league rank of the member's range; the team
-    // synchronizes after each functor call before moving on.
-    for ( typename Policy::member_type member( exec , self.m_team ) ; member ; member.next_team() ) {
-      self.ParallelFor::template driver< typename Policy::work_tag >( member );
-      member.team_barrier();
-    }
-
-    exec.exec_all_barrier();
-  }
-
-public:
-
-  ParallelFor( const FunctorType & functor ,
-               const Policy      & policy )
-    : m_func( functor )
-    , m_team( policy )
-    {
-      QthreadExec::resize_worker_scratch
-        ( /* reduction   memory */ 0
-        , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
-
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
-    }
-};
-
-//----------------------------------------------------------------------------
-
-/** \brief  parallel_reduce over a TeamPolicy on the Qthread execution space.
- *
- *  Construction sizes the reduction and team shared scratch memory, runs
- *  'execute' through QthreadExec::exec_all, applies the functor's final
- *  step and copies the result into 'result' when that view has storage.
- */
-template< class FunctorType , class Arg0 , class Arg1 >
-class ParallelReduce< FunctorType , TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > >
-{
-private:
-
-  typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread >  Policy ;
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , typename Policy::work_tag > ValueInit ;
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-  const FunctorType  m_func ;
-  const Policy       m_team ;
-
-  // Untagged functor call.
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION
-  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
-                 const typename Policy::member_type & >::type member
-             , reference_type update ) const
-    { m_func( member , update ); }
-
-  // Tagged functor call.
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION
-  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
-                 const typename Policy::member_type & >::type member
-             , reference_type update ) const
-    { m_func( TagType() , member , update ); }
-
-  // Run by exec_all; 'arg' is the ParallelReduce instance passed by the constructor.
-  static void execute( QthreadExec & exec , const void * arg )
-  {
-    const ParallelReduce & self = * ((const ParallelReduce *) arg );
-
-    // Initialize thread-local value
-    reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
-
-    typename Policy::member_type member( exec , self.m_team );
-
-    // Visit every league rank of the member's range; the team
-    // synchronizes after each functor call before moving on.
-    while ( member ) {
-      self.ParallelReduce::template driver< typename Policy::work_tag >( member , update );
-      member.team_barrier();
-      member.next_team();
-    }
-
-    exec.template exec_all_reduce< FunctorType , typename Policy::work_tag >( self.m_func );
-  }
-
-public:
-
-  template< class ViewType >
-  ParallelReduce( const FunctorType & functor ,
-                  const Policy      & policy ,
-                  const ViewType    & result )
-    : m_func( functor )
-    , m_team( policy )
-    {
-      QthreadExec::resize_worker_scratch
-        ( /* reduction   memory */ ValueTraits::value_size( functor )
-        , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
-
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
-
-      const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
-
-      Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data );
-
-      // Only copy out when the result view has storage, matching the
-      // RangePolicy specialization of ParallelReduce; writing through a
-      // null pointer would otherwise be undefined behavior.
-      if ( result.ptr_on_device() ) {
-        const unsigned n = ValueTraits::value_count( m_func );
-        for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
-      }
-    }
-};
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-/** \brief  parallel_scan over a RangePolicy on the Qthread execution space.
- *
- *  Each worker runs its range twice: once with final == false to produce
- *  its local total, then, after exec_all_scan, once with final == true.
- */
-template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
-class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
-{
-private:
-
-  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >  Policy ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , typename Policy::work_tag > ValueInit ;
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-  const FunctorType  m_func ;
-  const Policy       m_policy ;
-
-  // Untagged functor: call functor( i , update , final ) for every index of 'range'.
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if<
-                 ( Impl::is_same< typename PType::work_tag , void >::value )
-                 , const FunctorType & >::type functor
-             , reference_type update
-             , const bool    final
-             , const PType & range )
-    {
-      const typename PType::member_type last = range.end();
-      typename PType::member_type iwork = range.begin();
-      while ( iwork < last ) {
-        functor( iwork , update , final );
-        ++iwork ;
-      }
-    }
-
-  // Tagged functor: call functor( work_tag() , i , update , final ) for every index of 'range'.
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if<
-                 ( ! Impl::is_same< typename PType::work_tag , void >::value )
-                 , const FunctorType & >::type functor
-             , reference_type update
-             , const bool    final
-             , const PType & range )
-    {
-      const typename PType::member_type last = range.end();
-      typename PType::member_type iwork = range.begin();
-      while ( iwork < last ) {
-        functor( typename PType::work_tag() , iwork , update , final );
-        ++iwork ;
-      }
-    }
-
-  // Run by exec_all; 'arg' is the ParallelScan instance passed by the constructor.
-  static void execute( QthreadExec & exec , const void * arg )
-  {
-    const ParallelScan & self = * static_cast< const ParallelScan * >( arg );
-
-    // This worker's share of the policy's range.
-    const typename Policy::WorkRange my_range( self.m_policy , exec.worker_rank() , exec.worker_size() );
-
-    // Initialize the worker-local value inside the worker's reduction scratch.
-    reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
-
-    // First pass: final == false.
-    driver( self.m_func , update , false , my_range );
-
-    // Scan across the workers' values.
-    exec.template exec_all_scan< FunctorType , typename Policy::work_tag >( self.m_func );
-
-    // Second pass: final == true.
-    driver( self.m_func , update , true , my_range );
-
-    exec.exec_all_barrier();
-  }
-
-public:
-
-  ParallelScan( const FunctorType & functor
-              , const Policy      & policy
-              )
-    : m_func( functor )
-    , m_policy( policy )
-    {
-      // Reduction scratch only; no team shared memory is needed.
-      QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 );
-
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::execute , this );
-    }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #define KOKKOS_QTHREAD_PARALLEL_HPP */
-
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
@ -1,299 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-// Experimental unified task-data parallel manycore LDRD
-
-#include <Kokkos_Core_fwd.hpp>
-
-#if defined( KOKKOS_HAVE_QTHREAD )
-
-#include <stdio.h>
-
-#include <stdlib.h>
-#include <stdexcept>
-#include <iostream>
-#include <sstream>
-#include <string>
-
-#include <Kokkos_Atomic.hpp>
-#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-typedef TaskMember< Kokkos::Qthread , void , void > Task ;
-
-namespace {
-
-// Round 'sizeof_derived' up to the next multiple of sizeof(Task*).
-inline
-unsigned padded_sizeof_derived( unsigned sizeof_derived )
-{
-  const unsigned excess = sizeof_derived % sizeof(Task*) ;
-
-  if ( excess ) {
-    return sizeof_derived + ( sizeof(Task*) - excess );
-  }
-
-  return sizeof_derived ;
-}
-
-} // namespace
-
-// Release task memory obtained from Task::allocate.
-//
-// No lock is taken: 'free' is relied upon to be thread safe.  The call is
-// kept in this one function so that a lock/unlock pair can be added
-// around it later if that ever becomes necessary.
-void Task::deallocate( void * ptr )
-{
-  free( ptr );
-}
-
-// Allocate memory for a derived task followed by its dependence array:
-// the derived size padded to a multiple of sizeof(Task*), plus one Task*
-// per unit of dependence capacity.  Returns whatever 'malloc' returns.
-//
-// No lock is taken: 'malloc' is relied upon to be thread safe.  The call is
-// kept in this one function so that a lock/unlock pair can be added
-// around it later if that ever becomes necessary.
-void * Task::allocate( const unsigned arg_sizeof_derived
-                     , const unsigned arg_dependence_capacity )
-{
-  const unsigned derived_bytes = padded_sizeof_derived( arg_sizeof_derived );
-
-  void * const ptr = malloc( derived_bytes + arg_dependence_capacity * sizeof(Task*) );
-
-  return ptr ;
-}
-
-// Nothing to release here.
-Task::~TaskMember()
-{}
-
-
-// Construct a task with an explicit type verification function.
-// The dependence array is placed directly behind the derived object, at
-// the padded derived size, matching the layout produced by Task::allocate.
-Task::TaskMember( const function_verify_type   arg_verify
-                , const function_dealloc_type  arg_dealloc
-                , const function_apply_type    arg_apply
-                , const unsigned               arg_sizeof_derived
-                , const unsigned               arg_dependence_capacity
-                )
-  : m_dealloc( arg_dealloc )
-  , m_verify(  arg_verify )
-  , m_apply(   arg_apply )
-  , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
-  , m_dep_capacity( arg_dependence_capacity )
-  , m_dep_size( 0 )
-  , m_ref_count( 0 )
-  , m_state( Kokkos::TASK_STATE_CONSTRUCTING )
-  , m_qfeb(0)
-{
-  // The FEB starts empty; qthread_func fills it when the task completes.
-  qthread_empty( & m_qfeb );
-
-  // Start with every dependence slot cleared.
-  unsigned idep = 0 ;
-  while ( idep < arg_dependence_capacity ) {
-    m_dep[ idep ] = 0 ;
-    ++idep ;
-  }
-}
-
-// Construct a task that uses Task::verify_type<void> as its verification
-// function.  The dependence array is placed directly behind the derived
-// object, at the padded derived size, matching Task::allocate.
-Task::TaskMember( const function_dealloc_type  arg_dealloc
-                , const function_apply_type    arg_apply
-                , const unsigned               arg_sizeof_derived
-                , const unsigned               arg_dependence_capacity
-                )
-  : m_dealloc( arg_dealloc )
-  , m_verify(  & Task::verify_type<void> )
-  , m_apply(   arg_apply )
-  , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
-  , m_dep_capacity( arg_dependence_capacity )
-  , m_dep_size( 0 )
-  , m_ref_count( 0 )
-  , m_state( Kokkos::TASK_STATE_CONSTRUCTING )
-  , m_qfeb(0)
-{
-  // The FEB starts empty; qthread_func fills it when the task completes.
-  qthread_empty( & m_qfeb );
-
-  // Start with every dependence slot cleared.
-  unsigned idep = 0 ;
-  while ( idep < arg_dependence_capacity ) {
-    m_dep[ idep ] = 0 ;
-    ++idep ;
-  }
-}
-
-//----------------------------------------------------------------------------
-
-// Report a failed add_dependence: print the task's state and current
-// dependence count to std::cerr, then throw std::runtime_error.
-void Task::throw_error_add_dependence() const
-{
-  static const char msg[] = "TaskMember< Qthread >::add_dependence ERROR" ;
-
-  std::cerr << msg ;
-  std::cerr << " state(" << m_state << ")" ;
-  std::cerr << " dep_size(" << m_dep_size << ")" ;
-  std::cerr << std::endl ;
-
-  throw std::runtime_error( msg );
-}
-
-// Report a failed type verification by throwing std::runtime_error.
-void Task::throw_error_verify_type()
-{
-  static const char msg[] = "TaskMember< Qthread >::verify_type ERROR" ;
-
-  throw std::runtime_error( msg );
-}
-
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-// Reference counted assignment of a task pointer: '*lhs = rhs'.
-//
-// Releases the reference held through '*lhs' (deallocating that task via
-// its m_dealloc function when the count drops to zero), acquires a
-// reference on 'rhs' when it is non-null, and stores 'rhs' in '*lhs'.
-//
-//   lhs       location of the task pointer being overwritten
-//   rhs       new task, may be null to simply release '*lhs'
-//   no_throw  when true, errors are printed to std::cerr instead of
-//             being thrown as std::runtime_error
-void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
-{
-  static const char msg_error_header[]      = "Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR" ;
-  static const char msg_error_count[]       = ": negative reference count" ;
-  static const char msg_error_complete[]    = ": destroy task that is not complete" ;
-  static const char msg_error_dependences[] = ": destroy task that has dependences" ;
-  static const char msg_error_exception[]   = ": caught internal exception" ;
-
-  // Null while no error has been detected.
-  const char * msg_error = 0 ;
-
-  try {
-
-    if ( *lhs ) {
-
-      // atomic_fetch_add returns the count held before the decrement.
-      const int count = Kokkos::atomic_fetch_add( & (**lhs).m_ref_count , -1 );
-
-      if ( 1 == count ) {
-
-        // Reference count at zero, delete it
-
-        // Should only be deallocating a completed task
-        if ( (**lhs).m_state == Kokkos::TASK_STATE_COMPLETE ) {
-
-          // A completed task should not have dependences...
-          for ( int i = 0 ; i < (**lhs).m_dep_size && 0 == msg_error ; ++i ) {
-            if ( (**lhs).m_dep[i] ) msg_error = msg_error_dependences ;
-          }
-        }
-        else {
-          msg_error = msg_error_complete ;
-        }
-
-        if ( 0 == msg_error ) {
-          // Get deletion function and apply it
-          const Task::function_dealloc_type d = (**lhs).m_dealloc ;
-
-          (*d)( *lhs );
-        }
-      }
-      else if ( count <= 0 ) {
-        // The count was already zero or negative before this release.
-        msg_error = msg_error_count ;
-      }
-    }
-
-    // Acquire a reference on the new task only when no error was detected.
-    if ( 0 == msg_error && rhs ) { Kokkos::atomic_fetch_add( & (*rhs).m_ref_count , 1 ); }
-
-    // NOTE(review): this store also happens on the error path, where the
-    // reference on 'rhs' was not acquired above - confirm this is intended.
-    *lhs = rhs ;
-  }
-  catch( ... ) {
-    // Keep an already detected error; otherwise report the exception itself.
-    if ( 0 == msg_error ) msg_error = msg_error_exception ;
-  }
-
-  if ( 0 != msg_error ) {
-    if ( no_throw ) {
-      std::cerr << msg_error_header << msg_error << std::endl ;
-      std::cerr.flush();
-    }
-    else {
-      std::string msg(msg_error_header);
-      msg.append(msg_error);
-      throw std::runtime_error( msg );
-    }
-  }
-}
-#endif
-
-
-//----------------------------------------------------------------------------
-
-// Entry point handed to qthread_spawn by Task::schedule; 'arg' is the task.
-// Marks the task as executing, applies it and, unless the apply function
-// changed the task's state, completes it.  Always returns 0.
-aligned_t Task::qthread_func( void * arg )
-{
-  Task * const self = static_cast< Task * >( arg );
-
-  self->m_state = Kokkos::TASK_STATE_EXECUTING ;
-
-  (*self->m_apply)( self );
-
-  // A state other than EXECUTING means the task respawned: not complete yet.
-  if ( self->m_state != Kokkos::TASK_STATE_EXECUTING ) { return 0 ; }
-
-  self->m_state = Kokkos::TASK_STATE_COMPLETE ;
-
-  // Release dependences before allowing dependent tasks to run.
-  // Otherwise there is a thread race condition for removing dependences.
-  int idep = 0 ;
-  while ( idep < self->m_dep_size ) {
-    assign( & self->m_dep[ idep ] , 0 );
-    ++idep ;
-  }
-
-  // Set qthread FEB to full so that dependent tasks are allowed to execute
-  qthread_fill( & self->m_qfeb );
-
-  return 0 ;
-}
-
-// Hand this task to qthread for execution once its dependences complete.
-//
-// Builds the precondition array qthread expects: element 0 holds the
-// number of dependences and elements 1..m_dep_size point at the FEB of
-// each dependence.  The task is marked WAITING and spawned with
-// Task::qthread_func as its entry point.
-//
-// Throws std::runtime_error when the precondition array cannot be allocated.
-void Task::schedule()
-{
-  // Is waiting for execution
-
-  // spawn in qthread.  must malloc the precondition array and give to qthread.
-  // qthread will eventually free this allocation so memory will not be leaked.
-
-  // concern with thread safety of malloc, does this need to be guarded?
-  aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );
-
-  // Fail loudly instead of writing through a null pointer below.
-  // The task's state is left untouched in that case.
-  if ( 0 == qprecon ) {
-    throw std::runtime_error("TaskMember< Qthread >::schedule ERROR: failed to allocate precondition array");
-  }
-
-  qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
-
-  for ( int i = 0 ; i < m_dep_size ; ++i ) {
-    qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
-  }
-
-  m_state = Kokkos::TASK_STATE_WAITING ;
-
-  qthread_spawn( & Task::qthread_func , this , 0 , NULL
-               , m_dep_size , qprecon
-               , NO_SHEPHERD , QTHREAD_SPAWN_SIMPLE );
-}
-
-// Block until the task referenced by 'f' has filled its FEB, which
-// qthread_func does on completion.  A future without a task returns at once.
-void Task::wait( const Future< void, Kokkos::Qthread> & f )
-{
-  if ( ! f.m_task ) { return ; }
-
-  // The value read from the FEB is not used; only the wait matters.
-  aligned_t tmp ;
-  qthread_readFF( & tmp , & f.m_task->m_qfeb );
-}
-
-} // namespace Impl
-} // namespace Kokkos
-
-#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */
-
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
@ -1,736 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-// Experimental unified task-data parallel manycore LDRD
-
-#ifndef KOKKOS_QTHREAD_TASKPOLICY_HPP
-#define KOKKOS_QTHREAD_TASKPOLICY_HPP
-
-#include <string>
-#include <typeinfo>
-#include <stdexcept>
-
-#include <qthread.h>
-
-#include <Kokkos_Qthread.hpp>
-#include <Kokkos_TaskPolicy.hpp>
-#include <Kokkos_View.hpp>
-
-#include <impl/Kokkos_FunctorAdapter.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Root task type for the Qthread execution space.
- *
- *  Stores the type-erased deallocate / verify / apply function pointers,
- *  the dependence array with its capacity and size, a reference count,
- *  the task state, and the qthread full/empty bit.
- */
-template<>
-class TaskMember< Kokkos::Qthread , void , void >
-{
-public:
-
-  // Signatures of the type-erased operations stored in each task.
-  typedef void         (* function_apply_type)  ( TaskMember * );
-  typedef void         (* function_dealloc_type)( TaskMember * );
-  typedef TaskMember * (* function_verify_type) ( TaskMember * );
-
-private:
-
-  // Member order fixes the initialization order; the constructors are
-  // defined outside of this class body.
-  const function_dealloc_type  m_dealloc ; ///< Deallocation function
-  const function_verify_type   m_verify ;  ///< Result type verification function
-  const function_apply_type    m_apply ;   ///< Apply function
-  TaskMember ** const          m_dep ;     ///< Array of dependences
-  const int                    m_dep_capacity ; ///< Capacity of the dependence array
-  int                          m_dep_size ;     ///< Number of dependences currently held
-  int                          m_ref_count ;    ///< Reference count
-  int                          m_state ;        ///< State of the task
-  aligned_t                    m_qfeb ;         ///< Qthread full/empty bit
-
-  // Declared but intentionally unusable (pre-C++11 stand-in for "= delete").
-  TaskMember() /* = delete */ ;
-  TaskMember( const TaskMember & ) /* = delete */ ;
-  TaskMember & operator = ( const TaskMember & ) /* = delete */ ;
-
-  static aligned_t qthread_func( void * arg );
-
-  static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity );
-  static void   deallocate( void * );
-
-  void throw_error_add_dependence() const ;
-  static void throw_error_verify_type();
-
-  /** \brief  Destroy a task as its derived type, then release its storage. */
-  template < class DerivedTaskType >
-  static
-  void deallocate( TaskMember * t )
-    {
-      DerivedTaskType * const derived = static_cast< DerivedTaskType * >(t);
-      derived->~DerivedTaskType();
-      deallocate( (void *) derived );
-    }
-
-protected :
-
-  ~TaskMember();
-
-  // Constructor used by TaskMember< Qthread , ResultType , void >
-  TaskMember( const function_verify_type   arg_verify
-            , const function_dealloc_type  arg_dealloc
-            , const function_apply_type    arg_apply
-            , const unsigned               arg_sizeof_derived
-            , const unsigned               arg_dependence_capacity
-            );
-
-  // Constructor used for TaskMember< Qthread , void , void >
-  TaskMember( const function_dealloc_type  arg_dealloc
-            , const function_apply_type    arg_apply
-            , const unsigned               arg_sizeof_derived
-            , const unsigned               arg_dependence_capacity
-            );
-
-public:
-
-  /** \brief  Check that a task was created for the given ResultType.
-   *
-   *  The check compares the task's stored verify function against this
-   *  instantiation.  For ResultType == void, or a null task, the input is
-   *  returned unchanged.  On mismatch an error is thrown when compiled for
-   *  the host; otherwise null is returned.
-   */
-  template< typename ResultType >
-  KOKKOS_FUNCTION static
-  TaskMember * verify_type( TaskMember * t )
-    {
-      enum { check_type = ! Impl::is_same< ResultType , void >::value };
-
-      if ( ! check_type || 0 == t ) { return t ; }
-
-      // The task is of the right type only if its m_verify is this function.
-      const function_verify_type expected = & TaskMember::template verify_type< ResultType > ;
-
-      if ( expected == t->m_verify ) { return t ; }
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      throw_error_verify_type();
-#endif
-      return 0 ;
-    }
-
-  //----------------------------------------
-  /*  Inheritance requirements on task types:
-   *    typedef  FunctorType::value_type  value_type ;
-   *    class DerivedTaskType
-   *      : public TaskMember< Qthread , value_type , FunctorType >
-   *      { ... };
-   *    class TaskMember< Qthread , value_type , FunctorType >
-   *      : public TaskMember< Qthread , value_type , void >
-   *      , public FunctorType
-   *      { ... };
-   *  If value_type != void
-   *    class TaskMember< Qthread , value_type , void >
-   *      : public TaskMember< Qthread , void , void >
-   *
-   *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
-   *
-   */
-
-  /** \brief  Allocate and construct a single-thread task */
-  template< class DerivedTaskType >
-  static
-  TaskMember * create( const typename DerivedTaskType::functor_type &  arg_functor
-                     , const unsigned                                  arg_dependence_capacity )
-    {
-      typedef typename DerivedTaskType::functor_type  functor_type ;
-      typedef typename functor_type::value_type       value_type ;
-
-      void * const storage = allocate( sizeof(DerivedTaskType) , arg_dependence_capacity );
-
-      DerivedTaskType * const created =
-        new( storage )
-          DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
-                         , & TaskMember::template apply_single< functor_type , value_type >
-                         , sizeof(DerivedTaskType)
-                         , arg_dependence_capacity
-                         , arg_functor );
-
-      return static_cast< TaskMember * >( created );
-    }
-
-  /** \brief  Allocate and construct a data parallel task */
-  template< class DerivedTaskType >
-  static
-  TaskMember * create( const typename DerivedTaskType::policy_type &   arg_policy
-                     , const typename DerivedTaskType::functor_type &  arg_functor
-                     , const unsigned                                  arg_dependence_capacity )
-    {
-      void * const storage = allocate( sizeof(DerivedTaskType) , arg_dependence_capacity );
-
-      DerivedTaskType * const created =
-        new( storage )
-          DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
-                         , sizeof(DerivedTaskType)
-                         , arg_dependence_capacity
-                         , arg_policy
-                         , arg_functor
-                         );
-
-      return static_cast< TaskMember * >( created );
-    }
-
-  void schedule();
-  static void wait( const Future< void , Kokkos::Qthread > & );
-
-  //----------------------------------------
-
-  // The root task has no result value; get() returns an error-marker type.
-  typedef FutureValueTypeIsVoidError get_result_type ;
-
-  KOKKOS_INLINE_FUNCTION
-  get_result_type get() const
-    { return get_result_type() ; }
-
-  KOKKOS_INLINE_FUNCTION
-  Kokkos::TaskState get_state() const
-    { return Kokkos::TaskState( m_state ); }
-
-  //----------------------------------------
-
-  // assign is only implemented for the host; elsewhere it is an empty stub.
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-  static
-  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false );
-#else
-  KOKKOS_INLINE_FUNCTION static
-  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {}
-#endif
-
-  /** \brief  i-th dependence, or null unless the task is EXECUTING and i is in range. */
-  KOKKOS_INLINE_FUNCTION
-  TaskMember * get_dependence( int i ) const
-    {
-      const bool available =
-        Kokkos::TASK_STATE_EXECUTING == m_state && 0 <= i && i < m_dep_size ;
-
-      return available ? m_dep[i] : (TaskMember*) 0 ;
-    }
-
-  /** \brief  Number of dependences currently held. */
-  KOKKOS_INLINE_FUNCTION
-  int get_dependence() const
-    { return m_dep_size ; }
-
-  /** \brief  Assign null to every held dependence slot and reset the count. */
-  KOKKOS_INLINE_FUNCTION
-  void clear_dependence()
-    {
-      for ( int d = 0 ; d < m_dep_size ; ++d ) {
-        assign( & m_dep[d] , 0 );
-      }
-      m_dep_size = 0 ;
-    }
-
-  /** \brief  Append a dependence.
-   *
-   *  Allowed only while CONSTRUCTING or EXECUTING and while capacity
-   *  remains; otherwise throw_error_add_dependence() is called.
-   */
-  KOKKOS_INLINE_FUNCTION
-  void add_dependence( TaskMember * before )
-    {
-      const bool state_allows =
-        Kokkos::TASK_STATE_CONSTRUCTING == m_state ||
-        Kokkos::TASK_STATE_EXECUTING    == m_state ;
-
-      if ( ! ( state_allows && m_dep_size < m_dep_capacity ) ) {
-        throw_error_add_dependence();
-      }
-      else {
-        assign( m_dep + m_dep_size , before );
-        ++m_dep_size ;
-      }
-    }
-
-  //----------------------------------------
-
-  /** \brief  Apply the functor of a task with a non-void result.
-   *
-   *  The root pointer is cast down to
-   *    TaskMember< Kokkos::Qthread , ResultType , FunctorType >
-   *      : public TaskMember< Kokkos::Qthread , ResultType , void >
-   *      , public FunctorType
-   *  and the functor is applied with the task's m_result.
-   */
-  template< class FunctorType , class ResultType >
-  KOKKOS_INLINE_FUNCTION static
-  void apply_single( typename Impl::enable_if< ! Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
-    {
-      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
-
-      derived_type & task = * static_cast< derived_type * >( t );
-
-      Impl::FunctorApply< FunctorType , void , ResultType & >::apply( (FunctorType &) task , & task.m_result );
-    }
-
-  /** \brief  Apply the functor of a task with a void result.
-   *
-   *  Same downcast as the non-void overload; the functor is applied
-   *  without a result argument.
-   */
-  template< class FunctorType , class ResultType >
-  KOKKOS_INLINE_FUNCTION static
-  void apply_single( typename Impl::enable_if< Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
-    {
-      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
-
-      derived_type & task = * static_cast< derived_type * >( t );
-
-      Impl::FunctorApply< FunctorType , void , void >::apply( (FunctorType &) task );
-    }
-};
-
-//----------------------------------------------------------------------------
-/** \brief  Base class for Qthread tasks that carry a result value.
- *
- *  FunctorType is void here because the Future class accesses the task and
- *  its result value through this type.
- *
- *  Derives from the TaskMember<S,void,void> 'root class' so that the Future
- *  class can correctly static_cast from the 'root class' to this class.
- */
-template < class ResultType >
-class TaskMember< Kokkos::Qthread , ResultType , void >
-  : public TaskMember< Kokkos::Qthread , void , void >
-{
-public:
-
-  typedef const ResultType & get_result_type ;
-
-  ResultType  m_result ;  ///< Result value of the task
-
-  /** \brief  Read-only access to the stored result. */
-  KOKKOS_INLINE_FUNCTION
-  get_result_type get() const
-    {
-      return m_result ;
-    }
-
-protected:
-
-  typedef TaskMember< Kokkos::Qthread , void , void >  task_root_type ;
-  typedef task_root_type::function_dealloc_type       function_dealloc_type ;
-  typedef task_root_type::function_apply_type         function_apply_type ;
-
-  /** \brief  Forward to the root constructor, registering the verify
-   *          function for ResultType, and value-initialize the result.
-   */
-  inline
-  TaskMember( const function_dealloc_type  arg_dealloc
-            , const function_apply_type    arg_apply
-            , const unsigned               arg_sizeof_derived
-            , const unsigned               arg_dependence_capacity
-            )
-    : task_root_type( & task_root_type::template verify_type< ResultType >
-                    , arg_dealloc , arg_apply
-                    , arg_sizeof_derived , arg_dependence_capacity )
-    , m_result()
-    {
-    }
-
-};
-
-/** \brief  Qthread task that combines the result-carrying base with its functor.
- *
- *  Inherits publicly from both TaskMember< Qthread , ResultType , void >
- *  and FunctorType; the functor is copy-constructed from the argument.
- */
-template< class ResultType , class FunctorType >
-class TaskMember< Kokkos::Qthread , ResultType , FunctorType >
-  : public TaskMember< Kokkos::Qthread , ResultType , void >
-  , public FunctorType
-{
-public:
-
-  typedef FunctorType  functor_type ;
-
-  typedef TaskMember< Kokkos::Qthread , void , void >        task_root_type ;
-  typedef TaskMember< Kokkos::Qthread , ResultType , void >  task_base_type ;
-  typedef task_root_type::function_dealloc_type             function_dealloc_type ;
-  typedef task_root_type::function_apply_type               function_apply_type ;
-
-  /** \brief  Pass the task bookkeeping arguments to the base and copy the functor. */
-  inline
-  TaskMember( const function_dealloc_type  arg_dealloc
-            , const function_apply_type    arg_apply
-            , const unsigned               arg_sizeof_derived
-            , const unsigned               arg_dependence_capacity
-            , const functor_type &         arg_functor
-            )
-    : task_base_type( arg_dealloc
-                    , arg_apply
-                    , arg_sizeof_derived
-                    , arg_dependence_capacity )
-    , functor_type( arg_functor )
-    {
-    }
-};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  ForEach task in the Qthread execution space
- *
- *  Derived from TaskMember< Kokkos::Qthread , ResultType , FunctorType >
- *  so that Functor can be cast to task root type without knowing policy.
- *
- *  Instances are created through the root task's create() (a friend);
- *  the constructor is private and copying is disabled.
- */
-template< class Arg0 , class Arg1 , class Arg2 , class ResultType , class FunctorType >
-class TaskForEach< Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >
-                 , ResultType
-                 , FunctorType >
-  : TaskMember< Kokkos::Qthread , ResultType , FunctorType >
-{
-public:
-
-  typedef FunctorType                                              functor_type ;
-  typedef RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >       policy_type ;
-
-private:
-
-  // The base class is inherited privately, so the policy and the root task
-  // need friendship to construct and cast instances of this type.
-  friend class Kokkos::TaskPolicy< Kokkos::Qthread > ;
-  friend class Kokkos::Impl::TaskMember< Kokkos::Qthread , void , void > ;
-
-  typedef TaskMember< Kokkos::Qthread , void , void >               task_root_type ;
-  typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType >  task_base_type ;
-  typedef task_root_type::function_dealloc_type                    function_dealloc_type ;
-
-  policy_type  m_policy ;  ///< Iteration range of the foreach
-
-  /** \brief  Call the functor for every index of the policy (no work tag). */
-  template< class Tag >
-  inline
-  typename Impl::enable_if< Impl::is_same<Tag,void>::value >::type
-    apply_policy() const
-    {
-      const typename policy_type::member_type e = m_policy.end();
-      for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
-        functor_type::operator()(i);
-      }
-    }
-
-  /** \brief  Call the functor for every index of the policy, passing a work tag instance. */
-  template< class Tag >
-  inline
-  typename Impl::enable_if< ! Impl::is_same<Tag,void>::value >::type
-    apply_policy() const
-    {
-      const Tag tag ;
-      const typename policy_type::member_type e = m_policy.end();
-      for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
-        functor_type::operator()(tag,i);
-      }
-    }
-
-  /** \brief  Apply function stored in the root task: run the range loop,
-   *          then the single-task apply of the functor.
-   */
-  static
-  void apply_parallel( task_root_type * t )
-    {
-      static_cast<TaskForEach*>(t)->template apply_policy< typename policy_type::work_tag >();
-
-      task_root_type::template apply_single< functor_type , ResultType >( t );
-    }
-
-  /** \brief  Construct the task.
-   *
-   *  The size and capacity are unsigned to match the base class
-   *  constructors and the values passed by create(), avoiding a
-   *  signed/unsigned round trip.
-   */
-  TaskForEach( const function_dealloc_type  arg_dealloc
-             , const unsigned               arg_sizeof_derived
-             , const unsigned               arg_dependence_capacity
-             , const policy_type &          arg_policy
-             , const functor_type &         arg_functor
-             )
-    : task_base_type( arg_dealloc
-                    , & apply_parallel
-                    , arg_sizeof_derived
-                    , arg_dependence_capacity
-                    , arg_functor )
-    , m_policy( arg_policy )
-    {}
-
-  // Declared but intentionally unusable (pre-C++11 stand-in for "= delete").
-  TaskForEach() /* = delete */ ;
-  TaskForEach( const TaskForEach & ) /* = delete */ ;
-  TaskForEach & operator = ( const TaskForEach & ) /* = delete */ ;
-};
-
-//----------------------------------------------------------------------------
-/** \brief  Reduce task in the Qthread execution space
- *
- *  Derived from TaskMember< Kokkos::Qthread , ResultType , FunctorType >
- *  so that Functor can be cast to task root type without knowing policy.
- *
- *  Instances are created through the root task's create() (a friend);
- *  the constructor is private and copying is disabled.
- */
-template< class Arg0 , class Arg1 , class Arg2 , class ResultType , class FunctorType >
-class TaskReduce< Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >
-                , ResultType
-                , FunctorType >
-  : TaskMember< Kokkos::Qthread , ResultType , FunctorType >
-{
-public:
-
-  typedef FunctorType                                              functor_type ;
-  typedef RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >       policy_type ;
-
-private:
-
-  // The base class is inherited privately, so the policy and the root task
-  // need friendship to construct and cast instances of this type.
-  friend class Kokkos::TaskPolicy< Kokkos::Qthread > ;
-  friend class Kokkos::Impl::TaskMember< Kokkos::Qthread , void , void > ;
-
-  typedef TaskMember< Kokkos::Qthread , void , void >               task_root_type ;
-  typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType >  task_base_type ;
-  typedef task_root_type::function_dealloc_type                    function_dealloc_type ;
-
-  policy_type  m_policy ;  ///< Iteration range of the reduction
-
-  /** \brief  Initialize the result, then call the functor with every
-   *          index of the policy and the result (no work tag).
-   */
-  template< class Tag >
-  inline
-  void apply_policy( typename Impl::enable_if< Impl::is_same<Tag,void>::value , ResultType & >::type result ) const
-    {
-      Impl::FunctorValueInit< functor_type , Tag >::init( *this , & result );
-      const typename policy_type::member_type e = m_policy.end();
-      for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
-        functor_type::operator()( i, result );
-      }
-    }
-
-  /** \brief  Initialize the result, then call the functor with a work tag
-   *          instance, every index of the policy, and the result.
-   */
-  template< class Tag >
-  inline
-  void apply_policy( typename Impl::enable_if< ! Impl::is_same<Tag,void>::value , ResultType & >::type result ) const
-    {
-      Impl::FunctorValueInit< functor_type , Tag >::init( *this , & result );
-      const Tag tag ;
-      const typename policy_type::member_type e = m_policy.end();
-      for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
-        functor_type::operator()( tag, i, result );
-      }
-    }
-
-  /** \brief  Apply function stored in the root task: run the reduction
-   *          loop into the task's m_result, then the single-task apply
-   *          of the functor.
-   */
-  static
-  void apply_parallel( task_root_type * t )
-    {
-      TaskReduce * const task = static_cast<TaskReduce*>(t);
-
-      task->template apply_policy< typename policy_type::work_tag >( task->task_base_type::m_result );
-
-      task_root_type::template apply_single< functor_type , ResultType >( t );
-    }
-
-  /** \brief  Construct the task.
-   *
-   *  The size and capacity are unsigned to match the base class
-   *  constructors and the values passed by create(), avoiding a
-   *  signed/unsigned round trip.
-   */
-  TaskReduce( const function_dealloc_type  arg_dealloc
-            , const unsigned               arg_sizeof_derived
-            , const unsigned               arg_dependence_capacity
-            , const policy_type &          arg_policy
-            , const functor_type &         arg_functor
-            )
-    : task_base_type( arg_dealloc
-                    , & apply_parallel
-                    , arg_sizeof_derived
-                    , arg_dependence_capacity
-                    , arg_functor )
-    , m_policy( arg_policy )
-    {}
-
-  // Declared but intentionally unusable (pre-C++11 stand-in for "= delete").
-  TaskReduce() /* = delete */ ;
-  TaskReduce( const TaskReduce & ) /* = delete */ ;
-  TaskReduce & operator = ( const TaskReduce & ) /* = delete */ ;
-};
-
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-/** \brief  Task policy for the Qthread execution space.
- *
- *  Creates tasks (single-thread, foreach, reduce), spawns them, and manages
- *  dependences between them.  The task operations are compiled only when
- *  KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST is defined; otherwise they do
- *  nothing or return default-constructed values.
- */
-template<>
-class TaskPolicy< Kokkos::Qthread >
-{
-public:
-
-  typedef Kokkos::Qthread execution_space ;
-
-private:
-
-  typedef Impl::TaskMember< execution_space , void , void > task_root_type ;
-
-  // Declared but intentionally unusable (pre-C++11 stand-in for "= delete").
-  TaskPolicy & operator = ( const TaskPolicy & ) /* = delete */ ;
-
-  /** \brief  Root task of an executing functor (const): functor -> typed task -> root. */
-  template< class FunctorType >
-  static inline
-  const task_root_type * get_task_root( const FunctorType * f )
-    {
-      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
-      const task_type * const typed_task = static_cast< const task_type * >(f);
-      return static_cast< const task_root_type * >( typed_task );
-    }
-
-  /** \brief  Root task of an executing functor: functor -> typed task -> root. */
-  template< class FunctorType >
-  static inline
-  task_root_type * get_task_root( FunctorType * f )
-    {
-      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
-      task_type * const typed_task = static_cast< task_type * >(f);
-      return static_cast< task_root_type * >( typed_task );
-    }
-
-  const unsigned m_default_dependence_capacity ;
-
-  /** \brief  Replace the ~0u "unspecified" marker with the policy default. */
-  inline
-  unsigned capacity_or_default( const unsigned requested ) const
-    { return ~0u == requested ? m_default_dependence_capacity : requested ; }
-
-public:
-
-  /** \brief  Policy with a default dependence capacity of 4. */
-  KOKKOS_INLINE_FUNCTION
-  TaskPolicy()
-    : m_default_dependence_capacity(4)
-    {}
-
-  KOKKOS_INLINE_FUNCTION
-  TaskPolicy( const TaskPolicy & rhs )
-    : m_default_dependence_capacity( rhs.m_default_dependence_capacity )
-    {}
-
-  KOKKOS_INLINE_FUNCTION
-  explicit
-  TaskPolicy( const unsigned arg_default_dependence_capacity )
-    : m_default_dependence_capacity( arg_default_dependence_capacity )
-    {}
-
-  /** \brief  Only the capacity argument is used; the policy argument is ignored. */
-  KOKKOS_INLINE_FUNCTION
-  TaskPolicy( const TaskPolicy &
-            , const unsigned arg_default_dependence_capacity )
-    : m_default_dependence_capacity( arg_default_dependence_capacity )
-    {}
-
-  //----------------------------------------
-
-  /** \brief  Schedule the future's task (host only) and return the same future. */
-  template< class ValueType >
-  const Future< ValueType , execution_space > &
-    spawn( const Future< ValueType , execution_space > & f ) const
-      {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-        f.m_task->schedule();
-#endif
-        return f ;
-      }
-
-  /** \brief  Create a single-thread task.
-   *
-   *  \param functor              Functor copied into the task.
-   *  \param dependence_capacity  Dependence capacity; ~0u selects the policy default.
-   */
-  template< class FunctorType >
-  Future< typename FunctorType::value_type , execution_space >
-  create( const FunctorType & functor
-        , const unsigned dependence_capacity = ~0u ) const
-    {
-      typedef typename FunctorType::value_type value_type ;
-      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
-      typedef Future< value_type , execution_space > future_type ;
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      return future_type(
-        task_root_type::create< task_type >( functor , capacity_or_default( dependence_capacity ) ) );
-#else
-      return future_type();
-#endif
-    }
-
-  /** \brief  Create a parallel foreach task.
-   *
-   *  \param policy               Range policy copied into the task.
-   *  \param functor              Functor copied into the task.
-   *  \param dependence_capacity  Dependence capacity; ~0u selects the policy default.
-   */
-  template< class PolicyType , class FunctorType >
-  Future< typename FunctorType::value_type , execution_space >
-  create_foreach( const PolicyType  & policy
-                , const FunctorType & functor
-                , const unsigned      dependence_capacity = ~0u ) const
-    {
-      typedef typename FunctorType::value_type value_type ;
-      typedef Impl::TaskForEach< PolicyType , value_type , FunctorType > task_type ;
-      typedef Future< value_type , execution_space > future_type ;
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      return future_type(
-        task_root_type::create< task_type >( policy , functor , capacity_or_default( dependence_capacity ) ) );
-#else
-      return future_type();
-#endif
-    }
-
-  /** \brief  Create a parallel reduce task.
-   *
-   *  \param policy               Range policy copied into the task.
-   *  \param functor              Functor copied into the task.
-   *  \param dependence_capacity  Dependence capacity; ~0u selects the policy default.
-   */
-  template< class PolicyType , class FunctorType >
-  Future< typename FunctorType::value_type , execution_space >
-  create_reduce( const PolicyType  & policy
-               , const FunctorType & functor
-               , const unsigned      dependence_capacity = ~0u ) const
-    {
-      typedef typename FunctorType::value_type value_type ;
-      typedef Impl::TaskReduce< PolicyType , value_type , FunctorType > task_type ;
-      typedef Future< value_type , execution_space > future_type ;
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      return future_type(
-        task_root_type::create< task_type >( policy , functor , capacity_or_default( dependence_capacity ) ) );
-#else
-      return future_type();
-#endif
-    }
-
-  /** \brief  Make the task of 'after' depend on the task of 'before' (host only).
-   *
-   *  Enabled only when both futures belong to this execution space.
-   */
-  template< class A1 , class A2 , class A3 , class A4 >
-  void add_dependence( const Future<A1,A2> & after
-                     , const Future<A3,A4> & before
-                     , typename Impl::enable_if
-                        < Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
-                          &&
-                          Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
-                        >::type * = 0
-                      )
-    {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      after.m_task->add_dependence( before.m_task );
-#endif
-    }
-
-  //----------------------------------------
-  // Functions for an executing task functor to query dependences,
-  // set new dependences, and respawn itself.
-
-  /** \brief  Future for the i-th dependence of the functor's task. */
-  template< class FunctorType >
-  Future< void , execution_space >
-  get_dependence( const FunctorType * task_functor , int i ) const
-    {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      return Future<void,execution_space>( get_task_root(task_functor)->get_dependence(i) );
-#else
-      return Future<void,execution_space>();
-#endif
-    }
-
-  /** \brief  Number of dependences of the functor's task (0 when not on the host). */
-  template< class FunctorType >
-  int get_dependence( const FunctorType * task_functor ) const
-    {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      return get_task_root(task_functor)->get_dependence();
-#else
-      return 0 ;
-#endif
-    }
-
-  /** \brief  Clear all dependences of the functor's task (host only). */
-  template< class FunctorType >
-  void clear_dependence( FunctorType * task_functor ) const
-    {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      get_task_root(task_functor)->clear_dependence();
-#endif
-    }
-
-  /** \brief  Make the functor's task depend on the task of 'before' (host only).
-   *
-   *  Enabled only when the future belongs to this execution space.
-   */
-  template< class FunctorType , class A3 , class A4 >
-  void add_dependence( FunctorType * task_functor
-                     , const Future<A3,A4> & before
-                     , typename Impl::enable_if
-                        < Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
-                        >::type * = 0
-                      )
-    {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      get_task_root(task_functor)->add_dependence( before.m_task );
-#endif
-    }
-
-  /** \brief  Schedule the functor's task again (host only). */
-  template< class FunctorType >
-  void respawn( FunctorType * task_functor ) const
-    {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      get_task_root(task_functor)->schedule();
-#endif
-    }
-
-};
-
-/** \brief  Wait on a task policy; declared here, not defined in this header. */
-inline
-void wait( TaskPolicy< Kokkos::Qthread > & );
-
-/** \brief  Wait for the task of the given future by forwarding to the root task's wait. */
-inline
-void wait( const Future< void , Kokkos::Qthread > & future )
-{
-  typedef Impl::TaskMember< Kokkos::Qthread , void , void > task_root_type ;
-
-  task_root_type::wait( future );
-}
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_QTHREAD_TASKPOLICY_HPP */
-
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@ -1,745 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core_fwd.hpp>
-
-#if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD )
-
-#include <stdint.h>
-#include <limits>
-#include <utility>
-#include <iostream>
-#include <sstream>
-#include <Kokkos_Threads.hpp>
-#include <Kokkos_hwloc.hpp>
-#include <Kokkos_Atomic.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-// File-local state of the Threads back end.
-
-// ThreadsExec instance standing in for the process (master) thread; every
-// other ThreadsExec is treated as a spawned worker by the constructor.
-ThreadsExec                  s_threads_process ;
-// Pool entries; workers claim a slot by compare-exchange in their constructor.
-ThreadsExec                * s_threads_exec[  ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
-// pthread ids recorded by each thread, searched by Threads::thread_pool_rank().
-pthread_t                    s_threads_pid[   ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
-// hwloc coordinates used for thread binding and reported by print_configuration().
-std::pair<unsigned,unsigned> s_threads_coord[ ThreadsExec::MAX_THREAD_COUNT ];
-
-// [0] total threads, [1] threads per NUMA region, [2] threads per core
-// (labels as printed by print_configuration()).
-int s_thread_pool_size[3] = { 0 , 0 , 0 };
-
-// Only read in this file (Sentinel, print_configuration); never assigned here.
-unsigned s_current_reduce_size = 0 ;
-unsigned s_current_shared_size = 0 ;
-
-// Work function and argument that driver() and start() hand to the threads.
-// Volatile because they are polled/written across threads without locking.
-void (* volatile s_current_function)( ThreadsExec & , const void * );
-const void * volatile s_current_function_arg = 0 ;
-
-// Static-lifetime guard: registers the in_parallel query with HostSpace on
-// construction and complains at destruction if any pool state is still set.
-struct Sentinel {
-  Sentinel()
-  { HostSpace::register_in_parallel( ThreadsExec::in_parallel ); }
-
-  ~Sentinel()
-  {
-    // Any non-zero entry means the pool was never torn down.
-    const bool leftover_state =
-      s_thread_pool_size[0] || s_thread_pool_size[1] || s_thread_pool_size[2] ||
-      s_current_reduce_size || s_current_shared_size ||
-      s_current_function    || s_current_function_arg ||
-      s_threads_exec[0] ;
-
-    if ( ! leftover_state ) { return ; }
-
-    std::cerr << "ERROR : Process exiting without calling Kokkos::Threads::terminate()" << std::endl ;
-  }
-};
-
-// Number of fan-in partners for 'rank' in a pool of 'size' threads.
-// Works on the reversed rank: counts successive power-of-two offsets that
-// stay inside the pool, stopping at the first bit set in the reversed rank.
-inline
-unsigned fan_size( const unsigned rank , const unsigned size )
-{
-  const unsigned reversed = size - ( rank + 1 );
-  unsigned levels = 0 ;
-  unsigned bit    = 1 ;
-
-  while ( ( reversed + bit < size ) && 0 == ( reversed & bit ) ) {
-    ++levels ;
-    bit <<= 1 ;
-  }
-
-  return levels ;
-}
-
-} // namespace
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Work function that does nothing; initialize() installs it while spawning
-// so that new threads run one empty iteration of driver().
-void execute_function_noop( ThreadsExec & , const void * ) {}
-
-/** \brief  Main loop of a spawned worker thread.
- *
- *  Constructs a stack-local ThreadsExec, whose constructor tries to claim a
- *  pool entry and sets m_pool_state to Active on success.  While Active it
- *  runs the current work function, marks itself Inactive and yields until
- *  some other thread changes the state.  Any state other than Active after
- *  the wake-up (e.g. Terminating, as set by finalize()) ends the loop and
- *  destroys the local ThreadsExec.
- */
-void ThreadsExec::driver(void)
-{
-  ThreadsExec this_thread ;
-
-  while ( ThreadsExec::Active == this_thread.m_pool_state ) {
-
-    (*s_current_function)( this_thread , s_current_function_arg );
-
-    // Deactivate thread and wait for reactivation
-    this_thread.m_pool_state = ThreadsExec::Inactive ;
-
-    // Yields until m_pool_state is no longer Inactive.
-    wait_yield( this_thread.m_pool_state , ThreadsExec::Inactive );
-  }
-}
-
-/** \brief  Construct either the process instance or a spawned worker.
- *
- *  All members start zeroed with state Terminating.  For the process
- *  instance (this == &s_threads_process) the object is set up as a solo pool
- *  of size one.  For any other instance the constructor claims a slot in
- *  's_threads_exec' and reports success or failure to the spawning thread
- *  through s_threads_process.m_pool_state.
- */
-ThreadsExec::ThreadsExec()
-  : m_pool_base(0)
-  , m_scratch(0)
-  , m_scratch_reduce_end(0)
-  , m_scratch_thread_end(0)
-  , m_pool_rank(0)
-  , m_pool_size(0)
-  , m_pool_fan_size(0)
-  , m_pool_state( ThreadsExec::Terminating )
-{
-  if ( & s_threads_process != this ) {
-
-    // A spawned thread
-
-    ThreadsExec * const nil = 0 ;
-
-    // Which entry in 's_threads_exec', possibly determined from hwloc binding
-    // The spawner passes the entry index through s_current_function_arg; a
-    // value that is not a valid index means "bind via hwloc and use its answer".
-    const int entry = ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0])
-                    ? ((size_t)s_current_function_arg)
-                    : size_t(Kokkos::hwloc::bind_this_thread( s_thread_pool_size[0] , s_threads_coord ));
-
-    // Given a good entry set this thread in the 's_threads_exec' array
-    // The compare-exchange only succeeds if the slot is still empty.
-    if ( entry < s_thread_pool_size[0] &&
-         nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) {
-
-      m_pool_base     = s_threads_exec ;
-      // Rank is the reverse of the entry index.
-      m_pool_rank     = s_thread_pool_size[0] - ( entry + 1 );
-      m_pool_size     = s_thread_pool_size[0] ;
-      m_pool_fan_size = fan_size( m_pool_rank , m_pool_size );
-      m_pool_state    = ThreadsExec::Active ;
-
-      s_threads_pid[ m_pool_rank ] = pthread_self();
-
-      // Inform spawning process that the threads_exec entry has been set.
-      s_threads_process.m_pool_state = ThreadsExec::Active ;
-    }
-    else {
-      // Inform spawning process that the threads_exec entry could not be set.
-      s_threads_process.m_pool_state = ThreadsExec::Terminating ;
-    }
-  }
-  else {
-    // Enables 'parallel_for' to execute on uninitialized Threads device
-    m_pool_rank  = 0 ;
-    m_pool_size  = 1 ;
-    m_pool_state = ThreadsExec::Inactive ;
-
-    s_threads_pid[ m_pool_rank ] = pthread_self();
-  }
-}
-
-/** \brief  Reset the instance and, for a worker, release its pool slot.
- *
- *  The slot index is recomputed from size and rank before they are zeroed.
- *  A worker clears its entry in 's_threads_exec' (only if it still points at
- *  this object) and then sets the process instance's state to Terminating,
- *  which is what finalize() waits for.
- */
-ThreadsExec::~ThreadsExec()
-{
-  // Inverse of the rank computation in the constructor.
-  const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
-
-  m_pool_base   = 0 ;
-  m_scratch     = 0 ;
-  m_scratch_reduce_end = 0 ;
-  m_scratch_thread_end = 0 ;
-  m_pool_rank     = 0 ;
-  m_pool_size     = 0 ;
-  m_pool_fan_size = 0 ;
-
-  m_pool_state  = ThreadsExec::Terminating ;
-
-  // 'entry' wraps to a huge value when size and rank are both zero (a worker
-  // that never claimed a slot); the bound check skips the release then.
-  if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) {
-    ThreadsExec * const nil = 0 ;
-
-    atomic_compare_exchange( s_threads_exec + entry , this , nil );
-
-    s_threads_process.m_pool_state = ThreadsExec::Terminating ;
-  }
-}
-
-
-// Total number of threads in the pool (zero when not initialized).
-int ThreadsExec::get_thread_count()
-{
-  const int total_threads = s_thread_pool_size[0] ;
-  return total_threads ;
-}
-
-/** \brief  Look up the pool member with the given rank.
- *
- *  Ranks are stored in reverse entry order, so rank r lives at entry
- *  pool_size - (r + 1).
- *
- *  \param init_thread_rank  Rank to look up; valid values are [0, pool size).
- *  \return  The matching ThreadsExec.
- *  \throw   Via throw_runtime_exception when the rank is out of range, the
- *           slot is empty, or the stored thread reports a different rank.
- */
-ThreadsExec * ThreadsExec::get_thread( const int init_thread_rank )
-{
-  const int pool_size = s_thread_pool_size[0] ;
-
-  // Reject negative ranks as well: they would index past the reversed pool.
-  const bool rank_in_range = 0 <= init_thread_rank && init_thread_rank < pool_size ;
-
-  ThreadsExec * const th =
-    rank_in_range ? s_threads_exec[ pool_size - ( init_thread_rank + 1 ) ] : 0 ;
-
-  if ( 0 == th || th->m_pool_rank != init_thread_rank ) {
-    std::ostringstream msg ;
-    msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : "
-        << "thread " << init_thread_rank << " of " << s_thread_pool_size[0] ;
-    if ( 0 == th ) {
-      msg << " does not exist" ;
-    }
-    else {
-      msg << " has wrong thread_rank " << th->m_pool_rank ;
-    }
-    Kokkos::Impl::throw_runtime_exception( msg.str() );
-  }
-
-  return th ;
-}
-
-//----------------------------------------------------------------------------
-
-// Work function: record the calling thread's hwloc coordinate in the
-// 's_threads_coord' slot indexed by its pool rank.
-void ThreadsExec::execute_get_binding( ThreadsExec & exec , const void * )
-{
-  std::pair<unsigned,unsigned> & slot = s_threads_coord[ exec.m_pool_rank ];
-
-  slot = Kokkos::hwloc::get_this_thread_coordinate();
-}
-
-/** \brief  Work function that parks a thread on the global lock.
- *
- *  sleep() holds the global lock while this runs, so the lock/unlock pair
- *  blocks until wake() releases it.  Afterwards the thread waits for each of
- *  its fan-in partners to leave the Active state before marking itself
- *  Inactive.
- */
-void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
-{
-  // Block here for as long as the master holds the lock.
-  ThreadsExec::global_lock();
-  ThreadsExec::global_unlock();
-
-  const int n = exec.m_pool_fan_size ;
-  const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
-
-  // Fan-in: partners sit at power-of-two offsets from the reversed rank.
-  for ( int i = 0 ; i < n ; ++i ) {
-    Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
-  }
-
-  exec.m_pool_state = ThreadsExec::Inactive ;
-}
-
-}
-}
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Guard for operations restricted to the master process.
- *
- *  \param name         Caller's name, used as the prefix of the error text.
- *  \param initialized  When true, additionally require a non-empty pool.
- *  \throw  Via throw_runtime_exception if called from a worker thread, or
- *          if 'initialized' is set and the pool size is zero.
- */
-void ThreadsExec::verify_is_process( const std::string & name , const bool initialized )
-{
-  const bool called_by_worker = ! is_process();
-
-  if ( called_by_worker ) {
-    Kokkos::Impl::throw_runtime_exception(
-      name + " FAILED : Called by a worker thread, can only be called by the master process." );
-  }
-
-  const bool pool_missing = initialized && 0 == s_thread_pool_size[0] ;
-
-  if ( pool_missing ) {
-    Kokkos::Impl::throw_runtime_exception( name + " FAILED : Threads not initialized." );
-  }
-}
-
-// Non-zero while a thread function is executing in parallel.
-int ThreadsExec::in_parallel()
-{
-  // No work function installed: nothing is in execution.
-  if ( ! s_current_function ) { return 0 ; }
-
-  // The special threads-process argument (set by execute_serial) marks a
-  // dispatch that is not counted as parallel.
-  if ( & s_threads_process == s_current_function_arg ) { return 0 ; }
-
-  // Parallel if the master process is a pool worker, or the caller is not
-  // the master process.
-  return ( s_threads_process.m_pool_base || ! is_process() ) ? 1 : 0 ;
-}
-
-// Wait for root thread to become inactive
-// and then clear the current work function and its argument.  With an empty
-// pool only the clearing happens.
-void ThreadsExec::fence()
-{
-  if ( s_thread_pool_size[0] ) {
-    // Wait for the root thread to complete:
-    // (entry 0 holds the highest rank, see the constructor's rank reversal)
-    Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
-  }
-
-  s_current_function     = 0 ;
-  s_current_function_arg = 0 ;
-}
-
-/** \brief  Begin execution of the asynchronous functor
- *
- *  Must be called by the master process on an initialized pool.  Installs
- *  'func'/'arg' as the current work, activates every pool entry and, when
- *  the master is itself a pool member, runs the function on the master too.
- *
- *  \throw  Via throw_runtime_exception when a previous dispatch has not been
- *          cleared (by fence()) yet.
- */
-void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const void * arg )
-{
-  verify_is_process("ThreadsExec::start" , true );
-
-  if ( s_current_function || s_current_function_arg ) {
-    Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) );
-  }
-
-  // Publish the work before waking anyone.
-  s_current_function     = func ;
-  s_current_function_arg = arg ;
-
-  // Activate threads:
-  // (from the last entry down to entry 0)
-  for ( int i = s_thread_pool_size[0] ; 0 < i-- ; ) {
-    s_threads_exec[i]->m_pool_state = ThreadsExec::Active ;
-  }
-
-  if ( s_threads_process.m_pool_size ) {
-    // Master process is the root thread, run it:
-    (*func)( s_threads_process , arg );
-    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
-  }
-}
-
-//----------------------------------------------------------------------------
-
-/** \brief  Park all pool threads on the global lock.
- *
- *  \return false if the pool is already sleeping, true otherwise.
- *
- *  The global lock is taken here and stays held until wake() releases it;
- *  the activated threads block inside execute_sleep() in the meantime.
- */
-bool ThreadsExec::sleep()
-{
-  verify_is_process("ThreadsExec::sleep", true );
-
-  if ( & execute_sleep == s_current_function ) return false ;
-
-  // Finish outstanding work first; this also clears the current function.
-  fence();
-
-  ThreadsExec::global_lock();
-
-  s_current_function = & execute_sleep ;
-
-  // Activate threads:
-  for ( unsigned i = s_thread_pool_size[0] ; 0 < i ; ) {
-    s_threads_exec[--i]->m_pool_state = ThreadsExec::Active ;
-  }
-
-  return true ;
-}
-
-/** \brief  Release threads parked by sleep().
- *
- *  \return false if the pool is not sleeping, true otherwise.
- *
- *  Releases the global lock, lets the master take part in the sleep
- *  function's fan-in when it is a pool member, then fences.
- */
-bool ThreadsExec::wake()
-{
-  verify_is_process("ThreadsExec::wake", true );
-
-  if ( & execute_sleep != s_current_function ) return false ;
-
-  ThreadsExec::global_unlock();
-
-  if ( s_threads_process.m_pool_base ) {
-    // Master is a pool member: run its share of execute_sleep.
-    execute_sleep( s_threads_process , 0 );
-    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
-  }
-
-  fence();
-
-  return true ;
-}
-
-//----------------------------------------------------------------------------
-
-/** \brief  Run 'func' on every pool member, one thread at a time.
- *
- *  The argument is set to the address of the process instance, which is the
- *  marker in_parallel() uses to treat this dispatch as not parallel.  Each
- *  worker is activated and waited for before the next one starts; the master
- *  runs last when it is a pool member (it occupies entry 0 in that case).
- */
-void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
-{
-  s_current_function = func ;
-  s_current_function_arg = & s_threads_process ;
-
-  // Skip entry 0 in the loop when it is the master itself.
-  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
-
-  for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
-    ThreadsExec & th = * s_threads_exec[ --i ];
-
-    th.m_pool_state = ThreadsExec::Active ;
-
-    // Yield until this worker has left the Active state.
-    wait_yield( th.m_pool_state , ThreadsExec::Active );
-  }
-
-  if ( s_threads_process.m_pool_base ) {
-    s_threads_process.m_pool_state = ThreadsExec::Active ;
-    (*func)( s_threads_process , 0 );
-    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
-  }
-
-  s_current_function_arg = 0 ;
-  s_current_function = 0 ;
-}
-
-//----------------------------------------------------------------------------
-
-// Reduction scratch pointer of the process instance, as returned by its
-// reduce_memory().
-void * ThreadsExec::root_reduce_scratch()
-{
-  ThreadsExec & process = s_threads_process ;
-  return process.reduce_memory();
-}
-
-/** \brief  Work function: reallocate one thread's scratch buffer.
- *
- *  Releases the old buffer, copies the extents configured on the process
- *  instance and, if the new extent is non-zero, allocates a buffer and
- *  zero-fills it from the calling thread.
- */
-void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
-{
-  // Release whatever this thread held before.
-  if ( exec.m_scratch ) {
-    HostSpace::decrement( exec.m_scratch );
-    exec.m_scratch = 0 ;
-  }
-
-  // Adopt the extents the process instance was configured with.
-  exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
-  exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
-
-  if ( ! s_threads_process.m_scratch_thread_end ) { return ; }
-
-  exec.m_scratch =
-    HostSpace::allocate( "thread_scratch" , s_threads_process.m_scratch_thread_end );
-
-  // touch on this thread
-  unsigned * const first = (unsigned *)( exec.m_scratch );
-  unsigned * const last  = first + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
-
-  for ( unsigned * word = first ; word < last ; ++word ) { *word = 0 ; }
-}
-
-/** \brief  Ensure per-thread scratch of at least the requested sizes.
- *
- *  Both sizes are rounded up to MEMORY_ALIGNMENT.  Buffers are reallocated
- *  only when one of the sizes grows, or when both are zero and something is
- *  currently allocated (full release); a smaller non-zero request keeps the
- *  existing buffers.
- *
- *  \return  The process instance's scratch pointer.
- */
-void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
-{
-  enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
-
-  fence();
-
-  const size_t prev_reduce = s_threads_process.m_scratch_reduce_end ;
-  const size_t prev_thread = s_threads_process.m_scratch_thread_end - s_threads_process.m_scratch_reduce_end ;
-
-  reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
-  thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
-
-  // Increase size or deallocate completely.
-  const bool must_grow    = ( prev_reduce < reduce_size ) || ( prev_thread < thread_size );
-  const bool must_release = ( reduce_size == 0 && thread_size == 0 ) &&
-                            ( prev_reduce != 0 || prev_thread != 0 );
-
-  if ( ! ( must_grow || must_release ) ) {
-    return s_threads_process.m_scratch ;
-  }
-
-  verify_is_process( "ThreadsExec::resize_scratch" , true );
-
-  s_threads_process.m_scratch_reduce_end = reduce_size ;
-  s_threads_process.m_scratch_thread_end = reduce_size + thread_size ;
-
-  // Every pool member reallocates its own buffer, one at a time.
-  execute_serial( & execute_resize_scratch );
-
-  // The process instance shares the buffer of pool entry 0.
-  s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
-
-  return s_threads_process.m_scratch ;
-}
-
-//----------------------------------------------------------------------------
-
-/** \brief  Write a description of the thread pool to 's'.
- *
- *  \param s       Output stream.
- *  \param detail  When true and the pool is initialized, also print one line
- *                 per pool entry with its hwloc coordinate, rank, fan-in
- *                 partners and whether it is the process thread.
- *
- *  Must be called by the master process; fences before reading pool state.
- */
-void ThreadsExec::print_configuration( std::ostream & s , const bool detail )
-{
-  verify_is_process("ThreadsExec::print_configuration",false);
-
-  fence();
-
-  const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
-  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
-  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
-
-  // Forestall compiler warnings for unused variables.
-  (void) numa_count;
-  (void) cores_per_numa;
-  (void) threads_per_core;
-
-  s << "Kokkos::Threads" ;
-
-#if defined( KOKKOS_HAVE_PTHREAD )
-  s << " KOKKOS_HAVE_PTHREAD" ;
-#endif
-#if defined( KOKKOS_HAVE_HWLOC )
-  s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ;
-#endif
-
-  if ( s_thread_pool_size[0] ) {
-    s << " threads[" << s_thread_pool_size[0] << "]"
-      << " threads_per_numa[" << s_thread_pool_size[1] << "]"
-      << " threads_per_core[" << s_thread_pool_size[2] << "]"
-      ;
-    // A null pool base on the process instance means the master is not a
-    // pool member (see initialize()).
-    if ( 0 == s_threads_process.m_pool_base ) { s << " Asynchronous" ; }
-    s << " ReduceScratch[" << s_current_reduce_size << "]"
-      << " SharedScratch[" << s_current_shared_size << "]" ;
-    s << std::endl ;
-
-    if ( detail ) {
-
-      // Each pool member records its current hwloc coordinate.
-      execute_serial( & execute_get_binding );
-
-      for ( int i = 0 ; i < s_thread_pool_size[0] ; ++i ) {
-        ThreadsExec * const th = s_threads_exec[i] ;
-        s << "  Thread hwloc("
-          << s_threads_coord[i].first << "."
-          << s_threads_coord[i].second << ")" ;
-
-        // Reset the coordinate after printing it.
-        s_threads_coord[i].first  = ~0u ;
-        s_threads_coord[i].second = ~0u ;
-
-        if ( th ) {
-          const int rank_rev = th->m_pool_size - ( th->m_pool_rank + 1 );
-
-          s << " rank(" << th->m_pool_rank << ")" ;
-
-          if ( th->m_pool_fan_size ) {
-            s << " Fan{" ;
-            // Fan-in partners sit at power-of-two offsets from the reversed rank.
-            for ( int j = 0 ; j < th->m_pool_fan_size ; ++j ) {
-              s << " " << th->m_pool_base[rank_rev+(1<<j)]->m_pool_rank ;
-            }
-            s << " }" ;
-          }
-
-          if ( th == & s_threads_process ) {
-            s << " is_process" ;
-          }
-        }
-        s << std::endl ;
-      }
-    }
-  }
-  else {
-    s << " not initialized" << std::endl ;
-  }
-}
-
-//----------------------------------------------------------------------------
-
-// Non-zero once pool entry 0 has been populated.
-int ThreadsExec::is_initialized()
-{
-  return s_threads_exec[0] ? 1 : 0 ;
-}
-
-/** \brief  Spawn and bind the thread pool.
- *
- *  \param thread_count        Requested number of threads.
- *  \param use_numa_count      Requested number of NUMA regions.
- *  \param use_cores_per_numa  Requested cores per NUMA region.
- *  \param allow_asynchronous_threadpool
- *                             Passed to hwloc::thread_mapping, which decides
- *                             whether the master joins the pool.
- *
- *  \throw  Via throw_runtime_exception if the pool is already initialized or
- *          if any thread failed to spawn.  An "already initialized" failure
- *          leaves the existing pool untouched.
- */
-void ThreadsExec::initialize( unsigned thread_count ,
-                              unsigned use_numa_count ,
-                              unsigned use_cores_per_numa ,
-                              bool allow_asynchronous_threadpool )
-{
-  static const Sentinel sentinel ;
-
-  const bool is_initialized = 0 != s_thread_pool_size[0] ;
-
-  unsigned thread_spawn_failed = 0 ;
-
-  if ( ! is_initialized ) {
-
-    // Start from an empty pool.  This reset must stay inside the guard:
-    // clearing the entries of a live pool would orphan its threads before
-    // the "already initialized" error is even reported.
-    for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++)
-      s_threads_exec[i] = NULL;
-
-    // If thread_count, use_numa_count, or use_cores_per_numa are zero
-    // then they will be given default values based upon hwloc detection
-    // and allowed asynchronous execution.
-
-    const bool hwloc_avail = hwloc::available();
-
-    const unsigned thread_spawn_begin =
-      hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
-                             allow_asynchronous_threadpool ,
-                             thread_count ,
-                             use_numa_count ,
-                             use_cores_per_numa ,
-                             s_threads_coord );
-
-    const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;
-
-    if ( thread_spawn_begin ) {
-      // Synchronous with s_threads_coord[0] as the process core
-      // Claim entry #0 for binding the process core.
-      s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
-    }
-
-    // NOTE(review): the divisions assume thread_mapping() has replaced zero
-    // arguments with non-zero defaults -- confirm against hwloc::thread_mapping.
-    s_thread_pool_size[0]    = thread_count ;
-    s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
-    s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;
-    s_current_function = & execute_function_noop ; // Initialization work function
-
-    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
-
-      s_threads_process.m_pool_state = ThreadsExec::Inactive ;
-
-      // If hwloc available then spawned thread will
-      // choose its own entry in 's_threads_coord'
-      // otherwise specify the entry.
-      s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_avail ? ~0u : ith );
-
-      // Spawn thread executing the 'driver()' function.
-      // Wait until spawned thread has attempted to initialize.
-      // If spawning and initialization is successful then
-      // an entry in 's_threads_exec' will be assigned.
-      if ( ThreadsExec::spawn() ) {
-        wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
-      }
-      if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
-    }
-
-    // Wait for all spawned threads to deactivate before zeroing the function.
-
-    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
-      // Try to protect against cache coherency failure by casting to volatile.
-      ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
-      if ( th ) {
-        wait_yield( th->m_pool_state , ThreadsExec::Active );
-      }
-      else {
-        ++thread_spawn_failed ;
-      }
-    }
-
-    s_current_function     = 0 ;
-    s_current_function_arg = 0 ;
-    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
-
-    if ( ! thread_spawn_failed ) {
-      // Bind process to the core on which it was located before spawning occurred
-      Kokkos::hwloc::bind_this_thread( proc_coord );
-
-      if ( thread_spawn_begin ) { // Include process in pool.
-        s_threads_exec[0]                 = & s_threads_process ;
-        s_threads_process.m_pool_base     = s_threads_exec ;
-        s_threads_process.m_pool_rank     = thread_count - 1 ; // Reversed for scan-compatible reductions
-        s_threads_process.m_pool_size     = thread_count ;
-        s_threads_process.m_pool_fan_size = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
-        s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
-      }
-      else {
-        // Asynchronous pool: the master is not a member.
-        s_threads_process.m_pool_base = 0 ;
-        s_threads_process.m_pool_rank = 0 ;
-        s_threads_process.m_pool_size = 0 ;
-        s_threads_process.m_pool_fan_size = 0 ;
-      }
-
-      // Initial allocations:
-      ThreadsExec::resize_scratch( 1024 , 1024 );
-    }
-    else {
-      s_thread_pool_size[0]    = 0 ;
-      s_thread_pool_size[1] = 0 ;
-      s_thread_pool_size[2] = 0 ;
-    }
-  }
-
-  if ( is_initialized || thread_spawn_failed ) {
-
-    std::ostringstream msg ;
-
-    msg << "Kokkos::Threads::initialize ERROR" ;
-
-    if ( is_initialized ) {
-      msg << " : already initialized" ;
-    }
-    if ( thread_spawn_failed ) {
-      msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
-    }
-
-    Kokkos::Impl::throw_runtime_exception( msg.str() );
-  }
-}
-
-//----------------------------------------------------------------------------
-
-/** \brief  Shut the thread pool down and return to solo execution.
- *
- *  Releases all scratch memory, then terminates the workers one at a time:
- *  each is set to Terminating and the master yields until that worker's
- *  destructor has changed the process instance's state.  Finally the master
- *  is unbound and reset to a pool of size one.
- */
-void ThreadsExec::finalize()
-{
-  verify_is_process("ThreadsExec::finalize",false);
-
-  fence();
-
-  resize_scratch(0,0);
-
-  // Entry 0 is the master itself when it is a pool member; skip it here.
-  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
-
-  for ( unsigned i = s_thread_pool_size[0] ; begin < i-- ; ) {
-
-    if ( s_threads_exec[i] ) {
-
-      s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating ;
-
-      // The worker's destructor sets the process state to Terminating.
-      wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
-
-      // Re-arm the handshake for the next worker.
-      s_threads_process.m_pool_state = ThreadsExec::Inactive ;
-    }
-
-    s_threads_pid[i] = 0 ;
-  }
-
-  if ( s_threads_process.m_pool_base ) {
-    // Explicit destructor call resets the members of the static instance;
-    // the members are reassigned below.
-    ( & s_threads_process )->~ThreadsExec();
-    s_threads_exec[0] = 0 ;
-  }
-
-  Kokkos::hwloc::unbind_this_thread();
-
-  s_thread_pool_size[0] = 0 ;
-  s_thread_pool_size[1] = 0 ;
-  s_thread_pool_size[2] = 0 ;
-
-  // Reset master thread to run solo.
-  s_threads_process.m_pool_base     = 0 ;
-  s_threads_process.m_pool_rank     = 0 ;
-  s_threads_process.m_pool_size     = 1 ;
-  s_threads_process.m_pool_fan_size = 0 ;
-  s_threads_process.m_pool_state = ThreadsExec::Inactive ;
-}
-
-//----------------------------------------------------------------------------
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-// Process-wide Threads object; the integer argument is ignored.
-Threads & Threads::instance(int)
-{
-  static Threads the_instance ;
-
-  return the_instance ;
-}
-
-/** \brief  Pool size at the given depth.
- *
- *  \param depth  0 = all threads, 1 = threads per NUMA region,
- *                2 = threads per core.
- *  \return  The stored size, or 0 for a depth outside the table instead of
- *           reading past its end.
- */
-int Threads::thread_pool_size( int depth )
-{
-  enum { DEPTH_COUNT = sizeof(Impl::s_thread_pool_size) / sizeof(Impl::s_thread_pool_size[0]) };
-
-  if ( depth < 0 || int(DEPTH_COUNT) <= depth ) { return 0 ; }
-
-  return Impl::s_thread_pool_size[depth];
-}
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-/** \brief  Pool rank of the calling thread.
- *
- *  Linear search of the recorded thread ids.
- *
- *  \return  Index of the calling thread's id, or the pool size when the
- *           caller is not a pool member.
- */
-int Threads::thread_pool_rank()
-{
-  const pthread_t self = pthread_self();
-  const int pool_size  = Impl::s_thread_pool_size[0] ;
-
-  int rank = 0 ;
-
-  // pthread_t is opaque in POSIX; compare with pthread_equal as is_process() does.
-  while ( ( rank < pool_size ) && ! pthread_equal( self , Impl::s_threads_pid[rank] ) ) { ++rank ; }
-
-  return rank ;
-}
-#endif
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD ) */
-
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
@ -1,254 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core_fwd.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_HAVE_PTHREAD )
-
-/* Standard 'C' Linux libraries */
-
-#include <pthread.h>
-#include <sched.h>
-#include <errno.h>
-
-/* Standard C++ libraries */
-
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <stdexcept>
-
-#include <Kokkos_Threads.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-// Process-wide mutex behind ThreadsExec::global_lock() / global_unlock().
-pthread_mutex_t host_internal_pthread_mutex = PTHREAD_MUTEX_INITIALIZER ;
-
-// Pthreads compatible driver.
-// Recovery from an exception would require constant intra-thread health
-// verification; which would negatively impact runtime.  As such simply
-// abort the process.
-
-// pthread start routine: runs the worker loop and aborts the process if any
-// exception reaches the top of the thread.  The argument is unused.
-void * internal_pthread_driver( void * )
-{
-  try {
-    ThreadsExec::driver();
-  }
-  catch( const std::exception & failure ) {
-    // Standard exceptions carry a message worth reporting.
-    std::cerr << "Exception thrown from worker thread: " << failure.what() << std::endl ;
-    std::cerr.flush();
-    std::abort();
-  }
-  catch( ... ) {
-    // Anything else: report without detail.
-    std::cerr << "Exception thrown from worker thread" << std::endl ;
-    std::cerr.flush();
-    std::abort();
-  }
-
-  return NULL ;
-}
-
-} // namespace
-
-//----------------------------------------------------------------------------
-// Spawn a thread
-
-/** \brief  Spawn one detached, system-scope worker thread running driver().
- *
- *  \return true if the thread was created, false if any pthread call failed.
- *
- *  Every attribute call has to succeed before the thread is created.  The
- *  attribute object is destroyed only after a successful init.
- */
-bool ThreadsExec::spawn()
-{
-  pthread_attr_t attr ;
-
-  if ( 0 != pthread_attr_init( & attr ) ) { return false ; }
-
-  bool result = false ;
-
-  // '&&' rather than '||': with '||' a successful init short-circuited the
-  // chain, so scope and detach state were never applied.
-  if ( 0 == pthread_attr_setscope(       & attr, PTHREAD_SCOPE_SYSTEM ) &&
-       0 == pthread_attr_setdetachstate( & attr, PTHREAD_CREATE_DETACHED ) ) {
-
-    pthread_t pt ;
-
-    result = 0 == pthread_create( & pt, & attr, internal_pthread_driver, 0 );
-  }
-
-  pthread_attr_destroy( & attr );
-
-  return result ;
-}
-
-//----------------------------------------------------------------------------
-
-// True when the caller is the thread that made the first call to this
-// function; that thread's id is captured once in a function-local static.
-bool ThreadsExec::is_process()
-{
-  static const pthread_t master_pid = pthread_self();
-
-  const pthread_t caller = pthread_self();
-
-  return 0 != pthread_equal( master_pid , caller );
-}
-
-// Acquire the process-wide mutex; the return code is not checked.
-void ThreadsExec::global_lock()
-{
-  pthread_mutex_t * const mutex = & host_internal_pthread_mutex ;
-  pthread_mutex_lock( mutex );
-}
-
-// Release the process-wide mutex; the return code is not checked.
-void ThreadsExec::global_unlock()
-{
-  pthread_mutex_t * const mutex = & host_internal_pthread_mutex ;
-  pthread_mutex_unlock( mutex );
-}
-
-//----------------------------------------------------------------------------
-
-// Yield the processor until 'flag' no longer equals 'value'.  The flag is
-// re-read once per iteration through its volatile reference.
-void ThreadsExec::wait_yield( volatile int & flag , const int value )
-{
-  for (;;) {
-    if ( value != flag ) { break ; }
-    sched_yield();
-  }
-}
-
-} // namespace Impl
-} // namespace Kokkos
-
-/* end #if defined( KOKKOS_HAVE_PTHREAD ) */
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#elif defined( KOKKOS_HAVE_WINTHREAD )
-
-/* Windows libraries */
-#include <windows.h>
-#include <process.h>
-
-/* Standard C++ libraries */
-
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <stdexcept>
-
-#include <Kokkos_Threads.hpp>
-
-//----------------------------------------------------------------------------
-// Driver for each created Windows thread
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-// Windows thread start routine: runs the worker loop.  As in the pthread
-// driver, an exception reaching the top of the thread is reported and the
-// process is aborted.  The argument is unused.
-unsigned WINAPI internal_winthread_driver( void * )
-{
-  try {
-    ThreadsExec::driver();
-  }
-  catch( const std::exception & x ) {
-    std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl ;
-    std::cerr.flush();
-    std::abort();
-  }
-  catch( ... ) {
-    std::cerr << "Exception thrown from worker thread" << std::endl ;
-    std::cerr.flush();
-    std::abort();
-  }
-
-  return 0 ;
-}
-
-// Process-wide lock built on a Win32 critical section.  Construction,
-// destruction and copying are private; the only instance is the one handed
-// out by singleton().
-class ThreadLockWindows {
-private:
-  CRITICAL_SECTION  m_handle ;
-
-  ~ThreadLockWindows()
-  { DeleteCriticalSection( & m_handle ); }
-
-  // No ';' after the declarator: this is the constructor's definition.
-  ThreadLockWindows()
-  { InitializeCriticalSection( & m_handle ); }
-
-  // Not copyable: declared, never defined.
-  ThreadLockWindows( const ThreadLockWindows & );
-  ThreadLockWindows & operator = ( const ThreadLockWindows & );
-
-public:
-
-  static ThreadLockWindows & singleton();
-
-  void lock()
-  { EnterCriticalSection( & m_handle ); }
-
-  void unlock()
-  { LeaveCriticalSection( & m_handle ); }
-};
-
-// The single lock instance, created on first use.
-ThreadLockWindows & ThreadLockWindows::singleton()
-{
-  static ThreadLockWindows the_lock ;
-
-  return the_lock ;
-}
-
-} // namespace <>
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Spawn this thread
-
-/** \brief  Spawn one worker thread running driver().
- *
- *  \return true if the thread was created, false otherwise, matching the
- *          pthread implementation.
- */
-bool ThreadsExec::spawn()
-{
-  unsigned Win32ThreadID = 0 ;
-
-  // _beginthreadex returns 0 on failure, otherwise the new thread's handle
-  // as a uintptr_t.
-  const uintptr_t handle =
-    _beginthreadex(0,0,internal_winthread_driver,0,0, & Win32ThreadID );
-
-  if ( 0 == handle ) { return false ; }
-
-  // The thread is never joined; close the handle so it does not leak.
-  // Closing the handle does not stop the thread.
-  CloseHandle( reinterpret_cast<HANDLE>( handle ) );
-
-  return true ;
-}
-
-// True when the caller is the thread that made the first call to this
-// function, mirroring the pthread implementation.  Returning true
-// unconditionally would let worker threads pass verify_is_process().
-bool ThreadsExec::is_process()
-{
-  static const DWORD master_id = GetCurrentThreadId();
-
-  return master_id == GetCurrentThreadId();
-}
-
-// Acquire the process-wide critical section.
-void ThreadsExec::global_lock()
-{
-  ThreadLockWindows & process_lock = ThreadLockWindows::singleton();
-  process_lock.lock();
-}
-
-// Release the process-wide critical section.
-void ThreadsExec::global_unlock()
-{
-  ThreadLockWindows & process_lock = ThreadLockWindows::singleton();
-  process_lock.unlock();
-}
-
-// Give up the time slice (Sleep(0)) until 'flag' no longer equals 'value'.
-// The stray '{}' after the declarator was removed; it turned the loop below
-// into an ill-formed free-standing block.
-void ThreadsExec::wait_yield( volatile int & flag , const int value )
-{
-  while ( value == flag ) { Sleep(0); }
-}
-
-} // namespace Impl
-} // namespace Kokkos
-
-#endif /* end #elif defined( KOKKOS_HAVE_WINTHREAD ) */
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-
-
--- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
@ -1,427 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_THREADS_PARALLEL_HPP
-#define KOKKOS_THREADS_PARALLEL_HPP
-
-#include <vector>
-
-#include <Kokkos_Parallel.hpp>
-
-#include <impl/Kokkos_StaticAssert.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-/** \brief  ParallelFor specialization for a RangePolicy on Kokkos::Threads.
- *
- *  The constructor copies the functor and policy, hands 'execute' to
- *  ThreadsExec::start and then calls ThreadsExec::fence.
- */
-template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
-class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
-{
-private:
-
-  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
-
-  const FunctorType  m_func ;
-  const Policy       m_policy ;
-
-  // Enabled when the range's work_tag is void: calls functor( i ).
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if<
-                 ( Impl::is_same< typename PType::work_tag , void >::value )
-                 , const FunctorType & >::type functor
-             , const PType & range )
-    {
-      typedef typename PType::member_type index_type ;
-
-      const index_type work_end = range.end();
-
-      for ( index_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
-        functor( iwork );
-      }
-    }
-
-  // Enabled when the range has a non-void work_tag:
-  // calls functor( work_tag() , i ).
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if<
-                 ( ! Impl::is_same< typename PType::work_tag , void >::value )
-                 , const FunctorType & >::type functor
-             , const PType & range )
-    {
-      typedef typename PType::member_type index_type ;
-
-      const index_type work_end = range.end();
-
-      for ( index_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
-        functor( typename PType::work_tag() , iwork );
-      }
-    }
-
-  // Per-thread entry point: 'arg' is the ParallelFor object given to
-  // ThreadsExec::start.  Builds this thread's WorkRange from its pool
-  // rank and the pool size, runs the functor over it, then fans in.
-  static void execute( ThreadsExec & exec , const void * arg )
-  {
-    const ParallelFor & self = * static_cast< const ParallelFor * >( arg );
-
-    const typename Policy::WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
-
-    driver( self.m_func , range );
-
-    exec.fan_in();
-  }
-
-public:
-
-  ParallelFor( const FunctorType & functor
-             , const Policy      & policy )
-    : m_func( functor )
-    , m_policy( policy )
-    {
-      ThreadsExec::start( & ParallelFor::execute , this );
-      ThreadsExec::fence();
-    }
-};
-
-/** \brief  ParallelFor specialization for a TeamPolicy on Kokkos::Threads.
- *
- *  The constructor copies the functor and policy, queries the functor's
- *  team shared-memory size, resizes the ThreadsExec scratch space, hands
- *  'execute' to ThreadsExec::start and then calls ThreadsExec::fence.
- */
-template< class FunctorType , class Arg0 , class Arg1 >
-class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > >
-{
-private:
-
-  typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads >  Policy ;
-  typedef typename Policy::member_type                 Member ;
-  typedef typename Policy::work_tag                    WorkTag ;
-
-  const FunctorType  m_func ;
-  const Policy       m_policy ;
-  const int          m_shared ;
-
-  // Enabled when TagType is void: calls m_func( member ).
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION
-  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
-                 const Member & >::type member ) const
-    {
-      m_func( member );
-    }
-
-  // Enabled when TagType is not void: calls m_func( TagType() , member ).
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION
-  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
-                 const Member & >::type member ) const
-    {
-      m_func( TagType() , member );
-    }
-
-  // Per-thread entry point: 'arg' is the ParallelFor object given to
-  // ThreadsExec::start.  Calls the functor once for every state the team
-  // member passes through while it remains valid, then fans in.
-  static void execute( ThreadsExec & exec , const void * arg )
-  {
-    const ParallelFor & self = * static_cast< const ParallelFor * >( arg );
-
-    Member member( exec , self.m_policy , self.m_shared );
-
-    while ( member.valid() ) {
-      self.ParallelFor::template driver< WorkTag >( member );
-      member.next();
-    }
-
-    exec.fan_in();
-  }
-
-public:
-
-  ParallelFor( const FunctorType & functor
-              , const Policy      & policy )
-    : m_func( functor )
-    , m_policy( policy )
-    , m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
-    {
-      // No reduction scratch is requested (first argument is 0).
-      ThreadsExec::resize_scratch( 0 , Member::team_reduce_size() + m_shared );
-      ThreadsExec::start( & ParallelFor::execute , this );
-      ThreadsExec::fence();
-    }
-};
-
-
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-/** \brief  ParallelReduce specialization for a RangePolicy on Kokkos::Threads.
- *
- *  The constructor resizes the reduction scratch to the functor's value
- *  size, runs 'execute' through ThreadsExec::start / ThreadsExec::fence
- *  and, when the result view has a non-null pointer, copies the reduced
- *  values from the root reduction scratch into it.
- */
-template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
-class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
-{
-private:
-
-  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
-  typedef typename Policy::work_tag                                   work_tag ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , work_tag >  ValueInit ;
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-  const FunctorType  m_func ;
-  const Policy       m_policy ;
-
-  // Enabled when the range's work_tag is void: calls functor( i , update ).
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if<
-                 ( Impl::is_same< typename PType::work_tag , void >::value )
-                 , const FunctorType & >::type functor
-             , reference_type update
-             , const PType & range )
-    {
-      typedef typename PType::member_type index_type ;
-
-      const index_type work_end = range.end();
-
-      for ( index_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
-        functor( iwork , update );
-      }
-    }
-
-  // Enabled when the range has a non-void work_tag:
-  // calls functor( work_tag() , i , update ).
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if<
-                 ( ! Impl::is_same< typename PType::work_tag , void >::value )
-                 , const FunctorType & >::type functor
-             , reference_type update
-             , const PType & range )
-    {
-      typedef typename PType::member_type index_type ;
-
-      const index_type work_end = range.end();
-
-      for ( index_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
-        functor( typename PType::work_tag() , iwork , update );
-      }
-    }
-
-  // Per-thread entry point: 'arg' is the ParallelReduce object given to
-  // ThreadsExec::start.  Initializes this thread's reduction value in
-  // exec.reduce_memory(), accumulates over this thread's WorkRange and
-  // then joins through fan_in_reduce.
-  static void execute( ThreadsExec & exec , const void * arg )
-  {
-    const ParallelReduce & self = * static_cast< const ParallelReduce * >( arg );
-
-    const typename Policy::WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
-
-    reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
-
-    driver( self.m_func , update , range );
-
-    exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func );
-  }
-
-public:
-
-  template< class HostViewType >
-  ParallelReduce( const FunctorType  & functor ,
-                  const Policy       & policy ,
-                  const HostViewType & result_view )
-    : m_func( functor )
-    , m_policy( policy )
-    {
-      ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , 0 );
-
-      ThreadsExec::start( & ParallelReduce::execute , this );
-
-      // The scratch pointer is fetched before the fence, but only
-      // dereferenced after it.
-      const pointer_type reduced = (pointer_type) ThreadsExec::root_reduce_scratch();
-
-      ThreadsExec::fence();
-
-      // Copy out only when the caller supplied storage for the result.
-      if ( result_view.ptr_on_device() ) {
-        const unsigned count = ValueTraits::value_count( m_func );
-        for ( unsigned k = 0 ; k < count ; ++k ) {
-          result_view.ptr_on_device()[k] = reduced[k];
-        }
-      }
-    }
-};
-
-//----------------------------------------------------------------------------
-
-/** \brief  ParallelReduce specialization for a TeamPolicy on Kokkos::Threads.
- *
- *  Both constructors copy the functor and policy, query the functor's
- *  team shared-memory size, resize the ThreadsExec scratch space and run
- *  'execute' through ThreadsExec::start / ThreadsExec::fence.
- *  The three-argument constructor additionally copies the reduced values
- *  from the root reduction scratch into 'result'.
- */
-template< class FunctorType , class Arg0 , class Arg1 >
-class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > >
-{
-private:
-
-  typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads >                 Policy ;
-  typedef typename Policy::work_tag                                   work_tag ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , work_tag >  ValueInit ;
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-  const FunctorType  m_func ;
-  const Policy       m_policy ;
-  const int          m_shared ;
-
-  // Enabled when TagType is void: calls m_func( member , update ).
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION
-  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
-                 const typename Policy::member_type & >::type member
-             , reference_type update ) const
-    { m_func( member , update ); }
-
-  // Enabled when TagType is not void:
-  // calls m_func( TagType() , member , update ).
-  template< class TagType >
-  KOKKOS_FORCEINLINE_FUNCTION
-  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
-                 const typename Policy::member_type & >::type member
-             , reference_type update ) const
-    { m_func( TagType() , member , update ); }
-
-  // Per-thread entry point: 'arg' is the ParallelReduce object given to
-  // ThreadsExec::start.
-  static void execute( ThreadsExec & exec , const void * arg )
-  {
-    const ParallelReduce & self = * ((const ParallelReduce *) arg );
-
-    // Initialize thread-local value
-    reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
-
-    typename Policy::member_type member( exec , self.m_policy , self.m_shared );
-    for ( ; member.valid() ; member.next() ) {
-      self.ParallelReduce::template driver< work_tag >( member , update );
-    }
-
-    exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func );
-  }
-
-public:
-
-  // Run the reduction without copying the result anywhere.
-  ParallelReduce( const FunctorType & functor
-                , const Policy      & policy )
-    : m_func( functor )
-    , m_policy( policy )
-    , m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
-    {
-      ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
-
-      ThreadsExec::start( & ParallelReduce::execute , this );
-
-      ThreadsExec::fence();
-    }
-
-  // Run the reduction and copy the reduced values into 'result'.
-  template< class ViewType >
-  ParallelReduce( const FunctorType & functor
-                , const Policy      & policy
-                , const ViewType    & result )
-    : m_func( functor )
-    , m_policy( policy )
-    , m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
-    {
-      ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
-
-      ThreadsExec::start( & ParallelReduce::execute , this );
-
-      // The scratch pointer is fetched before the fence, but only
-      // dereferenced after it.
-      const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
-
-      ThreadsExec::fence();
-
-      // Skip the copy when the view has no storage, matching the
-      // RangePolicy ParallelReduce; previously a null pointer was
-      // written through here.
-      if ( result.ptr_on_device() ) {
-        const unsigned n = ValueTraits::value_count( m_func );
-        for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
-      }
-    }
-};
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-/** \brief  ParallelScan specialization for a RangePolicy on Kokkos::Threads.
- *
- *  Each thread passes over its work range twice: first with 'final'
- *  false, then, after the ThreadsExec scan step, with 'final' true.
- */
-template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
-class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
-{
-private:
-
-  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
-  typedef typename Policy::work_tag                                   work_tag ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , work_tag >  ValueInit ;
-
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
-
-  const FunctorType  m_func ;
-  const Policy       m_policy ;
-
-  // Enabled when the range's work_tag is void:
-  // calls functor( i , update , final ).
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if<
-                 ( Impl::is_same< typename PType::work_tag , void >::value )
-                 , const FunctorType & >::type functor
-             , reference_type update
-             , const bool    final
-             , const PType & range )
-    {
-      typedef typename PType::member_type index_type ;
-
-      const index_type work_end = range.end();
-
-      for ( index_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
-        functor( iwork , update , final );
-      }
-    }
-
-  // Enabled when the range has a non-void work_tag:
-  // calls functor( work_tag() , i , update , final ).
-  template< class PType >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void driver( typename Impl::enable_if<
-                 ( ! Impl::is_same< typename PType::work_tag , void >::value )
-                 , const FunctorType & >::type functor
-             , reference_type update
-             , const bool    final
-             , const PType & range )
-    {
-      typedef typename PType::member_type index_type ;
-
-      const index_type work_end = range.end();
-
-      for ( index_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
-        functor( typename PType::work_tag() , iwork , update , final );
-      }
-    }
-
-  // Per-thread entry point: 'arg' is the ParallelScan object given to
-  // ThreadsExec::start.
-  static void execute( ThreadsExec & exec , const void * arg )
-  {
-    const ParallelScan & self = * static_cast< const ParallelScan * >( arg );
-
-    const typename Policy::WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
-
-    reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
-
-    // First pass: final == false.
-    driver( self.m_func , update , false , range );
-
-    //  exec.<FunctorType,work_tag>scan_large( self.m_func );
-    exec.template scan_small<FunctorType,work_tag>( self.m_func );
-
-    // Second pass: final == true.
-    driver( self.m_func , update , true , range );
-
-    exec.fan_in();
-  }
-
-public:
-
-  ParallelScan( const FunctorType & functor , const Policy & policy )
-    : m_func( functor )
-    , m_policy( policy )
-    {
-      // Twice the functor's value size is requested for reduction scratch.
-      ThreadsExec::resize_scratch( 2 * ValueTraits::value_size( m_func ) , 0 );
-
-      ThreadsExec::start( & ParallelScan::execute , this );
-
-      ThreadsExec::fence();
-    }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */
-
--- a/lib/kokkos/core/src/build.cuda.mac
+++ b/lib/kokkos/core/src/build.cuda.mac
@ -1,28 +0,0 @@
-#!/bin/bash
-
-touch KokkosCore_config.h
-
-#flags="-I../ -I./ -I../../../TPL -c -O3 -arch=sm_30 -Xcompiler -fPIC -DKOKKOS_HAVE_CUDA -DKOKKOS_HAVE_PTHREAD --compiler-bindir=/Users/mhoemme/pkg/gcc-4.7.2/bin"
-flags="-I../ -I./ -I../../../TPL -c -O3 -arch=sm_30 -Xcompiler -fPIC -DKOKKOS_HAVE_CUDA -DKOKKOS_HAVE_PTHREAD"
-CC=nvcc
-cd Cuda
-rm *.o
-$CC $flags Kokkos_Cuda_Impl.cu
-$CC $flags Kokkos_CudaSpace.cu
-cd ..
-cd impl
-rm *.o
-$CC $flags Kokkos_hwloc.cpp
-$CC $flags Kokkos_MemoryTracking.cpp
-$CC $flags Kokkos_Shape.cpp
-$CC $flags Kokkos_Error.cpp
-$CC $flags Kokkos_HostSpace.cpp
-$CC $flags Kokkos_Serial.cpp
-cd ..
-cd Threads
-rm *.o
-$CC $flags Kokkos_ThreadsExec.cpp
-$CC $flags Kokkos_ThreadsExec_base.cpp
-cd ..
-$CC -arch=sm_35 -lib -o libkokkoscore-cuda.a Cuda/*.o impl/*.o Threads/*.o
-
--- a/lib/kokkos/core/src/build_common.sh
+++ b/lib/kokkos/core/src/build_common.sh
@ -1,281 +0,0 @@
-#!/bin/bash
-
-#-----------------------------------------------------------------------------
-# Shared portion of build script for the base Kokkos functionality
-# Simple build script with options
-#-----------------------------------------------------------------------------
-# KOKKOS must name a directory that contains src/ with its impl, Cuda,
-# OpenMP and Threads sub-directories; otherwise stop with an error.
-if    [ ! -d "${KOKKOS}" ] \
-   || [ ! -d "${KOKKOS}/src" ] \
-   || [ ! -d "${KOKKOS}/src/impl" ] \
-   || [ ! -d "${KOKKOS}/src/Cuda" ] \
-   || [ ! -d "${KOKKOS}/src/OpenMP" ] \
-   || [ ! -d "${KOKKOS}/src/Threads" ]
-then
-  echo "Must set KOKKOS to the kokkos/core directory"
-  exit -1
-fi
-
-#-----------------------------------------------------------------------------
-# Include search path: the Kokkos sources plus the sibling TPL directory.
-
-INC_PATH="-I${KOKKOS}/src -I${KOKKOS}/../TPL"
-
-#-----------------------------------------------------------------------------
-
-# Consume the positional arguments one keyword at a time.
-# Some keywords (HWLOC, MPI, CUDA, CUDA_OSX, MKL) take the following
-# argument as their value and shift it away as well.
-# The loop ends at the first empty argument; an unrecognized keyword
-# prints a message and exits.
-while [ -n "${1}" ] ; do
-
-ARG="${1}"
-shift 1
-
-case ${ARG} in
-#----------- OPTIONS -----------
-OPT | opt | O3 | -O3 ) OPTFLAGS="${OPTFLAGS} -O3" ;;
-#-------------------------------
-# Debug keyword: only enables KOKKOS_EXPRESSION_CHECK; no compiler flag is added here.
-DBG | dbg | g | -g )   KOKKOS_EXPRESSION_CHECK=1 ;;
-#-------------------------------
-# HWLOC <path> : remember the hwloc installation directory.
-HWLOC | hwloc ) KOKKOS_HAVE_HWLOC=${1} ; shift 1 ;;
-#-------------------------------
-# MPI <path> : compile and link with <path>/bin/mpicxx.
-MPI | mpi )
-  KOKKOS_HAVE_MPI=${1} ; shift 1
-  CXX="${KOKKOS_HAVE_MPI}/bin/mpicxx"
-  LINK="${KOKKOS_HAVE_MPI}/bin/mpicxx"  
-  INC_PATH="${INC_PATH} -I${KOKKOS_HAVE_MPI}/include"
-  ;;
-#-------------------------------
-OMP | omp | OpenMP )
-  KOKKOS_HAVE_OPENMP=1
-  ;;
-#-------------------------------
-# CUDA <arch> : build the Cuda sources into libCuda.a with nvcc.
-CUDA | Cuda | cuda )
-  # CUDA_ARCH options: 20 30 35
-  CUDA_ARCH=${1} ; shift 1
-  #
-  # -x cu : process all files through the Cuda compiler as Cuda code.
-  # -lib -o : produce library
-  #
-  NVCC="nvcc -gencode arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}"
-  NVCC="${NVCC} -maxrregcount=64"
-  NVCC="${NVCC} -Xcompiler -Wall,-ansi"
-  NVCC="${NVCC} -lib -o libCuda.a -x cu"
-
-  NVCC_SOURCES="${NVCC_SOURCES} ${KOKKOS}/src/Cuda/*.cu"
-  LIB="${LIB} libCuda.a -L/usr/local/cuda/lib64 -lcudart -lcusparse"
-  ;;#-------------------------------
-# CUDA_OSX <arch> : as CUDA, with -m64 for the host compiler and the
-# CUDA 5.5 library location under /Developer.
-CUDA_OSX | Cuda_OSX | cuda_osx )
-  # CUDA_ARCH options: 20 30 35
-  CUDA_ARCH=${1} ; shift 1
-  #
-  # -x cu : process all files through the Cuda compiler as Cuda code.
-  # -lib -o : produce library
-  #
-  NVCC="nvcc -gencode arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}"
-  NVCC="${NVCC} -maxrregcount=64"
-  NVCC="${NVCC} -Xcompiler -Wall,-ansi -Xcompiler -m64"
-  NVCC="${NVCC} -lib -o libCuda.a -x cu"
-
-  NVCC_SOURCES="${NVCC_SOURCES} ${KOKKOS}/src/Cuda/*.cu"
-  LIB="${LIB} libCuda.a -Xlinker -rpath -Xlinker /Developer/NVIDIA/CUDA-5.5/lib -L /Developer/NVIDIA/CUDA-5.5/lib -lcudart -lcusparse"
-  ;;
-#-------------------------------
-GNU | gnu | g++ )
-  # Turn on lots of warnings and ansi compliance.
-  # The Trilinos build system requires '-pedantic'
-  # 
-  CXX="g++ -Wall -Wextra -ansi -pedantic"
-  LINK="g++"
-  CXX="${CXX} -rdynamic -DENABLE_TRACEBACK"
-  LIB="${LIB} -ldl"
-  ;;
-#-------------------------------
-GNU_OSX | gnu_osx | g++_osx )
-  # Turn on lots of warnings and ansi compliance.
-  # The Trilinos build system requires '-pedantic'
-  # 
-  # Differs from the GNU entry: adds -m64 and omits -rdynamic.
-  CXX="g++ -Wall -Wextra -ansi -pedantic -m64"
-  LINK="g++"
-  CXX="${CXX} -DENABLE_TRACEBACK"
-  LIB="${LIB} -ldl"
-  ;;
-#-------------------------------
-INTEL | intel | icc | icpc )
-  # -xW = use SSE and SSE2 instructions
-  # (noted for reference only: -xW is not passed below)
-  CXX="icpc -Wall"
-  LINK="icpc"
-  LIB="${LIB} -lstdc++"
-  ;;
-#-------------------------------
-MPIINTEL | mpiintel | mpiicc | mpiicpc )
-  # -xW = use SSE and SSE2 instructions
-  # (noted for reference only: -xW is not passed below)
-  CXX="mpiicpc -Wall"
-  LINK="mpiicpc"
-  LIB="${LIB} -lstdc++"
-  KOKKOS_HAVE_MPI=1
-;;
-#-------------------------------
-# MIC : Intel compiler targeting -mmic; sets COMPILE_MIC, which the MKL
-# entry reads to choose its library directory.
-MIC | mic )
-  CXX="icpc -mmic -ansi-alias -Wall"
-  LINK="icpc -mmic"
-  CXX="${CXX} -mGLOB_default_function_attrs=knc_stream_store_controls=2"
-  # CXX="${CXX} -vec-report6"
-  # CXX="${CXX} -guide-vec"
-  LIB="${LIB} -lstdc++"
-  COMPILE_MIC="on"
-  ;;
-#-------------------------------
-MPIMIC | mpimic )
-  CXX="mpiicpc -mmic -ansi-alias -Wall"
-  LINK="mpiicpc -mmic"
-  KOKKOS_HAVE_MPI=1
-  CXX="${CXX} -mGLOB_default_function_attrs=knc_stream_store_controls=2"
-  # CXX="${CXX} -vec-report6"
-  # CXX="${CXX} -guide-vec"
-  LIB="${LIB} -lstdc++"
-  COMPILE_MIC="on"
-  ;;
-#-------------------------------
-# curie : Cray 'CC' compiler wrapper with a fixed MPI include directory.
-curie )
-  CXX="CC"
-  LINK="CC"
-  INC_PATH="${INC_PATH} -I/opt/cray/mpt/default/gni/mpich2-cray/74"
-  KOKKOS_HAVE_MPI=1
-  ;;  
-#-------------------------------
-# MKL <path> : link against MKL below <path>.
-# NOTE: the library directory is 'mic' only if COMPILE_MIC is already set,
-# so a MIC / MPIMIC keyword has to come before MKL on the command line.
-MKL | mkl )
-  HAVE_MKL=${1} ; shift 1 ;
-  CXX_FLAGS="${CXX_FLAGS} -DKOKKOS_USE_MKL -I${HAVE_MKL}/include/"
-  ARCH="intel64"
-  if [ -n "${COMPILE_MIC}" ] ;
-  then
-    ARCH="mic"
-  fi
-  LIB="${LIB}  -L${HAVE_MKL}/lib/${ARCH}/ -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core"
-  NVCC_FLAGS="${NVCC_FLAGS} -DKOKKOS_USE_MKL"
-;;
-#-------------------------------
-CUSPARSE | cusparse )
-  CXX_FLAGS="${CXX_FLAGS} -DKOKKOS_USE_CUSPARSE"
-  NVCC_FLAGS="${NVCC_FLAGS} -DKOKKOS_USE_CUSPARSE"
-  LIB="${LIB} -lcusparse"
-;;
-#-------------------------------
-AVX | avx )
-  CXX_FLAGS="${CXX_FLAGS} -mavx"
-;;
-#-------------------------------
-# Anything else is an error.
-*) echo 'unknown option: ' ${ARG} ; exit -1 ;;
-esac
-done
-
-#-----------------------------------------------------------------------------
-
-if [ -z "${CXX}" ] ;
-then
-  echo "No C++ compiler selected"
-  exit -1
-fi
-
-if [ -n "${KOKKOS_HAVE_OPENMP}" ]
-then
-CXX="${CXX} -fopenmp"
-CXX_SOURCES="${CXX_SOURCES} ${KOKKOS}/src/OpenMP/*.cpp"
-fi
-
-#-----------------------------------------------------------------------------
-# Option for PTHREAD or WINTHREAD eventually
-
-KOKKOS_HAVE_PTHREAD=1
-
-if [ -n "${KOKKOS_HAVE_PTHREAD}" ] ;
-then
-  LIB="${LIB} -lpthread"
-fi
-
-#-----------------------------------------------------------------------------
-# Option for enabling the Serial device
-
-KOKKOS_HAVE_SERIAL=1
-
-#-----------------------------------------------------------------------------
-# Attach options to compile lines
-
-CXX="${CXX} ${OPTFLAGS}"
-
-if [ -n "${NVCC}" ] ;
-then
-  NVCC="${NVCC} ${OPTFLAGS}"
-fi
-
-#-----------------------------------------------------------------------------
-
-CXX_SOURCES="${CXX_SOURCES} ${KOKKOS}/src/impl/*.cpp"
-CXX_SOURCES="${CXX_SOURCES} ${KOKKOS}/src/Threads/*.cpp"
-
-#-----------------------------------------------------------------------------
-#
-
-# hwloc support: when an installation directory was given, verify that it
-# exists and add its lib/ and include/ directories to the build.
-if [ -n "${KOKKOS_HAVE_HWLOC}" ] ;
-then
-
-  # Quoted so that a path containing whitespace is tested as one word.
-  if [ ! -d "${KOKKOS_HAVE_HWLOC}" ] ;
-  then
-    echo "${KOKKOS_HAVE_HWLOC} does not exist"
-    exit 1
-  fi
-
-  echo "LD_LIBRARY_PATH must include ${KOKKOS_HAVE_HWLOC}/lib"
-
-  LIB="${LIB} -L${KOKKOS_HAVE_HWLOC}/lib -lhwloc"
-  INC_PATH="${INC_PATH} -I${KOKKOS_HAVE_HWLOC}/include"
-fi
-
-#-----------------------------------------------------------------------------
-
-INC_PATH="${INC_PATH} -I."
-
-CONFIG="KokkosCore_config.h"
-
-rm -f ${CONFIG}
-
-echo "#ifndef KOKKOS_CORE_CONFIG_H" >> ${CONFIG}
-echo "#define KOKKOS_CORE_CONFIG_H" >> ${CONFIG}
-
-if [ -n "${KOKKOS_HAVE_MPI}" ] ;
-then
-  echo "#define KOKKOS_HAVE_MPI" >> ${CONFIG}
-fi
-
-if [ -n "${NVCC}" ] ;
-then
-  echo "#define KOKKOS_HAVE_CUDA" >> ${CONFIG}
-fi
-
-if [ -n "${KOKKOS_HAVE_PTHREAD}" ] ;
-then
-  echo "#define KOKKOS_HAVE_PTHREAD" >> ${CONFIG}
-fi
-
-if [ -n "${KOKKOS_HAVE_SERIAL}" ] ;
-then
-  echo "#define KOKKOS_HAVE_SERIAL" >> ${CONFIG}
-fi
-
-if [ -n "${KOKKOS_HAVE_HWLOC}" ] ;
-then
-  echo "#define KOKKOS_HAVE_HWLOC" >> ${CONFIG}
-fi
-
-if [ -n "${KOKKOS_HAVE_OPENMP}" ] ;
-then
-  echo "#define KOKKOS_HAVE_OPENMP" >> ${CONFIG}
-fi
-
-if [ -n "${KOKKOS_EXPRESSION_CHECK}" ] ;
-then
-  echo "#define KOKKOS_EXPRESSION_CHECK" >> ${CONFIG}
-fi
-
-echo "#endif" >> ${CONFIG}
-
-#-----------------------------------------------------------------------------
-
--- a/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp
@ -1,260 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_ANALYZESHAPE_HPP
-#define KOKKOS_ANALYZESHAPE_HPP
-
-#include <impl/Kokkos_Shape.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-
-/** \brief  Analyze the array shape defined by a Kokkos::View data type.
- *
- *  The data type is presumed to map down to a multidimensional array of
- *  an intrinsic scalar numerical type (double, float, int, ... ).
- *  An array's 'value_type' may be an embedded aggregate type, such as a
- *  fixed length array 'Array<T,N>'; in that case 'array_intrinsic_type'
- *  represents the underlying array of intrinsic scalar numerical type.
- *
- *  Such an embedded aggregate type must provide its own AnalyzeShape
- *  specialization that maps it down to a shape and an intrinsic scalar
- *  numerical type.
- *
- *  This primary template is the base case: every member type is T with
- *  the corresponding const-ness and the shape is Shape< sizeof(T) , 0 >.
- */
-template< class T >
-struct AnalyzeShape : public Shape< sizeof(T) , 0 >
-{
-  typedef void specialize ;
-
-  typedef Shape< sizeof(T), 0 >  shape ;
-
-  // Array of intrinsic scalars
-  typedef       T  array_intrinsic_type ;
-  typedef const T  const_array_intrinsic_type ;
-  typedef       T  non_const_array_intrinsic_type ;
-
-  // Value type
-  typedef       T  value_type ;
-  typedef const T  const_value_type ;
-  typedef       T  non_const_value_type ;
-
-  // Complete data type
-  typedef       T  type ;
-  typedef const T  const_type ;
-  typedef       T  non_const_type ;
-};
-
-/** \brief  Specialization for 'void': shape is Shape< 0 , 0 > and every
- *          member type is void with the corresponding const-ness.
- */
-template<>
-struct AnalyzeShape<void> : public Shape< 0 , 0 >
-{
-  typedef void specialize ;
-
-  typedef Shape< 0 , 0 >  shape ;
-
-  // Array of intrinsic scalars
-  typedef       void  array_intrinsic_type ;
-  typedef const void  const_array_intrinsic_type ;
-  typedef       void  non_const_array_intrinsic_type ;
-
-  // Value type
-  typedef       void  value_type ;
-  typedef const void  const_value_type ;
-  typedef       void  non_const_value_type ;
-
-  // Complete data type
-  typedef       void  type ;
-  typedef const void  const_type ;
-  typedef       void  non_const_type ;
-};
-
-/** \brief  Specialization for 'const T': same shape as AnalyzeShape<T>;
- *          the plain member types become T's const variants.
- */
-template< class T >
-struct AnalyzeShape< const T > : public AnalyzeShape<T>::shape
-{
-private:
-  typedef AnalyzeShape<T> inner ;
-public:
-
-  typedef typename inner::specialize specialize ;
-
-  typedef typename inner::shape shape ;
-
-  // Array of intrinsic scalars
-  typedef typename inner::const_array_intrinsic_type      array_intrinsic_type ;
-  typedef typename inner::const_array_intrinsic_type      const_array_intrinsic_type ;
-  typedef typename inner::non_const_array_intrinsic_type  non_const_array_intrinsic_type ;
-
-  // Value type
-  typedef typename inner::const_value_type      value_type ;
-  typedef typename inner::const_value_type      const_value_type ;
-  typedef typename inner::non_const_value_type  non_const_value_type ;
-
-  // Complete data type
-  typedef typename inner::const_type      type ;
-  typedef typename inner::const_type      const_type ;
-  typedef typename inner::non_const_type  non_const_type ;
-};
-
-/** \brief  Specialization for 'T *': the shape is T's shape passed
- *          through ShapeInsert< ... , 0 >, and the array and data types
- *          gain a '*'.  The value types are unchanged.
- */
-template< class T >
-struct AnalyzeShape< T * >
-  : public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type
-{
-private:
-  typedef AnalyzeShape<T> inner ;
-public:
-
-  typedef typename inner::specialize specialize ;
-
-  typedef typename ShapeInsert< typename inner::shape , 0 >::type shape ;
-
-  // Array of intrinsic scalars
-  typedef typename inner::array_intrinsic_type           * array_intrinsic_type ;
-  typedef typename inner::const_array_intrinsic_type     * const_array_intrinsic_type ;
-  typedef typename inner::non_const_array_intrinsic_type * non_const_array_intrinsic_type ;
-
-  // Value type
-  typedef typename inner::value_type            value_type ;
-  typedef typename inner::const_value_type      const_value_type ;
-  typedef typename inner::non_const_value_type  non_const_value_type ;
-
-  // Complete data type
-  typedef typename inner::type           * type ;
-  typedef typename inner::const_type     * const_type ;
-  typedef typename inner::non_const_type * non_const_type ;
-};
-
-/** \brief  Specialization for 'T[]': the shape is T's shape passed
- *          through ShapeInsert< ... , 0 >, and the array and data types
- *          gain a '[]'.  The value types are unchanged.
- */
-template< class T >
-struct AnalyzeShape< T[] >
-  : public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type
-{
-private:
-  typedef AnalyzeShape<T> inner ;
-public:
-
-  typedef typename inner::specialize specialize ;
-
-  typedef typename ShapeInsert< typename inner::shape , 0 >::type shape ;
-
-  // Array of intrinsic scalars
-  typedef typename inner::array_intrinsic_type            array_intrinsic_type [] ;
-  typedef typename inner::const_array_intrinsic_type      const_array_intrinsic_type [] ;
-  typedef typename inner::non_const_array_intrinsic_type  non_const_array_intrinsic_type [] ;
-
-  // Value type
-  typedef typename inner::value_type            value_type ;
-  typedef typename inner::const_value_type      const_value_type ;
-  typedef typename inner::non_const_value_type  non_const_value_type ;
-
-  // Complete data type
-  typedef typename inner::type            type [] ;
-  typedef typename inner::const_type      const_type [] ;
-  typedef typename inner::non_const_type  non_const_type [] ;
-};
-
-/** \brief  Specialization for 'const T[]': as for 'T[]', but built on
- *          AnalyzeShape< const T >.
- */
-template< class T >
-struct AnalyzeShape< const T[] >
-  : public ShapeInsert< typename AnalyzeShape< const T >::shape , 0 >::type
-{
-private:
-  typedef AnalyzeShape< const T > inner ;
-public:
-
-  typedef typename inner::specialize specialize ;
-
-  typedef typename ShapeInsert< typename inner::shape , 0 >::type shape ;
-
-  // Array of intrinsic scalars
-  typedef typename inner::array_intrinsic_type            array_intrinsic_type [] ;
-  typedef typename inner::const_array_intrinsic_type      const_array_intrinsic_type [] ;
-  typedef typename inner::non_const_array_intrinsic_type  non_const_array_intrinsic_type [] ;
-
-  // Value type
-  typedef typename inner::value_type            value_type ;
-  typedef typename inner::const_value_type      const_value_type ;
-  typedef typename inner::non_const_value_type  non_const_value_type ;
-
-  // Complete data type
-  typedef typename inner::type            type [] ;
-  typedef typename inner::const_type      const_type [] ;
-  typedef typename inner::non_const_type  non_const_type [] ;
-};
-
-/** \brief  Specialization for 'T[N]': the shape is T's shape passed
- *          through ShapeInsert< ... , N >, and the array and data types
- *          gain a '[N]'.  The value types are unchanged.
- */
-template< class T , unsigned N >
-struct AnalyzeShape< T[N] >
-  : public ShapeInsert< typename AnalyzeShape<T>::shape , N >::type
-{
-private:
-  typedef AnalyzeShape<T> inner ;
-public:
-
-  typedef typename inner::specialize specialize ;
-
-  typedef typename ShapeInsert< typename inner::shape , N >::type shape ;
-
-  // Array of intrinsic scalars
-  typedef typename inner::array_intrinsic_type            array_intrinsic_type [N] ;
-  typedef typename inner::const_array_intrinsic_type      const_array_intrinsic_type [N] ;
-  typedef typename inner::non_const_array_intrinsic_type  non_const_array_intrinsic_type [N] ;
-
-  // Value type
-  typedef typename inner::value_type            value_type ;
-  typedef typename inner::const_value_type      const_value_type ;
-  typedef typename inner::non_const_value_type  non_const_value_type ;
-
-  // Complete data type
-  typedef typename inner::type            type [N] ;
-  typedef typename inner::const_type      const_type [N] ;
-  typedef typename inner::non_const_type  non_const_type [N] ;
-};
-
-/** \brief  Specialization for 'const T[N]': as for 'T[N]', but built on
- *          AnalyzeShape< const T >.
- */
-template< class T , unsigned N >
-struct AnalyzeShape< const T[N] >
-  : public ShapeInsert< typename AnalyzeShape< const T >::shape , N >::type
-{
-private:
-  typedef AnalyzeShape< const T > inner ;
-public:
-
-  typedef typename inner::specialize specialize ;
-
-  typedef typename ShapeInsert< typename inner::shape , N >::type shape ;
-
-  // Array of intrinsic scalars
-  typedef typename inner::array_intrinsic_type            array_intrinsic_type [N] ;
-  typedef typename inner::const_array_intrinsic_type      const_array_intrinsic_type [N] ;
-  typedef typename inner::non_const_array_intrinsic_type  non_const_array_intrinsic_type [N] ;
-
-  // Value type
-  typedef typename inner::value_type            value_type ;
-  typedef typename inner::const_value_type      const_value_type ;
-  typedef typename inner::non_const_value_type  non_const_value_type ;
-
-  // Complete data type
-  typedef typename inner::type            type [N] ;
-  typedef typename inner::const_type      const_type [N] ;
-  typedef typename inner::non_const_type  non_const_type [N] ;
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-#endif /* #ifndef KOKKOS_ANALYZESHAPE_HPP */
-
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly_X86.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly_X86.hpp
@ -1,176 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_ASSEMBLY_X86_HPP )
-#define KOKKOS_ATOMIC_ASSEMBLY_X86_HPP
-namespace Kokkos {
-
-#ifndef __CUDA_ARCH__
-/** \brief  x86 specialization: increment the char at 'a' with a single
- *          lock-prefixed 'incb' instruction.
- *
- *  The memory operand is read and written by the instruction, so it is
- *  declared as a read-write ("+m") output rather than as an input.
- */
-template<>
-KOKKOS_INLINE_FUNCTION
-void atomic_increment<char>(volatile char* a) {
-  __asm__ __volatile__(
-    "lock incb %0"
-    : "+m" (a[0])              /* modified in place */
-    : /* no input registers */
-    : "memory" , "cc"          /* inc updates the arithmetic flags */
-  );
-}
-
-/** \brief  x86 specialization: increment the short at 'a' with a single
- *          lock-prefixed 'incw' instruction.
- *
- *  The memory operand is read and written by the instruction, so it is
- *  declared as a read-write ("+m") output rather than as an input.
- */
-template<>
-KOKKOS_INLINE_FUNCTION
-void atomic_increment<short>(volatile short* a) {
-  __asm__ __volatile__(
-    "lock incw %0"
-    : "+m" (a[0])              /* modified in place */
-    : /* no input registers */
-    : "memory" , "cc"          /* inc updates the arithmetic flags */
-  );
-}
-
-/** \brief  x86 specialization: increment the int at 'a' with a single
- *          lock-prefixed 'incl' instruction.
- *
- *  The memory operand is read and written by the instruction, so it is
- *  declared as a read-write ("+m") output rather than as an input.
- */
-template<>
-KOKKOS_INLINE_FUNCTION
-void atomic_increment<int>(volatile int* a) {
-  __asm__ __volatile__(
-    "lock incl %0"
-    : "+m" (a[0])              /* modified in place */
-    : /* no input registers */
-    : "memory" , "cc"          /* inc updates the arithmetic flags */
-  );
-}
-
-/** \brief  x86 specialization: increment the long long int at 'a' with a
- *          single lock-prefixed 'incq' instruction.
- *
- *  The memory operand is read and written by the instruction, so it is
- *  declared as a read-write ("+m") output rather than as an input.
- *  NOTE(review): 'incq' is a 64-bit instruction; presumably this header
- *  is only used for x86-64 builds - confirm against the including file.
- */
-template<>
-KOKKOS_INLINE_FUNCTION
-void atomic_increment<long long int>(volatile long long int* a) {
-  __asm__ __volatile__(
-    "lock incq %0"
-    : "+m" (a[0])              /* modified in place */
-    : /* no input registers */
-    : "memory" , "cc"          /* inc updates the arithmetic flags */
-  );
-}
-
-/** \brief  x86 specialization: decrement the char at 'a' with a single
- *          lock-prefixed 'decb' instruction.
- *
- *  The memory operand is read and written by the instruction, so it is
- *  declared as a read-write ("+m") output rather than as an input.
- */
-template<>
-KOKKOS_INLINE_FUNCTION
-void atomic_decrement<char>(volatile char* a) {
-  __asm__ __volatile__(
-    "lock decb %0"
-    : "+m" (a[0])              /* modified in place */
-    : /* no input registers */
-    : "memory" , "cc"          /* dec updates the arithmetic flags */
-  );
-}
-
-/** \brief  x86 specialization: decrement the short at 'a' with a single
- *          lock-prefixed 'decw' instruction.
- *
- *  The memory operand is read and written by the instruction, so it is
- *  declared as a read-write ("+m") output rather than as an input.
- */
-template<>
-KOKKOS_INLINE_FUNCTION
-void atomic_decrement<short>(volatile short* a) {
-  __asm__ __volatile__(
-    "lock decw %0"
-    : "+m" (a[0])              /* modified in place */
-    : /* no input registers */
-    : "memory" , "cc"          /* dec updates the arithmetic flags */
-  );
-}
-
-/** \brief  x86 specialization: decrement the int at 'a' with a single
- *          lock-prefixed 'decl' instruction.
- *
- *  The memory operand is read and written by the instruction, so it is
- *  declared as a read-write ("+m") output rather than as an input.
- */
-template<>
-KOKKOS_INLINE_FUNCTION
-void atomic_decrement<int>(volatile int* a) {
-  __asm__ __volatile__(
-    "lock decl %0"
-    : "+m" (a[0])              /* modified in place */
-    : /* no input registers */
-    : "memory" , "cc"          /* dec updates the arithmetic flags */
-  );
-}
-
-/** \brief  x86 specialization: decrement the long long int at 'a' with a
- *          single lock-prefixed 'decq' instruction.
- *
- *  The memory operand is read and written by the instruction, so it is
- *  declared as a read-write ("+m") output rather than as an input.
- *  NOTE(review): 'decq' is a 64-bit instruction; presumably this header
- *  is only used for x86-64 builds - confirm against the including file.
- */
-template<>
-KOKKOS_INLINE_FUNCTION
-void atomic_decrement<long long int>(volatile long long int* a) {
-  __asm__ __volatile__(
-    "lock decq %0"
-    : "+m" (a[0])              /* modified in place */
-    : /* no input registers */
-    : "memory" , "cc"          /* dec updates the arithmetic flags */
-  );
-}
-#endif
-
-namespace Impl {
-  /** \brief  Pair of 64-bit words declared with 16-byte alignment. */
-  struct cas128_t
-  {
-    uint64_t lower;
-    uint64_t upper;
-
-    /** \brief  True when either word differs from the matching word of 'rhs'. */
-    KOKKOS_INLINE_FUNCTION
-    bool operator != (const cas128_t& rhs) const {
-      if ( lower != rhs.lower ) { return true ; }
-      return upper != rhs.upper ;
-    }
-  }
-  __attribute__ (( __aligned__( 16 ) ));
-
-
-
-
-  /** \brief  128-bit compare-and-swap built on 'lock cmpxchg16b'.
-   *
-   *  'cmp' is passed in rdx:rax and 'swap' in rcx:rbx.  The instruction
-   *  writes 'swap' to *ptr when *ptr equals 'cmp'; otherwise it loads the
-   *  current *ptr into rdx:rax.  In both cases 'cmp' ends up holding the
-   *  value the instruction observed in *ptr, and that is what is returned;
-   *  callers detect success by comparing the result with the 'cmp' they
-   *  passed in.
-   *
-   *  cmpxchg16b requires its memory operand to be 16-byte aligned.
-   *
-   *  The "memory" clobber keeps the compiler from moving or caching other
-   *  memory accesses across the operation.
-   */
-  inline cas128_t cas128( volatile cas128_t * ptr, cas128_t cmp,  cas128_t swap )
-  {
-    bool swapped;
-    __asm__ __volatile__
-    (
-     "lock cmpxchg16b %1\n\t"
-     "setz %0"
-     : "=q" ( swapped )
-     , "+m" ( *ptr )
-     , "+d" ( cmp.upper )
-     , "+a" ( cmp.lower )
-     : "c" ( swap.upper )
-     , "b" ( swap.lower )
-     : "cc" , "memory"
-    );
-    (void) swapped; // success flag is not part of the interface
-    return cmp;
-  }
-
-}
-}
-
-#endif
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@ -1,231 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
-#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-// Cuda native CAS supports int, unsigned int, and unsigned long long int (non-standard type).
-// Must cast-away 'volatile' for the CAS call.
-
-#if defined( KOKKOS_ATOMICS_USE_CUDA )
-
-/** \brief  Compare-and-swap on an int; forwards to CUDA atomicCAS and
- *          returns its result. */
-__inline__ __device__
-int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
-{
-  int * const target = (int*) dest ; // atomicCAS is called without 'volatile'
-  return atomicCAS( target , compare , val );
-}
-
-/** \brief  Compare-and-swap on an unsigned int; forwards to CUDA atomicCAS
- *          and returns its result. */
-__inline__ __device__
-unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val)
-{
-  unsigned int * const target = (unsigned int*) dest ; // atomicCAS is called without 'volatile'
-  return atomicCAS( target , compare , val );
-}
-
-/** \brief  Compare-and-swap on an unsigned long long int; forwards to CUDA
- *          atomicCAS and returns its result. */
-__inline__ __device__
-unsigned long long int atomic_compare_exchange( volatile unsigned long long int * const dest ,
-                                                const unsigned long long int compare ,
-                                                const unsigned long long int val )
-{
-  unsigned long long int * const target = (unsigned long long int*) dest ; // atomicCAS is called without 'volatile'
-  return atomicCAS( target , compare , val );
-}
-
-/** \brief  Compare-and-swap for any type whose size equals sizeof(int).
- *
- *  'compare' and 'val' are reinterpreted as int, handed to atomicCAS, and
- *  the int it returns is reinterpreted back as T.
- */
-template < typename T >
-__inline__ __device__
-T atomic_compare_exchange( volatile T * const dest , const T & compare ,
-  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
-{
-  const int expected = *((int*)&compare) ;
-  const int desired  = *((int*)&val) ;
-  const int previous = atomicCAS( (int*) dest , expected , desired );
-  return *((T*)&previous);
-}
-
-/** \brief  Compare-and-swap for any type whose size equals
- *          sizeof(unsigned long long int) but not sizeof(int).
- *
- *  'compare' and 'val' are reinterpreted as unsigned long long int, handed
- *  to atomicCAS, and the value it returns is reinterpreted back as T.
- */
-template < typename T >
-__inline__ __device__
-T atomic_compare_exchange( volatile T * const dest , const T & compare ,
-  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
-{
-  typedef unsigned long long int word_t ;
-  const word_t expected = *((word_t*)&compare) ;
-  const word_t desired  = *((word_t*)&val) ;
-  const word_t previous = atomicCAS( (word_t*) dest , expected , desired );
-  return *((T*)&previous);
-}
-
-/** \brief  Overload selected for types the size of Impl::cas128_t.
- *
- *  It performs no exchange: it calls Kokkos::abort with an explanatory
- *  message and then returns a default-constructed T.
- */
-template < typename T >
-__inline__ __device__
-T atomic_compare_exchange( volatile T * const dest , const T & compare ,
-  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) != sizeof(unsigned long long int) &&
-                                    sizeof(T) == sizeof(Impl::cas128_t) , const T & >::type val )
-{
-  // The arguments are intentionally unused.
-  (void) dest ;
-  (void) compare ;
-  (void) val ;
-
-  Kokkos::abort("Error: calling atomic_compare_exchange with 128bit type is not supported on CUDA execution space.");
-  return T();
-}
-
-//----------------------------------------------------------------------------
-// GCC native CAS supports int, long, unsigned int, unsigned long.
-// Intel native CAS supports int and long with the same interface as GCC.
-
-#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
-
-/** \brief  Compare-and-swap on an int; returns the result of the
- *          __sync_val_compare_and_swap builtin. */
-KOKKOS_INLINE_FUNCTION
-int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
-{
-  const int previous = __sync_val_compare_and_swap( dest , compare , val );
-  return previous ;
-}
-
-/** \brief  Compare-and-swap on a long; returns the result of the
- *          __sync_val_compare_and_swap builtin. */
-KOKKOS_INLINE_FUNCTION
-long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
-{
-  const long previous = __sync_val_compare_and_swap( dest , compare , val );
-  return previous ;
-}
-
-#if defined( KOKKOS_ATOMICS_USE_GCC )
-
-// GCC supports unsigned
-
-/** \brief  Compare-and-swap on an unsigned int; returns the result of the
- *          __sync_val_compare_and_swap builtin. */
-KOKKOS_INLINE_FUNCTION
-unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val )
-{
-  const unsigned int previous = __sync_val_compare_and_swap( dest , compare , val );
-  return previous ;
-}
-
-/** \brief  Compare-and-swap on an unsigned long; returns the result of the
- *          __sync_val_compare_and_swap builtin. */
-KOKKOS_INLINE_FUNCTION
-unsigned long atomic_compare_exchange( volatile unsigned long * const dest ,
-                                       const unsigned long compare ,
-                                       const unsigned long val )
-{
-  const unsigned long previous = __sync_val_compare_and_swap( dest , compare , val );
-  return previous ;
-}
-
-#endif
-
-/** \brief  Compare-and-swap for any type whose size equals sizeof(int).
- *
- *  'compare' and 'val' are reinterpreted as int for the
- *  __sync_val_compare_and_swap builtin; its int result is stored into a
- *  union and read back through the T member.
- */
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_compare_exchange( volatile T * const dest, const T & compare,
-  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
-{
-  // Overlay of the integer result and the T that is returned.
-#ifdef KOKKOS_HAVE_CXX11
-  union Overlay {
-    int bits ;
-    T value ;
-    KOKKOS_INLINE_FUNCTION Overlay() {};
-  } result ;
-#else
-  union Overlay {
-    int bits ;
-    T value ;
-  } result ;
-#endif
-
-  const int expected = *((int*)&compare) ;
-  const int desired  = *((int*)&val) ;
-
-  result.bits = __sync_val_compare_and_swap( (int*) dest , expected , desired );
-  return result.value ;
-}
-
-/** \brief  Compare-and-swap for any type whose size equals sizeof(long)
- *          but not sizeof(int).
- *
- *  'compare' and 'val' are reinterpreted as long for the
- *  __sync_val_compare_and_swap builtin; its long result is stored into a
- *  union and read back through the T member.
- */
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_compare_exchange( volatile T * const dest, const T & compare,
-  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) == sizeof(long) , const T & >::type val )
-{
-  // Overlay of the integer result and the T that is returned.
-#ifdef KOKKOS_HAVE_CXX11
-  union Overlay {
-    long bits ;
-    T value ;
-    KOKKOS_INLINE_FUNCTION Overlay() {};
-  } result ;
-#else
-  union Overlay {
-    long bits ;
-    T value ;
-  } result ;
-#endif
-
-  const long expected = *((long*)&compare) ;
-  const long desired  = *((long*)&val) ;
-
-  result.bits = __sync_val_compare_and_swap( (long*) dest , expected , desired );
-  return result.value ;
-}
-
-/** \brief  Compare-and-swap for any type whose size equals
- *          sizeof(Impl::cas128_t) and neither sizeof(int) nor sizeof(long).
- *
- *  'compare' and 'val' are reinterpreted as Impl::cas128_t and passed to
- *  Impl::cas128; its result is stored into a union and read back through
- *  the T member.
- */
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_compare_exchange( volatile T * const dest, const T & compare,
-  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) != sizeof(long) &&
-                                    sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val )
-{
-  // Overlay of the 128-bit result and the T that is returned.
-#ifdef KOKKOS_HAVE_CXX11
-  union Overlay {
-    Impl::cas128_t bits ;
-    T value ;
-    KOKKOS_INLINE_FUNCTION Overlay() {};
-  } result ;
-#else
-  union Overlay {
-    Impl::cas128_t bits ;
-    T value ;
-  } result ;
-#endif
-
-  Impl::cas128_t * const target = (Impl::cas128_t*) dest ;
-
-  result.bits = Impl::cas128( target , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) );
-  return result.value ;
-}
-//----------------------------------------------------------------------------
-
-#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
-
-/** \brief  Compare-and-swap implemented with an OpenMP critical section.
- *
- *  Inside the critical section the current dest[0] is read; if it equals
- *  'compare' it is overwritten with 'val'.  The value that was read is
- *  returned.
- */
-template< typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_compare_exchange( volatile T * const dest, const T compare, const T val )
-{
-  T observed ;
-#pragma omp critical
-  {
-    observed = dest[0];
-    if ( observed == compare ) {
-      dest[0] = val;
-    }
-  }
-  return observed ;
-}
-
-#endif
-
-
-/** \brief  Boolean form of atomic_compare_exchange: true when the value it
- *          returns compares equal to 'compare'. */
-template <typename T>
-KOKKOS_INLINE_FUNCTION
-bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
-{
-  const bool matched = ( compare == atomic_compare_exchange(dest, compare, val) );
-  return matched ;
-}
-
-//----------------------------------------------------------------------------
-
-} // namespace Kokkos
-
-#endif
-
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@ -1,305 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
-#define KOKKOS_ATOMIC_EXCHANGE_HPP
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_ATOMICS_USE_CUDA )
-
-/** \brief  Exchange on an int; forwards to CUDA atomicExch and returns
- *          its result. */
-__inline__ __device__
-int atomic_exchange( volatile int * const dest , const int val )
-{
-  int * const target = (int*) dest ; // atomicExch is called without 'volatile'
-  return atomicExch( target , val );
-}
-
-/** \brief  Exchange on an unsigned int; forwards to CUDA atomicExch and
- *          returns its result. */
-__inline__ __device__
-unsigned int atomic_exchange( volatile unsigned int * const dest , const unsigned int val )
-{
-  unsigned int * const target = (unsigned int*) dest ; // atomicExch is called without 'volatile'
-  return atomicExch( target , val );
-}
-
-/** \brief  Exchange on an unsigned long long int; forwards to CUDA
- *          atomicExch and returns its result. */
-__inline__ __device__
-unsigned long long int atomic_exchange( volatile unsigned long long int * const dest , const unsigned long long int val )
-{
-  unsigned long long * const target = (unsigned long long*) dest ; // atomicExch is called without 'volatile'
-  return atomicExch( target , val );
-}
-
-/** \brief  Atomic exchange for any type whose size equals sizeof(int).
- *
- *  'val' is reinterpreted as int and handed to atomicExch; the int it
- *  returns is reinterpreted back as T.
- */
-template< typename T >
-__inline__ __device__
-T atomic_exchange(
-  volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
-{
-  int * const target  = (int*) dest ;
-  const int   desired = *((int*)&val) ;
-  int previous = atomicExch( target , desired );
-  return *((T*)&previous);
-}
-
-/** \brief  Atomic exchange for any type whose size equals
- *          sizeof(unsigned long long int) but not sizeof(int).
- *
- *  'val' is reinterpreted as unsigned long long int and handed to
- *  atomicExch; the value it returns is reinterpreted back as T.
- */
-template< typename T >
-__inline__ __device__
-T atomic_exchange(
-  volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
-{
-  typedef unsigned long long int word_t ;
-  word_t * const target  = (word_t*) dest ;
-  const word_t   desired = *((word_t*)&val) ;
-  word_t previous = atomicExch( target , desired );
-  return *((T*)&previous);
-}
-
-/** \brief  Overload selected for types the size of Impl::cas128_t.
- *
- *  It performs no exchange: it calls Kokkos::abort with an explanatory
- *  message and then returns a default-constructed T.
- */
-template< typename T >
-__inline__ __device__
-T atomic_exchange(
-  volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) != sizeof(unsigned long long int) &&
-                                    sizeof(T) == sizeof(Impl::cas128_t) , const T & >::type val )
-{
-  // The arguments are intentionally unused.
-  (void) dest ;
-  (void) val ;
-
-  Kokkos::abort("Error: calling atomic_exchange with 128bit type is not supported on CUDA execution space.");
-  return T();
-}
-
-/** \brief  Atomic assignment for any type whose size equals sizeof(int).
- *
- *  'val' is reinterpreted as int and stored with atomicExch; the value
- *  atomicExch returns is discarded.
- */
-template< typename T >
-__inline__ __device__
-void atomic_assign(
-  volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
-{
-  int * const target  = (int*) dest ;
-  const int   desired = *((int*)&val) ;
-  (void) atomicExch( target , desired );
-}
-
-/** \brief  Atomic assignment for any type whose size equals
- *          sizeof(unsigned long long int) but not sizeof(int).
- *
- *  'val' is reinterpreted as unsigned long long int and stored with
- *  atomicExch; the value atomicExch returns is discarded.
- */
-template< typename T >
-__inline__ __device__
-void atomic_assign(
-  volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
-{
-  typedef unsigned long long int word_t ;
-  word_t * const target  = (word_t*) dest ;
-  const word_t   desired = *((word_t*)&val) ;
-  (void) atomicExch( target , desired );
-}
-
-/** \brief  Overload selected for types the size of Impl::cas128_t.
- *
- *  It performs no assignment: it only calls Kokkos::abort with an
- *  explanatory message.
- */
-template< typename T >
-__inline__ __device__
-void atomic_assign(
-  volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) != sizeof(unsigned long long int) &&
-                                    sizeof(T) == sizeof(Impl::cas128_t) , const T & >::type val )
-{
-  // The arguments are intentionally unused.
-  (void) dest ;
-  (void) val ;
-
-  Kokkos::abort("Error: calling atomic_assign with 128bit type is not supported on CUDA execution space.");
-}
-
-//----------------------------------------------------------------------------
-
-#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
-
-/** \brief  Atomic exchange for any type the size of int or long.
- *
- *  Implemented as a compare-and-swap retry loop: the last value seen in
- *  *dest is offered as the expected value until the CAS reports that same
- *  value back.  The value seen by the final CAS is returned as T.
- */
-template< typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_exchange( volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
-                                  , const T & >::type val )
-{
-  // Integer type with the same size as T.
-  typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type word_t ;
-
-  // Copy the new value up front so the loop always stores the same bits.
-  const word_t desired = *((word_t*)&val);
-
-  word_t expected ;
-
-#ifdef KOKKOS_HAVE_CXX11
-  union Overlay {
-    T val_T ;
-    word_t val_word ;
-    KOKKOS_INLINE_FUNCTION Overlay() {};
-  } seen ;
-#else
-  union { T val_T ; word_t val_word ; } seen ;
-#endif
-
-  seen.val_T = *dest ;
-
-  for (;;) {
-    expected = seen.val_word ;
-    seen.val_word = __sync_val_compare_and_swap( (volatile word_t *) dest , expected , desired );
-    if ( expected == seen.val_word ) { break ; }
-  }
-
-  return seen.val_T ;
-}
-
-/** \brief  Atomic exchange for any type the size of Impl::cas128_t.
- *
- *  Implemented as a retry loop around Impl::cas128: the last value seen in
- *  *dest is offered as the expected value until cas128 returns a value for
- *  which operator != against the expected one is false.  The value seen by
- *  the final cas128 is returned as T.
- */
-template< typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_exchange( volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
-                                  , const T & >::type val )
-{
-#ifdef KOKKOS_HAVE_CXX11
-  union Overlay {
-    Impl::cas128_t bits ;
-    T value ;
-    KOKKOS_INLINE_FUNCTION Overlay() {};
-  } expected , seen , desired ;
-#else
-  union Overlay {
-    Impl::cas128_t bits ;
-    T value ;
-  } expected , seen , desired ;
-#endif
-
-  seen.value    = *dest ;
-  desired.value = val;
-
-  do {
-    expected.bits = seen.bits ;
-    seen.bits = Impl::cas128( (volatile Impl::cas128_t*) dest , expected.bits , desired.bits );
-  } while ( expected.bits != seen.bits );
-
-  return seen.value ;
-}
-
-/** \brief  Atomic assignment for any type the size of int or long.
- *
- *  Same compare-and-swap retry loop as the exchange of this size, except
- *  that nothing is returned.
- */
-template< typename T >
-KOKKOS_INLINE_FUNCTION
-void atomic_assign( volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
-                                  , const T & >::type val )
-{
-  // Integer type with the same size as T.
-  typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type word_t ;
-
-  // Copy the new value up front so the loop always stores the same bits.
-  const word_t desired = *((word_t*)&val);
-
-  word_t expected ;
-
-#ifdef KOKKOS_HAVE_CXX11
-  union Overlay {
-    T val_T ;
-    word_t val_word ;
-    KOKKOS_INLINE_FUNCTION Overlay() {};
-  } seen ;
-#else
-  union { T val_T ; word_t val_word ; } seen ;
-#endif
-
-  seen.val_T = *dest ;
-
-  for (;;) {
-    expected = seen.val_word ;
-    seen.val_word = __sync_val_compare_and_swap( (volatile word_t *) dest , expected , desired );
-    if ( expected == seen.val_word ) { break ; }
-  }
-}
-
-/** \brief  Atomic assignment for any type the size of Impl::cas128_t.
- *
- *  Retry loop around Impl::cas128 that stores 'val' into *dest; nothing is
- *  returned.
- */
-template< typename T >
-KOKKOS_INLINE_FUNCTION
-void atomic_assign( volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
-                                  , const T & >::type val )
-{
-#ifdef KOKKOS_HAVE_CXX11
-  union Overlay {
-    Impl::cas128_t bits ;
-    T value ;
-    KOKKOS_INLINE_FUNCTION Overlay() {};
-  } expected , seen , desired ;
-#else
-  union Overlay {
-    Impl::cas128_t bits ;
-    T value ;
-  } expected , seen , desired ;
-#endif
-
-  seen.value    = *dest ;
-  desired.value = val;
-
-  do {
-    expected.bits = seen.bits ;
-    seen.bits = Impl::cas128( (volatile Impl::cas128_t*) dest , expected.bits , desired.bits );
-  } while ( expected.bits != seen.bits );
-}
-//----------------------------------------------------------------------------
-
-#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
-
-/** \brief  Exchange implemented with an OpenMP critical section: reads
- *          dest[0], overwrites it with 'val', and returns the value read. */
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_exchange( volatile T * const dest , const T val )
-{
-  T previous ;
-  #pragma omp critical
-  {
-    previous = dest[0];
-    dest[0]  = val;
-  }
-  return previous ;
-}
-
-/** \brief  Assignment implemented with an OpenMP critical section: stores
- *          'val' into dest[0]. */
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-void atomic_assign( volatile T * const dest , const T val )
-{
-  #pragma omp critical
-  { dest[0] = val; }
-}
-
-#endif
-
-//----------------------------------------------------------------------------
-
-} // namespace Kokkos
-
-#endif
-
-//----------------------------------------------------------------------------
-
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
@ -1,297 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
-#define KOKKOS_ATOMIC_FETCH_ADD_HPP
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_ATOMICS_USE_CUDA )
-
-// Support for int, unsigned int, unsigned long long int, and float
-
-/** \brief  Fetch-and-add on an int; forwards to CUDA atomicAdd and returns
- *          its result. */
-__inline__ __device__
-int atomic_fetch_add( volatile int * const dest , const int val )
-{
-  int * const target = (int*) dest ; // atomicAdd is called without 'volatile'
-  return atomicAdd( target , val );
-}
-
-/** \brief  Fetch-and-add on an unsigned int; forwards to CUDA atomicAdd and
- *          returns its result. */
-__inline__ __device__
-unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
-{
-  unsigned int * const target = (unsigned int*) dest ; // atomicAdd is called without 'volatile'
-  return atomicAdd( target , val );
-}
-
-/** \brief  Fetch-and-add on an unsigned long long int; forwards to CUDA
- *          atomicAdd and returns its result. */
-__inline__ __device__
-unsigned long long int atomic_fetch_add( volatile unsigned long long int * const dest ,
-                                         const unsigned long long int val )
-{
-  unsigned long long int * const target = (unsigned long long int*) dest ; // atomicAdd is called without 'volatile'
-  return atomicAdd( target , val );
-}
-
-/** \brief  Fetch-and-add on a float; forwards to CUDA atomicAdd and returns
- *          its result. */
-__inline__ __device__
-float atomic_fetch_add( volatile float * const dest , const float val )
-{
-  float * const target = (float*) dest ; // atomicAdd is called without 'volatile'
-  return atomicAdd( target , val );
-}
-
-/** \brief  Fetch-and-add for any type whose size equals sizeof(int).
- *
- *  Implemented as an atomicCAS retry loop: the sum is computed from the
- *  last value seen in *dest and installed only if *dest still holds that
- *  value; otherwise the loop retries with the value atomicCAS returned.
- *  The value seen by the final atomicCAS is returned as T.
- */
-template < typename T >
-__inline__ __device__
-T atomic_fetch_add( volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
-{
-#ifdef KOKKOS_HAVE_CXX11
-  union U {
-    int i ;
-    T t ;
-    KOKKOS_INLINE_FUNCTION U() {};
-  } assume , oldval , newval ;
-#else
-  union U {
-    int i ;
-    T t ;
-  } assume , oldval , newval ;
-#endif
-
-  oldval.t = *dest ;
-
-  do {
-    assume.i = oldval.i ;
-    newval.t = assume.t + val ;
-    oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
-  } while ( assume.i != oldval.i ); // was 'assumed.i', an undeclared identifier
-
-  return oldval.t ;
-}
-
-/** \brief  Fetch-and-add for any type whose size equals
- *          sizeof(unsigned long long int) but not sizeof(int).
- *
- *  Implemented as an atomicCAS retry loop: the sum is computed from the
- *  last value seen in *dest and installed only if *dest still holds that
- *  value; otherwise the loop retries with the value atomicCAS returned.
- *  The value seen by the final atomicCAS is returned as T.
- */
-template < typename T >
-__inline__ __device__
-T atomic_fetch_add( volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
-{
-#ifdef KOKKOS_HAVE_CXX11
-  union Overlay {
-    unsigned long long int bits ;
-    T value ;
-    KOKKOS_INLINE_FUNCTION Overlay() {};
-  } expected , seen , updated ;
-#else
-  union Overlay {
-    unsigned long long int bits ;
-    T value ;
-  } expected , seen , updated ;
-#endif
-
-  seen.value = *dest ;
-
-  for (;;) {
-    expected.bits = seen.bits ;
-    updated.value = expected.value + val ;
-    seen.bits = atomicCAS( (unsigned long long int*)dest , expected.bits , updated.bits );
-    if ( expected.bits == seen.bits ) { break ; }
-  }
-
-  return seen.value ;
-}
-
-/** \brief  Overload selected for types the size of Impl::cas128_t.
- *
- *  It performs no addition: it calls Kokkos::abort with an explanatory
- *  message and then returns a default-constructed T.
- */
-template < typename T >
-__inline__ __device__
-T atomic_fetch_add( volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) != sizeof(unsigned long long int) &&
-                                    sizeof(T) == sizeof(Impl::cas128_t), const T >::type val )
-{
-  // The arguments are intentionally unused.
-  (void) dest ;
-  (void) val ;
-
-  Kokkos::abort("Error: calling atomic_fetch_add with 128bit type is not supported on CUDA execution space.");
-  return T();
-}
-
-//----------------------------------------------------------------------------
-
-#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
-
-/** \brief  Fetch-and-add on an int; returns the result of the
- *          __sync_fetch_and_add builtin. */
-KOKKOS_INLINE_FUNCTION
-int atomic_fetch_add( volatile int * const dest , const int val )
-{
-  const int previous = __sync_fetch_and_add( dest , val );
-  return previous ;
-}
-
-/** \brief  Fetch-and-add on a long int; returns the result of the
- *          __sync_fetch_and_add builtin. */
-KOKKOS_INLINE_FUNCTION
-long int atomic_fetch_add( volatile long int * const dest , const long int val )
-{
-  const long int previous = __sync_fetch_and_add( dest , val );
-  return previous ;
-}
-
-#if defined( KOKKOS_ATOMICS_USE_GCC )
-
-/** \brief  Fetch-and-add on an unsigned int; returns the result of the
- *          __sync_fetch_and_add builtin. */
-KOKKOS_INLINE_FUNCTION
-unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
-{
-  const unsigned int previous = __sync_fetch_and_add( dest , val );
-  return previous ;
-}
-
-/** \brief  Fetch-and-add on an unsigned long int; returns the result of the
- *          __sync_fetch_and_add builtin. */
-KOKKOS_INLINE_FUNCTION
-unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val )
-{
-  const unsigned long int previous = __sync_fetch_and_add( dest , val );
-  return previous ;
-}
-
-#endif
-
-/** \brief  Fetch-and-add for any type whose size equals sizeof(int).
- *
- *  Implemented as a __sync_val_compare_and_swap retry loop: the sum is
- *  computed from the last value seen in *dest and the loop repeats until
- *  the CAS reports back the value that was expected.  The value seen by
- *  the final CAS is returned as T.
- */
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_add( volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
-{
-#ifdef KOKKOS_HAVE_CXX11
-  union Overlay {
-    int bits ;
-    T value ;
-    KOKKOS_INLINE_FUNCTION Overlay() {};
-  } expected , seen , updated ;
-#else
-  union Overlay {
-    int bits ;
-    T value ;
-  } expected , seen , updated ;
-#endif
-
-  seen.value = *dest ;
-
-  for (;;) {
-    expected.bits = seen.bits ;
-    updated.value = expected.value + val ;
-    seen.bits = __sync_val_compare_and_swap( (int*) dest , expected.bits , updated.bits );
-    if ( expected.bits == seen.bits ) { break ; }
-  }
-
-  return seen.value ;
-}
-
-/** \brief  Fetch-and-add for any type whose size equals sizeof(long) but
- *          not sizeof(int).
- *
- *  Implemented as a __sync_val_compare_and_swap retry loop: the sum is
- *  computed from the last value seen in *dest and the loop repeats until
- *  the CAS reports back the value that was expected.  The value seen by
- *  the final CAS is returned as T.
- */
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_add( volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) == sizeof(long) , const T >::type val )
-{
-#ifdef KOKKOS_HAVE_CXX11
-  union Overlay {
-    long bits ;
-    T value ;
-    KOKKOS_INLINE_FUNCTION Overlay() {};
-  } expected , seen , updated ;
-#else
-  union Overlay {
-    long bits ;
-    T value ;
-  } expected , seen , updated ;
-#endif
-
-  seen.value = *dest ;
-
-  for (;;) {
-    expected.bits = seen.bits ;
-    updated.value = expected.value + val ;
-    seen.bits = __sync_val_compare_and_swap( (long*) dest , expected.bits , updated.bits );
-    if ( expected.bits == seen.bits ) { break ; }
-  }
-
-  return seen.value ;
-}
-
-/** \brief  Fetch-and-add for any type whose size equals
- *          sizeof(Impl::cas128_t) and neither sizeof(int) nor sizeof(long).
- *
- *  Implemented as a retry loop around Impl::cas128: the sum is computed
- *  from the last value seen in *dest and the loop repeats while operator !=
- *  reports that cas128 returned something other than the expected value.
- *  The value seen by the final cas128 is returned as T.
- */
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_add( volatile T * const dest ,
-  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) != sizeof(long) &&
-                                    sizeof(T) == sizeof(Impl::cas128_t) , const T >::type val )
-{
-#ifdef KOKKOS_HAVE_CXX11
-  union Overlay {
-    Impl::cas128_t bits ;
-    T value ;
-    KOKKOS_INLINE_FUNCTION Overlay() {};
-  } expected , seen , updated ;
-#else
-  union Overlay {
-    Impl::cas128_t bits ;
-    T value ;
-  } expected , seen , updated ;
-#endif
-
-  seen.value = *dest ;
-
-  do {
-    expected.bits = seen.bits ;
-    updated.value = expected.value + val ;
-    seen.bits = Impl::cas128( (volatile Impl::cas128_t*) dest , expected.bits , updated.bits );
-  } while ( expected.bits != seen.bits );
-
-  return seen.value ;
-}
-//----------------------------------------------------------------------------
-
-#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
-
-/** \brief  Fetch-and-add using the OpenMP 'atomic capture' construct:
- *          captures dest[0], adds 'val' to it, and returns the captured
- *          value.
- *
- *  Annotated with KOKKOS_INLINE_FUNCTION like the other atomic functions
- *  in this header, including atomic_add which calls it.
- */
-template< typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_add( volatile T * const dest , const T val )
-{
-  T retval;
-#pragma omp atomic capture
-  {
-    retval = dest[0];
-    dest[0] += val;
-  }
-  return retval;
-}
-
-#endif
-
-//----------------------------------------------------------------------------
-
-// Atomic add that discards the fetched value: forwards to atomic_fetch_add.
-template <typename T>
-KOKKOS_INLINE_FUNCTION
-void atomic_add(volatile T * const dest, const T src)
-{
-  (void) atomic_fetch_add( dest , src );
-}
-
-// Atomic increment: adds one to *a through atomic_fetch_add.
-// The increment is passed as T(1) rather than the int literal 1 so that
-// atomic_fetch_add overloads of the form (volatile T*, const T) deduce the
-// same T from both arguments.
-template<typename T>
-KOKKOS_INLINE_FUNCTION
-void atomic_increment(volatile T* a) {
-  Kokkos::atomic_fetch_add(a,T(1));
-}
-
-// Atomic decrement: adds minus one to *a through atomic_fetch_add.
-// The addend is passed as T(-1) rather than the int literal -1 so that
-// atomic_fetch_add overloads of the form (volatile T*, const T) deduce the
-// same T from both arguments.
-template<typename T>
-KOKKOS_INLINE_FUNCTION
-void atomic_decrement(volatile T* a) {
-  Kokkos::atomic_fetch_add(a,T(-1));
-}
-
-}
-#endif
-
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
@ -1,125 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
-#define KOKKOS_ATOMIC_FETCH_AND_HPP
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_ATOMICS_USE_CUDA )
-
-// Support for int, unsigned int, and (when __CUDA_ARCH__ >= 350) unsigned long long int
-
-// Bitwise AND of *dest with val through the CUDA atomicAnd intrinsic;
-// returns whatever atomicAnd returns (per the CUDA documentation, the value
-// stored at the address before the operation).
-// The C-style cast removes the volatile qualifier before the call.
-__inline__ __device__
-int atomic_fetch_and( volatile int * const dest , const int val )
-{ return atomicAnd((int*)dest,val); }
-
-// Same as above for unsigned int.
-__inline__ __device__
-unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
-{ return atomicAnd((unsigned int*)dest,val); }
-
-// 64-bit overload: only compiled in device passes with __CUDA_ARCH__ >= 350.
-#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
-__inline__ __device__
-unsigned long long int atomic_fetch_and( volatile unsigned long long int * const dest ,
-                                         const unsigned long long int val )
-{ return atomicAnd((unsigned long long int*)dest,val); }
-#endif
-
-//----------------------------------------------------------------------------
-
-#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
-
-// Host overloads built on the __sync_fetch_and_and builtin, which performs
-// *dest &= val atomically and (per the GCC documentation) returns the value
-// that was previously in memory.
-KOKKOS_INLINE_FUNCTION
-int atomic_fetch_and( volatile int * const dest , const int val )
-{ return __sync_fetch_and_and(dest,val); }
-
-KOKKOS_INLINE_FUNCTION
-long int atomic_fetch_and( volatile long int * const dest , const long int val )
-{ return __sync_fetch_and_and(dest,val); }
-
-// The unsigned overloads are only provided when KOKKOS_ATOMICS_USE_GCC is
-// defined (not for the KOKKOS_ATOMICS_USE_INTEL configuration).
-#if defined( KOKKOS_ATOMICS_USE_GCC )
-
-KOKKOS_INLINE_FUNCTION
-unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
-{ return __sync_fetch_and_and(dest,val); }
-
-KOKKOS_INLINE_FUNCTION
-unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val )
-{ return __sync_fetch_and_and(dest,val); }
-
-#endif
-
-//----------------------------------------------------------------------------
-
-#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
-
-// OpenMP 3.1 fallback: fetch-and-AND expressed with an "atomic capture"
-// construct.  Returns the value read from dest[0] inside the captured block,
-// i.e. before val is ANDed in.
-template< typename T >
-T atomic_fetch_and( volatile T * const dest , const T val )
-{
-  T retval;
-  // Read first, update second: both statements belong to one capture region.
-#pragma omp atomic capture
-  {
-    retval = dest[0];
-    dest[0] &= val;
-  }
-  return retval;
-}
-
-#endif
-
-//----------------------------------------------------------------------------
-
-// Simpler version of atomic_fetch_and without the fetch
-template <typename T>
-KOKKOS_INLINE_FUNCTION
-void atomic_and(volatile T * const dest, const T src)
-{
-  // Run the fetching variant and deliberately ignore what it returns.
-  static_cast<void>( atomic_fetch_and( dest , src ) );
-}
-
-}
-
-#endif
-
-
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
@ -1,125 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
-#define KOKKOS_ATOMIC_FETCH_OR_HPP
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_ATOMICS_USE_CUDA )
-
-// Support for int, unsigned int, and (when __CUDA_ARCH__ >= 350) unsigned long long int
-
-// Bitwise OR of *dest with val through the CUDA atomicOr intrinsic;
-// returns whatever atomicOr returns (per the CUDA documentation, the value
-// stored at the address before the operation).
-// The C-style cast removes the volatile qualifier before the call.
-__inline__ __device__
-int atomic_fetch_or( volatile int * const dest , const int val )
-{ return atomicOr((int*)dest,val); }
-
-// Same as above for unsigned int.
-__inline__ __device__
-unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
-{ return atomicOr((unsigned int*)dest,val); }
-
-// 64-bit overload: only compiled in device passes with __CUDA_ARCH__ >= 350.
-#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
-__inline__ __device__
-unsigned long long int atomic_fetch_or( volatile unsigned long long int * const dest ,
-                                         const unsigned long long int val )
-{ return atomicOr((unsigned long long int*)dest,val); }
-#endif
-
-//----------------------------------------------------------------------------
-
-#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
-
-// Host overloads built on the __sync_fetch_and_or builtin, which performs
-// *dest |= val atomically and (per the GCC documentation) returns the value
-// that was previously in memory.
-KOKKOS_INLINE_FUNCTION
-int atomic_fetch_or( volatile int * const dest , const int val )
-{ return __sync_fetch_and_or(dest,val); }
-
-KOKKOS_INLINE_FUNCTION
-long int atomic_fetch_or( volatile long int * const dest , const long int val )
-{ return __sync_fetch_and_or(dest,val); }
-
-// The unsigned overloads are only provided when KOKKOS_ATOMICS_USE_GCC is
-// defined (not for the KOKKOS_ATOMICS_USE_INTEL configuration).
-#if defined( KOKKOS_ATOMICS_USE_GCC )
-
-KOKKOS_INLINE_FUNCTION
-unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
-{ return __sync_fetch_and_or(dest,val); }
-
-KOKKOS_INLINE_FUNCTION
-unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val )
-{ return __sync_fetch_and_or(dest,val); }
-
-#endif
-
-//----------------------------------------------------------------------------
-
-#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
-
-// OpenMP 3.1 fallback: fetch-and-OR expressed with an "atomic capture"
-// construct.  Returns the value read from dest[0] inside the captured block,
-// i.e. before val is ORed in.
-template< typename T >
-T atomic_fetch_or( volatile T * const dest , const T val )
-{
-  T retval;
-  // Read first, update second: both statements belong to one capture region.
-#pragma omp atomic capture
-  {
-    retval = dest[0];
-    dest[0] |= val;
-  }
-  return retval;
-}
-
-#endif
-
-//----------------------------------------------------------------------------
-
-// Simpler version of atomic_fetch_or without the fetch
-template <typename T>
-KOKKOS_INLINE_FUNCTION
-void atomic_or(volatile T * const dest, const T src)
-{
-  // Run the fetching variant and deliberately ignore what it returns.
-  static_cast<void>( atomic_fetch_or( dest , src ) );
-}
-
-}
-
-#endif
-
-
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
@ -1,383 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_GENERIC_HPP )
-#define KOKKOS_ATOMIC_GENERIC_HPP
-#include <Kokkos_Macros.hpp>
-
-// Combining operators to be used in a compare-and-exchange based atomic operation
-namespace Kokkos {
-namespace Impl {
-
-// Each functor below exposes a static apply(val1, val2) that combines its two
-// operands with one built-in operator and returns the result as Scalar1.
-
-// apply(a, b) == a + b
-template<class Scalar1, class Scalar2>
-struct AddOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1+val2;
-  }
-};
-
-// apply(a, b) == a - b
-template<class Scalar1, class Scalar2>
-struct SubOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1-val2;
-  }
-};
-
-// apply(a, b) == a * b
-template<class Scalar1, class Scalar2>
-struct MulOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1*val2;
-  }
-};
-
-// apply(a, b) == a / b
-template<class Scalar1, class Scalar2>
-struct DivOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1/val2;
-  }
-};
-
-// apply(a, b) == a % b
-template<class Scalar1, class Scalar2>
-struct ModOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1%val2;
-  }
-};
-
-// apply(a, b) == a & b  (bitwise)
-template<class Scalar1, class Scalar2>
-struct AndOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1&val2;
-  }
-};
-
-// apply(a, b) == a | b  (bitwise)
-template<class Scalar1, class Scalar2>
-struct OrOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1|val2;
-  }
-};
-
-// apply(a, b) == a ^ b  (bitwise)
-template<class Scalar1, class Scalar2>
-struct XorOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1^val2;
-  }
-};
-
-// apply(a, b) == a << b
-template<class Scalar1, class Scalar2>
-struct LShiftOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1<<val2;
-  }
-};
-
-// apply(a, b) == a >> b
-template<class Scalar1, class Scalar2>
-struct RShiftOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1>>val2;
-  }
-};
-
-// Generic fetch-then-operate for types T with
-// sizeof(T) == sizeof(unsigned long long int) (and != sizeof(int)).
-// Repeatedly computes Oper::apply(current, val) and tries to install the
-// result with atomic_compare_exchange until the exchange reports the value
-// that was assumed.  Returns the value the operation was applied to
-// (the contents of *dest before the update).
-// The 'op' argument is never read; only its type Oper selects the operation.
-template < class Oper, typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
-  typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
-{
-  // Same bits viewed as T (to apply the operation) or as an integer
-  // (to hand to atomic_compare_exchange).
-  union { unsigned long long int i ; T t ; } oldval , assume , newval ;
-
-  oldval.t = *dest ;
-
-  do {
-    assume.i = oldval.i ;
-    newval.t = Oper::apply(assume.t, val) ;
-    oldval.i = ::Kokkos::atomic_compare_exchange( (unsigned long long int*)dest , assume.i , newval.i );
-  } while ( assume.i != oldval.i ); // retry when the exchange saw a different value than assumed
-
-  return oldval.t ;
-}
-
-// Generic operate-then-fetch for types T with
-// sizeof(T) == sizeof(unsigned long long int) (and != sizeof(int)).
-// Uses the same compare-and-exchange retry loop as atomic_fetch_oper, but
-// returns the newly computed value (newval) instead of the previous one.
-// The 'op' argument is never read; only its type Oper selects the operation.
-template < class Oper, typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
-  typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
-                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
-{
-  // Same bits viewed as T (to apply the operation) or as an integer
-  // (to hand to atomic_compare_exchange).
-  union { unsigned long long int i ; T t ; } oldval , assume , newval ;
-
-  oldval.t = *dest ;
-
-  do {
-    assume.i = oldval.i ;
-    newval.t = Oper::apply(assume.t, val) ;
-    oldval.i = ::Kokkos::atomic_compare_exchange( (unsigned long long int*)dest , assume.i , newval.i );
-  } while ( assume.i != oldval.i ); // retry when the exchange saw a different value than assumed
-
-  return newval.t ;
-}
-
-// Generic fetch-then-operate for types T with sizeof(T) == sizeof(int).
-// Repeatedly computes Oper::apply(current, val) and tries to install the
-// result with atomic_compare_exchange until the exchange reports the value
-// that was assumed.  Returns the value the operation was applied to
-// (the contents of *dest before the update).
-// The 'op' argument is never read; only its type Oper selects the operation.
-template < class Oper, typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
-  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
-{
-  // Same bits viewed as T (to apply the operation) or as an int
-  // (to hand to atomic_compare_exchange).
-  union { int i ; T t ; } oldval , assume , newval ;
-
-  oldval.t = *dest ;
-
-  do {
-    assume.i = oldval.i ;
-    newval.t = Oper::apply(assume.t, val) ;
-    oldval.i = ::Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
-  } while ( assume.i != oldval.i ); // retry when the exchange saw a different value than assumed
-
-  return oldval.t ;
-}
-
-// Generic operate-then-fetch for types T with sizeof(T) == sizeof(int).
-// Uses the same compare-and-exchange retry loop as atomic_fetch_oper, but
-// returns the newly computed value (newval) instead of the previous one.
-// The 'op' argument is never read; only its type Oper selects the operation.
-template < class Oper, typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
-  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int), const T >::type val )
-{
-  // Same bits viewed as T (to apply the operation) or as an int
-  // (to hand to atomic_compare_exchange).
-  union { int i ; T t ; } oldval , assume , newval ;
-
-  oldval.t = *dest ;
-
-  do {
-    assume.i = oldval.i ;
-    newval.t = Oper::apply(assume.t, val) ;
-    oldval.i = ::Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
-  } while ( assume.i != oldval.i ); // retry when the exchange saw a different value than assumed
-
-  return newval.t ;
-}
-
-/*template < class Oper, typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
-  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(short) , const T >::type val )
-{
-  union { short i ; T t ; } oldval , assume , newval ;
-
-  oldval.t = *dest ;
-
-  do {
-    assume.i = oldval.i ;
-    newval.t = Oper::apply(assume.t, val) ;
-    oldval.i = ::Kokkos::atomic_compare_exchange( (short*)dest , assume.i , newval.i );
-  } while ( assume.i != oldval.i );
-
-  return oldval.t ;
-}
-
-template < class Oper, typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
-  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(short), const T >::type val )
-{
-  union { short i ; T t ; } oldval , assume , newval ;
-
-  oldval.t = *dest ;
-
-  do {
-    assume.i = oldval.i ;
-    newval.t = Oper::apply(assume.t, val) ;
-    oldval.i = ::Kokkos::atomic_compare_exchange( (short*)dest , assume.i , newval.i );
-  } while ( assume.i != oldval.i );
-
-  return newval.t ;
-}
-
-template < class Oper, typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
-  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(char) , const T >::type val )
-{
-  union { char i ; T t ; } oldval , assume , newval ;
-
-  oldval.t = *dest ;
-
-  do {
-    assume.i = oldval.i ;
-    newval.t = Oper::apply(assume.t, val) ;
-    oldval.i = ::Kokkos::atomic_compare_exchange( (char*)dest , assume.i , newval.i );
-  } while ( assume.i != oldval.i );
-
-  return oldval.t ;
-}
-
-template < class Oper, typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
-  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(char), const T >::type val )
-{
-  union { char i ; T t ; } oldval , assume , newval ;
-
-  oldval.t = *dest ;
-
-  do {
-    assume.i = oldval.i ;
-    newval.t = Oper::apply(assume.t, val) ;
-    oldval.i = ::Kokkos::atomic_compare_exchange( (char*)dest , assume.i , newval.i );
-  } while ( assume.i != oldval.i );
-
-  return newval.t ;
-}*/
-
-}
-}
-
-namespace Kokkos {
-
-// Fetch_Oper atomics: return value before operation
-// Every wrapper in this group forwards to Impl::atomic_fetch_oper with the
-// matching Impl::*Oper functor and returns what atomic_fetch_oper returns.
-
-// *dest = *dest * val
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_mul(volatile T * const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::MulOper<T,const T>(),dest,val);
-}
-
-// *dest = *dest / val
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_div(volatile T * const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::DivOper<T,const T>(),dest,val);
-}
-
-// *dest = *dest % val
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_mod(volatile T * const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::ModOper<T,const T>(),dest,val);
-}
-
-// *dest = *dest & val
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_and(volatile T * const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::AndOper<T,const T>(),dest,val);
-}
-
-// *dest = *dest | val
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_or(volatile T * const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::OrOper<T,const T>(),dest,val);
-}
-
-// *dest = *dest ^ val
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_xor(volatile T * const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::XorOper<T,const T>(),dest,val);
-}
-
-// *dest = *dest << val.  The shift count is an unsigned int here, but it is
-// received by atomic_fetch_oper through a parameter of type const T.
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_lshift(volatile T * const dest, const unsigned int val) {
-  return Impl::atomic_fetch_oper(Impl::LShiftOper<T,const unsigned int>(),dest,val);
-}
-
-// *dest = *dest >> val.  The shift count is an unsigned int here, but it is
-// received by atomic_fetch_oper through a parameter of type const T.
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {
-  return Impl::atomic_fetch_oper(Impl::RShiftOper<T,const unsigned int>(),dest,val);
-}
-
-
-// Oper Fetch atomics: return value after operation
-// Every wrapper in this group forwards to Impl::atomic_oper_fetch with the
-// matching Impl::*Oper functor and returns what atomic_oper_fetch returns.
-
-// *dest = *dest * val
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_mul_fetch(volatile T * const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::MulOper<T,const T>(),dest,val);
-}
-
-// *dest = *dest / val
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_div_fetch(volatile T * const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::DivOper<T,const T>(),dest,val);
-}
-
-// *dest = *dest % val
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_mod_fetch(volatile T * const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::ModOper<T,const T>(),dest,val);
-}
-
-// *dest = *dest & val
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_and_fetch(volatile T * const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::AndOper<T,const T>(),dest,val);
-}
-
-// *dest = *dest | val
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_or_fetch(volatile T * const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::OrOper<T,const T>(),dest,val);
-}
-
-// *dest = *dest ^ val
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_xor_fetch(volatile T * const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::XorOper<T,const T>(),dest,val);
-}
-
-// *dest = *dest << val.  The shift count is an unsigned int here, but it is
-// received by atomic_oper_fetch through a parameter of type const T.
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_lshift_fetch(volatile T * const dest, const unsigned int val) {
-  return Impl::atomic_oper_fetch(Impl::LShiftOper<T,const unsigned int>(),dest,val);
-}
-
-// *dest = *dest >> val.  The shift count is an unsigned int here, but it is
-// received by atomic_oper_fetch through a parameter of type const T.
-template < typename T >
-KOKKOS_INLINE_FUNCTION
-T atomic_rshift_fetch(volatile T * const dest, const unsigned int val) {
-  return Impl::atomic_oper_fetch(Impl::RShiftOper<T,const unsigned int>(),dest,val);
-}
-
-
-}
-#endif
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
@ -1,448 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-#ifndef KOKKOS_ATOMIC_VIEW_HPP
-#define KOKKOS_ATOMIC_VIEW_HPP
-
-#include <Kokkos_Macros.hpp>
-#include <Kokkos_Atomic.hpp>
-namespace Kokkos {
-namespace Impl {
-
-//The following tag is used to prevent an implicit call of the constructor when trying
-//to assign a literal 0 int ( = 0 );
-struct AtomicViewConstTag {};
-
-template<class ViewTraits>
-class AtomicDataElement {
-public:
-  typedef typename ViewTraits::value_type value_type;
-  typedef typename ViewTraits::const_value_type const_value_type;
-  typedef typename ViewTraits::non_const_value_type non_const_value_type;
-  volatile value_type* const ptr;
-
-  KOKKOS_INLINE_FUNCTION
-  AtomicDataElement(value_type* ptr_, AtomicViewConstTag ):ptr(ptr_){}
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator = (const_value_type& val) const {
-    *ptr = val;
-    return val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator = (volatile const_value_type& val) const {
-    *ptr = val;
-    return val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void inc() const {
-    Kokkos::atomic_increment(ptr);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void dec() const {
-    Kokkos::atomic_decrement(ptr);
-  }
-
-  // Pre-increment: atomically adds one and returns the incremented value
-  // (fetched value plus one).  The step is converted to value_type so that it
-  // has the same type as the pointee; a bare int literal fails template
-  // deduction with atomic_fetch_add overloads that deduce T from both
-  // arguments when value_type is not int.
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator ++ () const {
-    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,value_type(1));
-    return tmp+1;
-  }
-
-  // Pre-decrement: atomically adds minus one and returns the decremented value.
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator -- () const {
-    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,value_type(-1));
-    return tmp-1;
-  }
-
-  // Post-increment: atomically adds one and returns what atomic_fetch_add
-  // returned (the value before the increment).
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator ++ (int) const {
-    return Kokkos::atomic_fetch_add(ptr,value_type(1));
-  }
-
-  // Post-decrement: atomically adds minus one and returns what
-  // atomic_fetch_add returned (the value before the decrement).
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator -- (int) const {
-    return Kokkos::atomic_fetch_add(ptr,value_type(-1));
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator += (const_value_type& val) const {
-    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
-    return tmp+val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator += (volatile const_value_type& val) const {
-    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
-    return tmp+val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator -= (const_value_type& val) const {
-    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val);
-    return tmp-val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator -= (volatile const_value_type& val) const {
-    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val);
-    return tmp-val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator *= (const_value_type& val) const {
-    return Kokkos::atomic_mul_fetch(ptr,val);
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator *= (volatile const_value_type& val) const {
-    return Kokkos::atomic_mul_fetch(ptr,val);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator /= (const_value_type& val) const {
-    return Kokkos::atomic_div_fetch(ptr,val);
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator /= (volatile const_value_type& val) const {
-    return Kokkos::atomic_div_fetch(ptr,val);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator %= (const_value_type& val) const {
-    return Kokkos::atomic_mod_fetch(ptr,val);
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator %= (volatile const_value_type& val) const {
-    return Kokkos::atomic_mod_fetch(ptr,val);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator &= (const_value_type& val) const {
-    return Kokkos::atomic_and_fetch(ptr,val);
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator &= (volatile const_value_type& val) const {
-    return Kokkos::atomic_and_fetch(ptr,val);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator ^= (const_value_type& val) const {
-    return Kokkos::atomic_xor_fetch(ptr,val);
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator ^= (volatile const_value_type& val) const {
-    return Kokkos::atomic_xor_fetch(ptr,val);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator |= (const_value_type& val) const {
-    return Kokkos::atomic_or_fetch(ptr,val);
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator |= (volatile const_value_type& val) const {
-    return Kokkos::atomic_or_fetch(ptr,val);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator <<= (const_value_type& val) const {
-    return Kokkos::atomic_lshift_fetch(ptr,val);
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator <<= (volatile const_value_type& val) const {
-    return Kokkos::atomic_lshift_fetch(ptr,val);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator >>= (const_value_type& val) const {
-    return Kokkos::atomic_rshift_fetch(ptr,val);
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator >>= (volatile const_value_type& val) const {
-    return Kokkos::atomic_rshift_fetch(ptr,val);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator + (const_value_type& val) const {
-    return *ptr+val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator + (volatile const_value_type& val) const {
-    return *ptr+val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator - (const_value_type& val) const {
-    return *ptr-val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator - (volatile const_value_type& val) const {
-    return *ptr-val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator * (const_value_type& val) const {
-    return *ptr*val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator * (volatile const_value_type& val) const {
-    return *ptr*val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator / (const_value_type& val) const {
-    return *ptr/val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator / (volatile const_value_type& val) const {
-    return *ptr/val;
-  }
-
-  // Remainder of the current element value divided by val.
-  // Plain read of *ptr; the element itself is not modified.
-  // (Previously this computed *ptr ^ val, i.e. XOR instead of modulo.)
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator % (const_value_type& val) const {
-    return *ptr%val;
-  }
-  // Same as above for a volatile right-hand operand.
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator % (volatile const_value_type& val) const {
-    return *ptr%val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator ! () const {
-    return !*ptr;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator && (const_value_type& val) const {
-    return *ptr&&val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator && (volatile const_value_type& val) const {
-    return *ptr&&val;
-  }
-
-  // Logical OR of the current element value with val, mirroring operator &&.
-  // Plain read of *ptr; the element itself is not modified.
-  // (Previously this computed the bitwise *ptr | val.)
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator || (const_value_type& val) const {
-    return *ptr||val;
-  }
-  // Same as above for a volatile right-hand operand.
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator || (volatile const_value_type& val) const {
-    return *ptr||val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator & (const_value_type& val) const {
-    return *ptr&val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator & (volatile const_value_type& val) const {
-    return *ptr&val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator | (const_value_type& val) const {
-    return *ptr|val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator | (volatile const_value_type& val) const {
-    return *ptr|val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator ^ (const_value_type& val) const {
-    return *ptr^val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator ^ (volatile const_value_type& val) const {
-    return *ptr^val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator ~ () const {
-    return ~*ptr;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator << (const unsigned int& val) const {
-    return *ptr<<val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator << (volatile const unsigned int& val) const {
-    return *ptr<<val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator >> (const unsigned int& val) const {
-    return *ptr>>val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  const_value_type operator >> (volatile const unsigned int& val) const {
-    return *ptr>>val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  bool operator == (const_value_type& val) const {
-    return *ptr == val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  bool operator == (volatile const_value_type& val) const {
-    return *ptr == val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  bool operator != (const_value_type& val) const {
-    return *ptr != val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  bool operator != (volatile const_value_type& val) const {
-    return *ptr != val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  bool operator >= (const_value_type& val) const {
-    return *ptr >= val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  bool operator >= (volatile const_value_type& val) const {
-    return *ptr >= val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  bool operator <= (const_value_type& val) const {
-    return *ptr <= val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  bool operator <= (volatile const_value_type& val) const {
-    return *ptr <= val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  bool operator < (const_value_type& val) const {
-    return *ptr < val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  bool operator < (volatile const_value_type& val) const {
-    return *ptr < val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  bool operator > (const_value_type& val) const {
-    return *ptr > val;
-  }
-  KOKKOS_INLINE_FUNCTION
-  bool operator > (volatile const_value_type& val) const {
-    return *ptr > val;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  operator const_value_type () const {
-    //return Kokkos::atomic_load(ptr);
-    return *ptr;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  operator volatile non_const_value_type () volatile const {
-    //return Kokkos::atomic_load(ptr);
-    return *ptr;
-  }
-};
-
-// Thin handle around a raw element pointer.  Indexing it does not yield a
-// reference; it yields an AtomicDataElement proxy bound to element ptr+i.
-template<class ViewTraits>
-class AtomicViewDataHandle {
-public:
-  // Start of the element storage this handle refers to.
-  typename ViewTraits::value_type* ptr;
-
-  KOKKOS_INLINE_FUNCTION
-  AtomicViewDataHandle(typename ViewTraits::value_type* ptr_):ptr(ptr_){}
-
-  // Returns (by value) a proxy for the i-th element; the tag argument selects
-  // the proxy's tagged constructor.
-  template<class iType>
-  KOKKOS_INLINE_FUNCTION
-  AtomicDataElement<ViewTraits> operator[] (const iType& i) const {
-    return AtomicDataElement<ViewTraits>(ptr+i,AtomicViewConstTag());
-  }
-
-
-  // Implicit conversion back to the raw pointer.
-  KOKKOS_INLINE_FUNCTION
-  operator typename ViewTraits::value_type * () const { return ptr ; }
-
-};
-
-// Compile-time size check.  The primary template is declared but never
-// defined; only sizes 4 and 8 are specialized, so asking for ::type with any
-// other Size fails to compile, and the diagnostic names this struct.
-template<unsigned Size>
-struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars;
-
-// 4-byte scalars: ::type is int.
-template<>
-struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<4> {
-  typedef int type;
-};
-
-// 8-byte scalars: ::type is int64_t.
-template<>
-struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
-  typedef int64_t type;
-};
-
-// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics.
-// Specialization selected when the view's value_type differs from its
-// const_value_type (i.e. it is non-const) and memory_traits::Atomic is set.
-// It publishes the atomic handle/proxy types instead of plain pointers and
-// references.
-template<class ViewTraits>
-class ViewDataHandle<
-  ViewTraits ,
-  typename enable_if<
-    ( ! is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) &&
-    ( ViewTraits::memory_traits::Atomic )
-  >::type >
-{
-private:
-//  typedef typename if_c<(sizeof(typename ViewTraits::const_value_type)==4) ||
-//                        (sizeof(typename ViewTraits::const_value_type)==8),
-//                         int, Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars >::type
-//                   atomic_view_possible;
-  // Naming ::type here forces a compile error unless the scalar is 4 or 8 bytes wide.
-  typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type;
-  typedef ViewDataHandle self_type;
-
-public:
-  // Element access returns an AtomicDataElement proxy by value, not a reference.
-  enum {  ReturnTypeIsReference = false };
-
-  typedef Impl::AtomicViewDataHandle<ViewTraits> handle_type;
-  typedef Impl::AtomicDataElement<ViewTraits>    return_type;
-};
-
-}
-}
-
-#endif
--- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
@ -1,441 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Error.hpp>
-#include <cctype>
-#include <cstring>
-#include <iostream>
-#include <cstdlib>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-bool is_unsigned_int(const char* str)
-{
-  const size_t len = strlen (str);
-  for (size_t i = 0; i < len; ++i) {
-    if (! isdigit (str[i])) {
-      return false;
-    }
-  }
-  return true;
-}
-
-void initialize_internal(const InitArguments& args)
-{
-  // Protect declarations, to prevent "unused variable" warnings.
-#if defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD )
-  const int num_threads = args.num_threads;
-  const int use_numa = args.num_numa;
-#endif // defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD )
-#if defined( KOKKOS_HAVE_CUDA )
-  const int use_gpu = args.device_id;
-#endif // defined( KOKKOS_HAVE_CUDA )
-
-#if defined( KOKKOS_HAVE_OPENMP )
-  if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
-      Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
-    if(num_threads>0) {
-      if(use_numa>0) {
-        Kokkos::OpenMP::initialize(num_threads,use_numa);
-      }
-      else {
-        Kokkos::OpenMP::initialize(num_threads);
-      }
-    } else {
-      Kokkos::OpenMP::initialize();
-    }
-    //std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
-  }
-  else {
-    //std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_PTHREAD )
-  if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
-      Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
-    if(num_threads>0) {
-      if(use_numa>0) {
-        Kokkos::Threads::initialize(num_threads,use_numa);
-      }
-      else {
-        Kokkos::Threads::initialize(num_threads);
-      }
-    } else {
-      Kokkos::Threads::initialize();
-    }
-    //std::cout << "Kokkos::initialize() fyi: Pthread enabled and initialized" << std::endl ;
-  }
-  else {
-    //std::cout << "Kokkos::initialize() fyi: Pthread enabled but not initialized" << std::endl ;
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_SERIAL )
-  // Prevent "unused variable" warning for 'args' input struct.  If
-  // Serial::initialize() ever needs to take arguments from the input
-  // struct, you may remove this line of code.
-  (void) args;
-
-  if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
-      Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) {
-    Kokkos::Serial::initialize();
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_CUDA )
-  if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {
-    if (use_gpu > -1) {
-      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( use_gpu ) );
-    }
-    else {
-      Kokkos::Cuda::initialize();
-    }
-    //std::cout << "Kokkos::initialize() fyi: Cuda enabled and initialized" << std::endl ;
-  }
-#endif
-}
-
-void finalize_internal( const bool all_spaces = false )
-{
-
-#if defined( KOKKOS_HAVE_CUDA )
-  if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
-    if(Kokkos::Cuda::is_initialized())
-      Kokkos::Cuda::finalize();
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_OPENMP )
-  if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
-      Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ||
-      all_spaces ) {
-    if(Kokkos::OpenMP::is_initialized())
-      Kokkos::OpenMP::finalize();
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_PTHREAD )
-  if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
-      Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ||
-      all_spaces ) {
-    if(Kokkos::Threads::is_initialized())
-      Kokkos::Threads::finalize();
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_SERIAL )
-  if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
-      Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ||
-      all_spaces ) {
-    if(Kokkos::Serial::is_initialized())
-      Kokkos::Serial::finalize();
-  }
-#endif
-
-}
-
-void fence_internal()
-{
-
-#if defined( KOKKOS_HAVE_CUDA )
-  if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value ) {
-    Kokkos::Cuda::fence();
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_OPENMP )
-  if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
-      Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
-    Kokkos::OpenMP::fence();
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_PTHREAD )
-  if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
-      Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
-    Kokkos::Threads::fence();
-  }
-#endif
-
-#if defined( KOKKOS_HAVE_SERIAL )
-  if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
-      Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) {
-    Kokkos::Serial::fence();
-  }
-#endif
-
-}
-
-} // namespace
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-void initialize(int& narg, char* arg[])
-{
-    int num_threads = -1;
-    int numa = -1;
-    int device = -1;
-
-    int kokkos_threads_found = 0;
-    int kokkos_numa_found = 0;
-    int kokkos_device_found = 0;
-    int kokkos_ndevices_found = 0;
-
-    int iarg = 0;
-
-    while (iarg < narg) {
-      if ((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || (strncmp(arg[iarg],"--threads",9) == 0)) {
-        //Find the number of threads (expecting --threads=XX)
-        if (!((strncmp(arg[iarg],"--kokkos-threads=",17) == 0) || (strncmp(arg[iarg],"--threads=",10) == 0)))
-          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        char* number =  strchr(arg[iarg],'=')+1;
-
-        if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
-          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        if((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || !kokkos_threads_found)
-          num_threads = atoi(number);
-
-        //Remove the --kokkos-threads argument from the list but leave --threads
-        if(strncmp(arg[iarg],"--kokkos-threads",16) == 0) {
-          for(int k=iarg;k<narg-1;k++) {
-            arg[k] = arg[k+1];
-          }
-          kokkos_threads_found=1;
-          narg--;
-        } else {
-          iarg++;
-        }
-      } else if ((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || (strncmp(arg[iarg],"--numa",6) == 0)) {
-        //Find the number of numa (expecting --numa=XX)
-        if (!((strncmp(arg[iarg],"--kokkos-numa=",14) == 0) || (strncmp(arg[iarg],"--numa=",7) == 0)))
-          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        char* number =  strchr(arg[iarg],'=')+1;
-
-        if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
-          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        if((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || !kokkos_numa_found)
-          numa = atoi(number);
-
-        //Remove the --kokkos-numa argument from the list but leave --numa
-        if(strncmp(arg[iarg],"--kokkos-numa",13) == 0) {
-          for(int k=iarg;k<narg-1;k++) {
-            arg[k] = arg[k+1];
-          }
-          kokkos_numa_found=1;
-          narg--;
-        } else {
-          iarg++;
-        }
-      } else if ((strncmp(arg[iarg],"--kokkos-device",15) == 0) || (strncmp(arg[iarg],"--device",8) == 0)) {
-        //Find the number of device (expecting --device=XX)
-        if (!((strncmp(arg[iarg],"--kokkos-device=",16) == 0) || (strncmp(arg[iarg],"--device=",9) == 0)))
-          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        char* number =  strchr(arg[iarg],'=')+1;
-
-        if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
-          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        if((strncmp(arg[iarg],"--kokkos-device",15) == 0) || !kokkos_device_found)
-          device = atoi(number);
-
-        //Remove the --kokkos-device argument from the list but leave --device
-        if(strncmp(arg[iarg],"--kokkos-device",15) == 0) {
-          for(int k=iarg;k<narg-1;k++) {
-            arg[k] = arg[k+1];
-          }
-          kokkos_device_found=1;
-          narg--;
-        } else {
-          iarg++;
-        }
-      } else if ((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || (strncmp(arg[iarg],"--ndevices",10) == 0)) {
-
-        //Find the number of device (expecting --device=XX)
-        if (!((strncmp(arg[iarg],"--kokkos-ndevices=",18) == 0) || (strncmp(arg[iarg],"--ndevices=",11) == 0)))
-          Impl::throw_runtime_exception("Error: expecting an '=INT[,INT]' after command line argument '--ndevices/--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-        int ndevices=-1;
-        int skip_device = 9999;
-
-        char* num1 = strchr(arg[iarg],'=')+1;
-        char* num2 = strpbrk(num1,",");
-        int num1_len = num2==NULL?strlen(num1):num2-num1;
-        char* num1_only = new char[num1_len+1];
-        strncpy(num1_only,num1,num1_len);
-        num1_only[num1_len]=0;
-
-        if(!Impl::is_unsigned_int(num1_only) || (strlen(num1_only)==0)) {
-          Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-        }
-        if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found)
-          ndevices = atoi(num1_only);
-
-        if( num2 != NULL ) {
-          if(( !Impl::is_unsigned_int(num2+1) ) || (strlen(num2)==1) )
-            Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices=XX,'. Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-          if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found)
-            skip_device = atoi(num2+1);
-        }
-
-        if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found) {
-          char *str;
-          if ((str = getenv("SLURM_LOCALID"))) {
-            int local_rank = atoi(str);
-            device = local_rank % ndevices;
-            if (device >= skip_device) device++;
-          }
-          if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {
-            int local_rank = atoi(str);
-            device = local_rank % ndevices;
-            if (device >= skip_device) device++;
-          }
-          if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {
-            int local_rank = atoi(str);
-            device = local_rank % ndevices;
-            if (device >= skip_device) device++;
-          }
-          if(device==-1) {
-            device = 0;
-            if (device >= skip_device) device++;
-          }
-        }
-
-        //Remove the --kokkos-ndevices argument from the list but leave --ndevices
-        if(strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) {
-          for(int k=iarg;k<narg-1;k++) {
-            arg[k] = arg[k+1];
-          }
-          kokkos_ndevices_found=1;
-          narg--;
-        } else {
-          iarg++;
-        }
-      } else if ((strcmp(arg[iarg],"--kokkos-help") == 0) || (strcmp(arg[iarg],"--help") == 0)) {
-         std::cout << std::endl;
-         std::cout << "--------------------------------------------------------------------------------" << std::endl;
-         std::cout << "-------------Kokkos command line arguments--------------------------------------" << std::endl;
-         std::cout << "--------------------------------------------------------------------------------" << std::endl;
-         std::cout << "The following arguments exist also without prefix 'kokkos' (e.g. --help)." << std::endl;
-         std::cout << "The prefixed arguments will be removed from the list by Kokkos::initialize()," << std::endl;
-         std::cout << "the non-prefixed ones are not removed. Prefixed versions take precedence over " << std::endl;
-         std::cout << "non prefixed ones, and the last occurence of an argument overwrites prior" << std::endl;
-         std::cout << "settings." << std::endl;
-         std::cout << std::endl;
-         std::cout << "--kokkos-help               : print this message" << std::endl;
-         std::cout << "--kokkos-threads=INT        : specify total number of threads or" << std::endl;
-         std::cout << "                              number of threads per NUMA region if " << std::endl;
-         std::cout << "                              used in conjunction with '--numa' option. " << std::endl;
-         std::cout << "--kokkos-numa=INT           : specify number of NUMA regions used by process." << std::endl;
-         std::cout << "--kokkos-device=INT         : specify device id to be used by Kokkos. " << std::endl;
-         std::cout << "--kokkos-ndevices=INT[,INT] : used when running MPI jobs. Specify number of" << std::endl;
-         std::cout << "                              devices per node to be used. Process to device" << std::endl;
-         std::cout << "                              mapping happens by obtaining the local MPI rank" << std::endl;
-         std::cout << "                              and assigning devices round-robin. The optional" << std::endl;
-         std::cout << "                              second argument allows for an existing device" << std::endl;
-         std::cout << "                              to be ignored. This is most useful on workstations" << std::endl;
-         std::cout << "                              with multiple GPUs of which one is used to drive" << std::endl;
-         std::cout << "                              screen output." << std::endl;
-         std::cout << std::endl;
-         std::cout << "--------------------------------------------------------------------------------" << std::endl;
-         std::cout << std::endl;
-
-         //Remove the --kokkos-help argument from the list but leave --ndevices
-         if(strcmp(arg[iarg],"--kokkos-help") == 0) {
-           for(int k=iarg;k<narg-1;k++) {
-             arg[k] = arg[k+1];
-           }
-           narg--;
-         } else {
-           iarg++;
-         }
-      } else
-      iarg++;
-    }
-
-    InitArguments arguments;
-    arguments.num_threads = num_threads;
-    arguments.num_numa = numa;
-    arguments.device_id = device;
-    Impl::initialize_internal(arguments);
-}
-
-void initialize(const InitArguments& arguments) {
-  Impl::initialize_internal(arguments);
-}
-
-void finalize()
-{
-  Impl::finalize_internal();
-}
-
-void finalize_all()
-{
-  enum { all_spaces = true };
-  Impl::finalize_internal( all_spaces );
-}
-
-void fence()
-{
-  Impl::fence_internal();
-}
-
-} // namespace Kokkos
-
--- a/lib/kokkos/core/src/impl/Kokkos_CrsArray_factory.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_CrsArray_factory.hpp
@ -1,223 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP
-#define KOKKOS_IMPL_CRSARRAY_FACTORY_HPP
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
-inline
-typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
-create_mirror( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & view )
-{
-  // Force copy:
-  //typedef Impl::ViewAssignment< Impl::ViewDefault > alloc ; // unused
-  typedef CrsArray< DataType , Arg1Type , Arg2Type , SizeType >  crsarray_type ;
-
-  typename crsarray_type::HostMirror               tmp ;
-  typename crsarray_type::row_map_type::HostMirror tmp_row_map = create_mirror( view.row_map );
-
-  tmp.row_map = tmp_row_map ; // Assignment of 'const' from 'non-const'
-  tmp.entries = create_mirror( view.entries );
-
-  // Deep copy:
-  deep_copy( tmp_row_map , view.row_map );
-  deep_copy( tmp.entries , view.entries );
-
-  return tmp ;
-}
-
-template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
-inline
-typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
-create_mirror_view( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & view ,
-                    typename Impl::enable_if< ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 )
-{
-  return view ;
-}
-
-template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
-inline
-typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
-create_mirror_view( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & view ,
-                    typename Impl::enable_if< ! ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 )
-{
-  return create_mirror( view );
-}
-
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-template< class CrsArrayType , class InputSizeType >
-inline
-typename CrsArrayType::crsarray_type
-create_crsarray( const std::string & label ,
-                 const std::vector< InputSizeType > & input )
-{
-  typedef CrsArrayType                  output_type ;
-  //typedef std::vector< InputSizeType >  input_type ; // unused
-
-  typedef typename output_type::entries_type   entries_type ;
-
-  typedef View< typename output_type::size_type [] ,
-                typename output_type::array_layout ,
-                typename output_type::execution_space > work_type ;
-
-  output_type output ;
-
-  // Create the row map:
-
-  const size_t length = input.size();
-
-  {
-    work_type row_work( "tmp" , length + 1 );
-
-    typename work_type::HostMirror row_work_host =
-      create_mirror_view( row_work );
-
-    size_t sum = 0 ;
-    row_work_host[0] = 0 ;
-    for ( size_t i = 0 ; i < length ; ++i ) {
-      row_work_host[i+1] = sum += input[i];
-    }
-
-    deep_copy( row_work , row_work_host );
-
-    output.entries   = entries_type( label , sum );
-    output.row_map   = row_work ;
-  }
-
-  return output ;
-}
-
-//----------------------------------------------------------------------------
-
-template< class CrsArrayType , class InputSizeType >
-inline
-typename CrsArrayType::crsarray_type
-create_crsarray( const std::string & label ,
-                 const std::vector< std::vector< InputSizeType > > & input )
-{
-  typedef CrsArrayType                                output_type ;
-  //typedef std::vector< std::vector< InputSizeType > > input_type ; // unused
-  typedef typename output_type::entries_type          entries_type ;
-  //typedef typename output_type::size_type             size_type ; // unused
-
-  // mfh 14 Feb 2014: This function doesn't actually create instances
-  // of ok_rank, but it needs to declare the typedef in order to do
-  // the static "assert" (a compile-time check that the given shape
-  // has rank 1).  In order to avoid a "declared but unused typedef"
-  // warning, we declare an empty instance of this type, with the
-  // usual "(void)" marker to avoid a compiler warning for the unused
-  // variable.
-
-  typedef typename
-    Impl::assert_shape_is_rank_one< typename entries_type::shape_type >::type
-      ok_rank ;
-  {
-    ok_rank thing;
-    (void) thing;
-  }
-
-  typedef View< typename output_type::size_type [] ,
-                typename output_type::array_layout ,
-                typename output_type::execution_space > work_type ;
-
-  output_type output ;
-
-    // Create the row map:
-
-  const size_t length = input.size();
-
-  {
-    work_type row_work( "tmp" , length + 1 );
-
-    typename work_type::HostMirror row_work_host =
-      create_mirror_view( row_work );
-
-    size_t sum = 0 ;
-    row_work_host[0] = 0 ;
-    for ( size_t i = 0 ; i < length ; ++i ) {
-      row_work_host[i+1] = sum += input[i].size();
-    }
-
-    deep_copy( row_work , row_work_host );
-
-    output.entries   = entries_type( label , sum );
-    output.row_map   = row_work ;
-  }
-
-  // Fill in the entries:
-  {
-    typename entries_type::HostMirror host_entries =
-      create_mirror_view( output.entries );
-
-    size_t sum = 0 ;
-    for ( size_t i = 0 ; i < length ; ++i ) {
-      for ( size_t j = 0 ; j < input[i].size() ; ++j , ++sum ) {
-        host_entries( sum ) = input[i][j] ;
-      }
-    }
-
-    deep_copy( output.entries , host_entries );
-  }
-
-  return output ;
-}
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP */
-
--- a/lib/kokkos/core/src/impl/Kokkos_Error.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Error.cpp
@ -1,195 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-
-#include <ostream>
-#include <sstream>
-#include <iomanip>
-#include <stdexcept>
-#include <impl/Kokkos_Error.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-void host_abort( const char * const message )
-{
-  fwrite(message,1,strlen(message),stderr);
-  fflush(stderr);
-  abort();
-}
-
-void throw_runtime_exception( const std::string & msg )
-{
-  std::ostringstream o ;
-  o << msg ;
-  traceback_callstack( o );
-  throw std::runtime_error( o.str() );
-}
-
-
-std::string human_memory_size(size_t arg_bytes)
-{
-  double bytes = arg_bytes;
-  const double K = 1024;
-  const double M = K*1024;
-  const double G = M*1024;
-
-  std::ostringstream out;
-  if (bytes < K) {
-    out << std::setprecision(4) << bytes << " B";
-  } else if (bytes < M) {
-    bytes /= K;
-    out << std::setprecision(4) << bytes << " K";
-  } else if (bytes < G) {
-    bytes /= M;
-    out << std::setprecision(4) << bytes << " M";
-  } else {
-    bytes /= G;
-    out << std::setprecision(4) << bytes << " G";
-  }
-  return out.str();
-}
-
-}
-}
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#if defined( __GNUC__ ) && defined( ENABLE_TRACEBACK )
-
-/*  This is only known to work with GNU C++
- *  Must be compiled with '-rdynamic'
- *  Must be linked with   '-ldl'
- */
-
-/* Print call stack into an error stream,
- * so one knows in which function the error occured.
- *
- * Code copied from:
- *   http://stupefydeveloper.blogspot.com/2008/10/cc-call-stack.html
- *
- * License on this site:
- *   This blog is licensed under a
- *   Creative Commons Attribution-Share Alike 3.0 Unported License.
- *
- *   http://creativecommons.org/licenses/by-sa/3.0/
- *
- * Modified to output to std::ostream.
- */
-#include <signal.h>
-#include <execinfo.h>
-#include <cxxabi.h>
-#include <dlfcn.h>
-#include <stdlib.h>
-
-namespace Kokkos {
-namespace Impl {
-
-void traceback_callstack( std::ostream & msg )
-{
-  using namespace abi;
-
-  enum { MAX_DEPTH = 32 };
-
-  void *trace[MAX_DEPTH];
-  Dl_info dlinfo;
-
-  int status;
-
-  int trace_size = backtrace(trace, MAX_DEPTH);
-
-  msg << std::endl << "Call stack {" << std::endl ;
-
-  for (int i=1; i<trace_size; ++i)
-  {
-    if(!dladdr(trace[i], &dlinfo))
-        continue;
-
-    const char * symname = dlinfo.dli_sname;
-
-    char * demangled = __cxa_demangle(symname, NULL, 0, &status);
-
-    if ( status == 0 && demangled ) {
-      symname = demangled;
-    }
-
-    if ( symname && *symname != 0 ) {
-      msg << "  object: " << dlinfo.dli_fname
-          << " function: " << symname
-          << std::endl ;
-    }
-
-    if ( demangled ) {
-        free(demangled);
-    }
-  }
-  msg << "}" ;
-}
-
-}
-}
-
-#else
-
-namespace Kokkos {
-namespace Impl {
-
-void traceback_callstack( std::ostream & msg )
-{
-  msg << std::endl << "Traceback functionality not available" << std::endl ;
-}
-
-}
-}
-
-#endif
-
--- a/lib/kokkos/core/src/impl/Kokkos_Error.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Error.hpp
@ -1,80 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_IMPL_ERROR_HPP
-#define KOKKOS_IMPL_ERROR_HPP
-
-#include <string>
-#include <iosfwd>
-
-namespace Kokkos {
-namespace Impl {
-
-// Host-side abort taking a message string.
-// NOTE(review): presumably reports the message and terminates the process;
-// the definition is not in this header - confirm.
-void host_abort( const char * const );
-
-// NOTE(review): presumably throws an exception carrying the given text;
-// the definition is not in this header - confirm.
-void throw_runtime_exception( const std::string & );
-
-// Writes a call-stack listing to the given stream, or a notice that
-// traceback is unavailable when that support is not compiled in.
-void traceback_callstack( std::ostream & );
-
-// NOTE(review): presumably formats a byte count as human-readable text;
-// the definition is not in this header - confirm.
-std::string human_memory_size(size_t arg_bytes);
-
-}
-}
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-namespace Kokkos {
-// Host variant of Kokkos::abort: forwards the message to Impl::host_abort.
-inline
-void abort( const char * const message ) { Kokkos::Impl::host_abort(message); }
-}
-#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */
-
--- a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
@ -1,960 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_FUNCTORADAPTER_HPP
-#define KOKKOS_FUNCTORADAPTER_HPP
-
-#include <cstddef>
-#include <Kokkos_Core_fwd.hpp>
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Compile-time query: does FunctorType declare a nested 'value_type'?
-// Primary template: answers 'no' by deriving from false_type.
-template< class FunctorType , class ArgTag , class Enable = void >
-struct FunctorDeclaresValueType : public Impl::false_type {};
-
-// Specialization that can only be formed when 'typename FunctorType::value_type'
-// names a type; it answers 'yes' by deriving from true_type.
-// NOTE(review): relies on Impl::enable_if_type<...>::type being 'void' so that
-// it matches the defaulted 'Enable' argument - confirm in Kokkos_Traits.hpp.
-template< class FunctorType , class ArgTag >
-struct FunctorDeclaresValueType< FunctorType , ArgTag
-                               , typename Impl::enable_if_type< typename FunctorType::value_type >::type >
-  : public Impl::true_type {};
-
-
-/** \brief  Query Functor and execution policy argument tag for value type.
- *
- *  If C++11 enabled and 'value_type' is not explicitly declared then attempt
- *  to deduce the type from FunctorType::operator().
- *
- *  This primary template is the fallback when no specialization applies:
- *  all member types are 'void' and all sizes and counts are zero.
- *  'Dec' defaults to whether FunctorType declares a nested 'value_type'.
- */
-template< class FunctorType , class ArgTag , bool Dec = FunctorDeclaresValueType<FunctorType,ArgTag>::value >
-struct FunctorValueTraits
-{
-  typedef void value_type ;
-  typedef void pointer_type ;
-  typedef void reference_type ;
-
-  // Zero: there is no value, hence no static size.
-  enum { StaticValueSize = 0 };
-
-  // Number of values: always zero for the fallback.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  unsigned value_count( const FunctorType & ) { return 0 ; }
-
-  // Size of the value in bytes: always zero for the fallback.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  unsigned value_size( const FunctorType & ) { return 0 ; }
-};
-
-/** \brief  FunctorType::value_type is explicitly declared so use it.
- *
- * Two options for declaration
- *
- *   1) A plain-old-data (POD) type
- *        typedef {pod_type} value_type ;
- *
- *   2) An array of POD of a runtime specified count.
- *        typedef {pod_type} value_type[] ;
- *        const unsigned     value_count ;
- *
- * In case 2 the element count is read from the functor's 'value_count' member.
- */
-template< class FunctorType , class ArgTag >
-struct FunctorValueTraits< FunctorType , ArgTag , true /* exists FunctorType::value_type */ >
-{
-  // Element type: the declared value_type with any array extent removed.
-  typedef typename Impl::remove_extent< typename FunctorType::value_type >::type  value_type ;
-
-  // If not an array then what is the sizeof(value_type)
-  // (zero marks the array case, whose size is only known at run time).
-  enum { StaticValueSize = Impl::is_array< typename FunctorType::value_type >::value ? 0 : sizeof(value_type) };
-
-  typedef value_type                 * pointer_type ;
-
-  // The reference_type for an array is 'value_type *'
-  // The reference_type for a single value is 'value_type &'
-
-  typedef typename Impl::if_c< ! StaticValueSize , value_type *
-                                                 , value_type & >::type  reference_type ;
-
-  // Number of values if single value
-  // (this overload is enabled only when StaticValueSize is non-zero).
-  template< class F >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  typename Impl::enable_if< Impl::is_same<F,FunctorType>::value && StaticValueSize , unsigned >::type
-    value_count( const F & ) { return 1 ; }
-
-  // Number of values if an array, protect via templating because 'f.value_count'
-  // will only exist when the functor declares the value_type to be an array.
-  template< class F >
-  KOKKOS_FORCEINLINE_FUNCTION static
-  typename Impl::enable_if< Impl::is_same<F,FunctorType>::value && ! StaticValueSize , unsigned >::type
-    value_count( const F & f ) { return f.value_count ; }
-
-  // Total size of the value
-  // in bytes: element count times sizeof(value_type).
-  KOKKOS_INLINE_FUNCTION static
-  unsigned value_size( const FunctorType & f ) { return value_count( f ) * sizeof(value_type) ; }
-};
-
-
-#if defined( KOKKOS_HAVE_CXX11 )
-
-// If have C++11 and functor does not explicitly specify a value type
-// then try to deduce the value type from FunctorType::operator().
-// Can only deduce single value type since array length cannot be deduced.
-template< class FunctorType >
-struct FunctorValueTraits< FunctorType
-                       , void  /* == ArgTag */
-                       , false /* FunctorType::value_type does not exist */
-                       >
-{
-private:
-
-  // Placeholder result type for the 'no value' (parallel_for) case;
-  // unlike 'void' it is a complete type.
-  struct VOID {};
-
-  // The 'deduce' overloads are only named inside decltype below, so they are
-  // never executed; overload resolution on the signature of operator()
-  // selects the return type.
-
-  // parallel_for operator without a tag:
-  template< class ArgMember >
-  KOKKOS_INLINE_FUNCTION
-  static VOID deduce( void (FunctorType::*)( ArgMember ) const ) {}
-
-  // parallel_reduce operator without a tag:
-  template< class ArgMember , class T >
-  KOKKOS_INLINE_FUNCTION
-  static T deduce( void (FunctorType::*)( ArgMember , T & ) const ) {}
-
-  // parallel_scan operator without a tag:
-  template< class ArgMember , class T >
-  KOKKOS_INLINE_FUNCTION
-  static T deduce( void (FunctorType::*)( ArgMember , T & , bool ) const ) {}
-
-  // VOID for a parallel_for style operator, otherwise the deduced 'T'.
-  typedef decltype( deduce( & FunctorType::operator() ) ) ValueType ;
-
-  enum { IS_VOID = Impl::is_same<VOID,ValueType>::value };
-
-public:
-
-  // When nothing was deduced all three types collapse to 'void'.
-  typedef typename Impl::if_c< IS_VOID , void , ValueType   >::type  value_type ;
-  typedef typename Impl::if_c< IS_VOID , void , ValueType * >::type  pointer_type ;
-  typedef typename Impl::if_c< IS_VOID , void , ValueType & >::type  reference_type ;
-
-  enum { StaticValueSize = IS_VOID ? 0 : sizeof(ValueType) };
-
-  // Size in bytes of the single deduced value, or zero when there is none.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  unsigned value_size( const FunctorType & ) { return StaticValueSize ; }
-
-  // One value when a type was deduced, otherwise zero.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  unsigned value_count( const FunctorType & ) { return IS_VOID ? 0 : 1 ; }
-};
-
-
-// Tagged counterpart of the value-type deduction: the functor declares no
-// value_type, so the type is deduced from an operator() whose first
-// parameter is the tag, taken either by value or by const reference.
-template< class FunctorType , class ArgTag >
-struct FunctorValueTraits< FunctorType
-                       , ArgTag /* != void */
-                       , false  /* FunctorType::value_type does not exist */
-                       >
-{
-private:
-
-  // The 'deduce' overloads are only named inside decltype below, so they are
-  // never executed; overload resolution on the signature of operator()
-  // selects the return type.
-
-  //----------------------------------------
-  // parallel_for operator with a tag:
-
-  struct VOID {}; // to allow valid sizeof(ValueType)
-
-  template< class ArgMember >
-  KOKKOS_INLINE_FUNCTION
-  static VOID deduce( void (FunctorType::*)( ArgTag , ArgMember ) const ) {}
-
-  template< class ArgMember >
-  KOKKOS_INLINE_FUNCTION
-  static VOID deduce( void (FunctorType::*)( const ArgTag & , ArgMember ) const ) {}
-
-  //----------------------------------------
-  // parallel_reduce operator with a tag:
-
-  template< class ArgMember , class T >
-  KOKKOS_INLINE_FUNCTION
-  static T deduce( void (FunctorType::*)( ArgTag , ArgMember , T & ) const ) {}
-
-  template< class ArgMember , class T >
-  KOKKOS_INLINE_FUNCTION
-  static T deduce( void (FunctorType::*)( const ArgTag & , ArgMember , T & ) const ) {}
-
-  //----------------------------------------
-  // parallel_scan operator with a tag:
-
-  template< class ArgMember , class T >
-  KOKKOS_INLINE_FUNCTION
-  static T deduce( void (FunctorType::*)( ArgTag , ArgMember , T & , bool ) const ) {}
-
-  template< class ArgMember , class T >
-  KOKKOS_INLINE_FUNCTION
-  static T deduce( void (FunctorType::*)( const ArgTag & , ArgMember , T & , bool ) const ) {}
-
-  //----------------------------------------
-
-  // VOID for a parallel_for style operator, otherwise the deduced 'T'.
-  typedef decltype( deduce( & FunctorType::operator() ) ) ValueType ;
-
-  enum { IS_VOID = Impl::is_same<VOID,ValueType>::value };
-
-public:
-
-  // When nothing was deduced all three types collapse to 'void'.
-  typedef typename Impl::if_c< IS_VOID , void , ValueType   >::type  value_type ;
-  typedef typename Impl::if_c< IS_VOID , void , ValueType * >::type  pointer_type ;
-  typedef typename Impl::if_c< IS_VOID , void , ValueType & >::type  reference_type ;
-
-  enum { StaticValueSize = IS_VOID ? 0 : sizeof(ValueType) };
-
-  // Size in bytes of the single deduced value, or zero when there is none.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  unsigned value_size( const FunctorType & ) { return StaticValueSize ; }
-
-  // One value when a type was deduced, otherwise zero.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  unsigned value_count( const FunctorType & ) { return IS_VOID ? 0 : 1 ; }
-};
-
-#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Function signatures for FunctorType::init function with a tag and not an array
-//
-// The 'enable_if' overloads are declared but never defined: they are only
-// named inside decltype, where the call is well-formed exactly when
-// '& FunctorType::init' converts to one of the listed signatures.
-// 'IsArray' defaults to true when the traits report StaticValueSize == 0.
-template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize >
-struct FunctorValueInitFunction {
-
-  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
-
-  // Accepted: const member function or free/static function,
-  // with the tag taken by value or by const reference.
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type & ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type & ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type & ) );
-
-  // Volatile-qualified variants are intentionally left disabled:
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile & ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type volatile & ) );
-};
-
-// Function signatures for FunctorType::init function with a tag and is an array
-//
-// Declaration-only 'enable_if' overloads, meant to be named inside decltype:
-// the call is well-formed exactly when '& FunctorType::init' converts to one
-// of the listed signatures.  The value is passed as a pointer to its elements.
-template< class FunctorType , class ArgTag >
-struct FunctorValueInitFunction< FunctorType , ArgTag , true > {
-
-  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
-
-  // Accepted: const member function or free/static function,
-  // with the tag taken by value or by const reference.
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type * ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type * ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type * ) );
-
-  // Volatile-qualified variants are intentionally left disabled:
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile * ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type volatile * ) );
-};
-
-// Function signatures for FunctorType::init function without a tag and not an array
-//
-// Declaration-only 'enable_if' overloads, meant to be named inside decltype:
-// the call is well-formed exactly when '& FunctorType::init' converts to one
-// of the listed signatures.
-template< class FunctorType >
-struct FunctorValueInitFunction< FunctorType , void , false > {
-
-  // Note: this alias is the traits' reference_type, not its value_type, so
-  // 'value_type &' below relies on reference collapsing to yield a plain
-  // reference to the value.
-  typedef typename FunctorValueTraits<FunctorType,void>::reference_type value_type ;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type & ) );
-
-  // Volatile-qualified variants are intentionally left disabled:
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type volatile & ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type volatile & ) );
-};
-
-// Function signatures for FunctorType::init function without a tag and is an array
-//
-// Declaration-only 'enable_if' overloads, meant to be named inside decltype:
-// the call is well-formed exactly when '& FunctorType::init' converts to one
-// of the listed signatures.  The value is passed as a pointer to its elements.
-template< class FunctorType >
-struct FunctorValueInitFunction< FunctorType , void , true > {
-
-  // Must be the element type.  The traits' reference_type is already
-  // 'element *' for an array value, so using it here would make the
-  // signatures below take 'element **' and never match an array 'init'.
-  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type * ) );
-
-  // Volatile-qualified variants are intentionally left disabled:
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type volatile * ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type volatile * ) );
-};
-
-// Adapter for value initialization function.
-// If a proper FunctorType::init is declared then use it,
-// otherwise use default constructor.
-//
-// 'T' defaults to the traits' reference_type; the specializations match it
-// as 'T &' (single value) or 'T *' (array of values).
-// 'Enable' is the SFINAE slot used to detect a usable FunctorType::init.
-template< class FunctorType , class ArgTag
-        , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
-        , class Enable = void >
-struct FunctorValueInit ;
-
-/* Single value, functor supplies no usable 'init':
- * value-initialize a T in the caller's storage. */
-template< class FunctorType , class ArgTag , class T , class Enable >
-struct FunctorValueInit< FunctorType , ArgTag , T & , Enable >
-{
-  // Constructs T() in place at 'p' and returns a reference to the new object.
-  // The functor argument is not used.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  T & init( const FunctorType & , void * p )
-    {
-      T * const value = new(p) T();
-      return *value ;
-    }
-};
-
-/* Array value, functor supplies no usable 'init':
- * value-initialize every element in the caller's storage. */
-template< class FunctorType , class ArgTag , class T , class Enable >
-struct FunctorValueInit< FunctorType , ArgTag , T * , Enable >
-{
-  // Constructs T() in place for each of the value_count(f) elements that
-  // start at 'p' and returns 'p' as a pointer to the first element.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  T * init( const FunctorType & f , void * p )
-    {
-      T * const values = (T*)p ;
-      const int count = FunctorValueTraits< FunctorType , ArgTag >::value_count(f);
-      for ( int i = 0 ; i < count ; ++i ) {
-        new( values + i ) T();
-      }
-      return values ;
-    }
-};
-
-/* 'init' function provided for single value */
-// Chosen over the default-constructing specialization when the fourth
-// template argument below can be formed.
-// NOTE(review): detection uses FunctorValueInitFunction< FunctorType , ArgTag >,
-// whose tagged overloads expect an ArgTag parameter, yet the call below passes
-// no tag to f.init - confirm this is intended when ArgTag is not void.
-template< class FunctorType , class ArgTag , class T >
-struct FunctorValueInit
-  < FunctorType
-  , ArgTag
-  , T &
-    // First  substitution failure when FunctorType::init does not exist.
-#if defined( KOKKOS_HAVE_CXX11 )
-    // Second substitution failure when FunctorType::init is not compatible.
-  , decltype( FunctorValueInitFunction< FunctorType , ArgTag >::enable_if( & FunctorType::init ) )
-#else
-  , typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type
-#endif
-  >
-{
-  // Lets the functor initialize the value stored at 'p', then returns a
-  // reference to that value.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  T & init( const FunctorType & f , void * p )
-    { f.init( *((T*)p) ); return *((T*)p) ; }
-};
-
-/* 'init' function provided for array value */
-// Chosen over the default-constructing specialization when the fourth
-// template argument below can be formed.
-// NOTE(review): detection uses FunctorValueInitFunction< FunctorType , ArgTag >,
-// whose tagged overloads expect an ArgTag parameter, yet the call below passes
-// no tag to f.init - confirm this is intended when ArgTag is not void.
-template< class FunctorType , class ArgTag , class T >
-struct FunctorValueInit
-  < FunctorType
-  , ArgTag
-  , T *
-    // First  substitution failure when FunctorType::init does not exist.
-#if defined( KOKKOS_HAVE_CXX11 )
-    // Second substitution failure when FunctorType::init is not compatible
-  , decltype( FunctorValueInitFunction< FunctorType , ArgTag >::enable_if( & FunctorType::init ) )
-#else
-  , typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type
-#endif
-  >
-{
-  // Lets the functor initialize the array stored at 'p', then returns 'p'
-  // as a pointer to the first element.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  T * init( const FunctorType & f , void * p )
-    { f.init( (T*)p ); return (T*)p ; }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Signatures for compatible FunctorType::join with tag and not an array
-//
-// The 'enable_if' overloads are declared but never defined: they are only
-// named inside decltype, where the call is well-formed exactly when
-// '& FunctorType::join' converts to one of the listed signatures.
-// 'IsArray' defaults to true when the traits report StaticValueSize == 0.
-template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize >
-struct FunctorValueJoinFunction {
-
-  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
-
-  // Destination and source are both volatile-qualified references.
-  typedef       volatile value_type & vref_type ;
-  typedef const volatile value_type & cvref_type ;
-
-  // Accepted: const member function or free/static function,
-  // with the tag taken by value or by const reference.
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , vref_type , cvref_type ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , vref_type , cvref_type ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , vref_type , cvref_type ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , vref_type , cvref_type ) );
-};
-
-// Signatures for compatible FunctorType::join with tag and is an array
-//
-// Declaration-only 'enable_if' overloads, meant to be named inside decltype:
-// the call is well-formed exactly when '& FunctorType::join' converts to one
-// of the listed signatures.
-template< class FunctorType , class ArgTag >
-struct FunctorValueJoinFunction< FunctorType , ArgTag , true > {
-
-  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
-
-  // Destination and source are both pointers to volatile-qualified elements.
-  typedef       volatile value_type * vptr_type ;
-  typedef const volatile value_type * cvptr_type ;
-
-  // Accepted: const member function or free/static function,
-  // with the tag taken by value or by const reference.
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , vptr_type , cvptr_type ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , vptr_type , cvptr_type ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , vptr_type , cvptr_type ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , vptr_type , cvptr_type ) );
-};
-
-// Signatures for compatible FunctorType::join without tag and not an array
-//
-// Declaration-only 'enable_if' overloads, meant to be named inside decltype:
-// the call is well-formed exactly when '& FunctorType::join' converts to one
-// of the listed signatures.
-template< class FunctorType >
-struct FunctorValueJoinFunction< FunctorType , void , false > {
-
-  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
-
-  // Destination and source are both volatile-qualified references.
-  typedef       volatile value_type & vref_type ;
-  typedef const volatile value_type & cvref_type ;
-
-  // Accepted: const member function or free/static function.
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( vref_type , cvref_type ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( vref_type , cvref_type ) );
-};
-
-// Signatures for compatible FunctorType::join without tag and is an array
-//
-// Declaration-only 'enable_if' overloads, meant to be named inside decltype:
-// the call is well-formed exactly when '& FunctorType::join' converts to one
-// of the listed signatures.
-template< class FunctorType >
-struct FunctorValueJoinFunction< FunctorType , void , true > {
-
-  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
-
-  // Destination and source are both pointers to volatile-qualified elements.
-  typedef       volatile value_type * vptr_type ;
-  typedef const volatile value_type * cvptr_type ;
-
-  // Accepted: const member function or free/static function.
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( vptr_type , cvptr_type ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( vptr_type , cvptr_type ) );
-};
-
-
-// Adapter for the value 'join' (combine) operation.
-// Specializations call FunctorType::join when a compatible one is declared,
-// otherwise they combine values with operator+=.
-// 'T' defaults to the traits' reference_type; the specializations match it
-// as 'T &' (single value) or 'T *' (array of values).
-template< class FunctorType , class ArgTag
-        , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
-        , class Enable = void >
-struct FunctorValueJoin ;
-
-/* Single value, functor supplies no usable 'join': combine with operator+= */
-template< class FunctorType , class ArgTag , class T , class Enable >
-struct FunctorValueJoin< FunctorType , ArgTag , T & , Enable >
-{
-  // Adds the value at 'rhs' into the value at 'lhs'; both are accessed
-  // through volatile-qualified references.  The functor argument is not used.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void join( const FunctorType & , volatile void * const lhs , const volatile void * const rhs )
-    {
-      volatile T & dst = *((volatile T*)lhs) ;
-      const volatile T & src = *((const volatile T*)rhs) ;
-      dst += src ;
-    }
-};
-
-/* Array value, functor supplies no usable 'join': combine element-wise with operator+= */
-template< class FunctorType , class ArgTag , class T , class Enable >
-struct FunctorValueJoin< FunctorType , ArgTag , T * , Enable >
-{
-  // Adds each of the value_count(f) elements at 'rhs' into the matching
-  // element at 'lhs'; both arrays are accessed as volatile.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
-    {
-      const int count = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
-
-      volatile T * const dst = (volatile T*)lhs ;
-      const volatile T * const src = (const volatile T*)rhs ;
-
-      for ( int i = 0 ; i < count ; ++i ) {
-        dst[i] += src[i] ;
-      }
-    }
-};
-
-/* 'join' function provided, single value */
-// Tagged case: chosen over the operator+= fallback when the fourth template
-// argument below can be formed.
-template< class FunctorType , class ArgTag , class T >
-struct FunctorValueJoin
-  < FunctorType
-  , ArgTag
-  , T &
-    // First  substitution failure when FunctorType::join does not exist.
-#if defined( KOKKOS_HAVE_CXX11 )
-    // Second substitution failure when enable_if( & Functor::join ) does not exist
-  , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) )
-#else
-  , typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type
-#endif
-  >
-{
-  // Forwards to f.join with a default-constructed tag and volatile-qualified
-  // references to the destination and source values.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
-    {
-      f.join( ArgTag() , *((volatile T *)lhs) , *((const volatile T *)rhs) );
-    }
-};
-
-/* 'join' function provided, no tag, single value */
-// Untagged case: chosen over the operator+= fallback when the fourth template
-// argument below can be formed.
-template< class FunctorType , class T >
-struct FunctorValueJoin
-  < FunctorType
-  , void
-  , T &
-    // First  substitution failure when FunctorType::join does not exist.
-#if defined( KOKKOS_HAVE_CXX11 )
-    // Second substitution failure when enable_if( & Functor::join ) does not exist
-  , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) )
-#else
-  , typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type
-#endif
-  >
-{
-  // Forwards to f.join with volatile-qualified references to the
-  // destination and source values; no tag argument is passed.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
-    {
-      f.join( *((volatile T *)lhs) , *((const volatile T *)rhs) );
-    }
-};
-
-/* 'join' function provided for array value */
-// Tagged case: chosen over the element-wise operator+= fallback when the
-// fourth template argument below can be formed.
-template< class FunctorType , class ArgTag , class T >
-struct FunctorValueJoin
-  < FunctorType
-  , ArgTag
-  , T *
-    // First  substitution failure when FunctorType::join does not exist.
-#if defined( KOKKOS_HAVE_CXX11 )
-    // Second substitution failure when enable_if( & Functor::join ) does not exist
-  , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) )
-#else
-  , typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type
-#endif
-  >
-{
-  // Forwards to f.join with a default-constructed tag and volatile-qualified
-  // pointers to the destination and source arrays.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
-    {
-      f.join( ArgTag() , (volatile T *)lhs , (const volatile T *)rhs );
-    }
-};
-
-/* 'join' function provided, no tag, array value */
-// Untagged case: chosen over the element-wise operator+= fallback when the
-// fourth template argument below can be formed.
-template< class FunctorType , class T >
-struct FunctorValueJoin
-  < FunctorType
-  , void
-  , T *
-    // First  substitution failure when FunctorType::join does not exist.
-#if defined( KOKKOS_HAVE_CXX11 )
-    // Second substitution failure when enable_if( & Functor::join ) does not exist
-  , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) )
-#else
-  , typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type
-#endif
-  >
-{
-  // Forwards to f.join with volatile-qualified pointers to the destination
-  // and source arrays; no tag argument is passed.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
-    {
-      f.join( (volatile T *)lhs , (const volatile T *)rhs );
-    }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-#ifdef KOKKOS_HAVE_CXX11
-namespace Kokkos {
-
-namespace Impl {
-
-  // Wraps a callable 'JoinOp' so it can be used both through 'join(dst,src)'
-  // and through 'operator()(dst,src)', for volatile and non-volatile values.
-  // This primary template invokes the wrapped object directly as
-  // 'lambda(dst,src)'.
-  //
-  // The wrapped object is held by reference, not copied: it must outlive
-  // the adapter.
-  template<typename ValueType, class JoinOp, class Enable = void>
-  struct JoinLambdaAdapter {
-    typedef ValueType value_type;
-    const JoinOp& lambda;
-    KOKKOS_INLINE_FUNCTION
-    JoinLambdaAdapter(const JoinOp& lambda_):lambda(lambda_) {}
-
-    // Volatile-qualified join: forwards to the wrapped callable.
-    KOKKOS_INLINE_FUNCTION
-    void join(volatile value_type& dst, const volatile value_type& src) const {
-      lambda(dst,src);
-    }
-
-    // Non-volatile join: forwards to the wrapped callable.
-    KOKKOS_INLINE_FUNCTION
-    void join(value_type& dst, const value_type& src) const {
-      lambda(dst,src);
-    }
-
-    // Call-operator forms of the two joins above.
-    KOKKOS_INLINE_FUNCTION
-    void operator() (volatile value_type& dst, const volatile value_type& src) const {
-      lambda(dst,src);
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    void operator() (value_type& dst, const value_type& src) const {
-      lambda(dst,src);
-    }
-  };
-
-  // Specialization for a 'JoinOp' that itself declares a compatible untagged
-  // 'join' member (detected through FunctorValueJoinFunction): every entry
-  // point forwards to 'lambda.join(dst,src)' instead of calling the object.
-  //
-  // The wrapped object is held by reference, not copied: it must outlive
-  // the adapter.
-  template<typename ValueType, class JoinOp>
-  struct JoinLambdaAdapter<ValueType, JoinOp, decltype( FunctorValueJoinFunction< JoinOp , void >::enable_if( & JoinOp::join ) )> {
-    typedef ValueType value_type;
-    // NOTE(review): presumably a compile-time check that ValueType equals
-    // JoinOp::value_type; as a bare typedef it may not force instantiation
-    // of StaticAssertSame - confirm.
-    typedef StaticAssertSame<ValueType,typename JoinOp::value_type> assert_value_types_match;
-    const JoinOp& lambda;
-    KOKKOS_INLINE_FUNCTION
-    JoinLambdaAdapter(const JoinOp& lambda_):lambda(lambda_) {}
-
-    // Volatile-qualified join: forwards to the wrapped object's join.
-    KOKKOS_INLINE_FUNCTION
-    void join(volatile value_type& dst, const volatile value_type& src) const {
-      lambda.join(dst,src);
-    }
-
-    // Non-volatile join: forwards to the wrapped object's join.
-    KOKKOS_INLINE_FUNCTION
-    void join(value_type& dst, const value_type& src) const {
-      lambda.join(dst,src);
-    }
-
-    // Call-operator forms of the two joins above.
-    KOKKOS_INLINE_FUNCTION
-    void operator() (volatile value_type& dst, const volatile value_type& src) const {
-      lambda.join(dst,src);
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    void operator() (value_type& dst, const value_type& src) const {
-      lambda.join(dst,src);
-    }
-  };
-
-  // Join operation that accumulates with operator+= ('dst += src').
-  // Provides a volatile 'join' plus volatile and non-volatile call operators;
-  // note that no non-volatile 'join' overload is declared.
-  template<typename ValueType>
-  struct JoinAdd {
-    typedef ValueType value_type;
-
-    KOKKOS_INLINE_FUNCTION
-    JoinAdd() {}
-
-    // Volatile-qualified accumulate.
-    KOKKOS_INLINE_FUNCTION
-    void join(volatile value_type& dst, const volatile value_type& src) const {
-      dst+=src;
-    }
-    // Non-volatile accumulate, call-operator form.
-    KOKKOS_INLINE_FUNCTION
-    void operator() (value_type& dst, const value_type& src) const {
-      dst+=src;
-    }
-    // Volatile-qualified accumulate, call-operator form.
-    KOKKOS_INLINE_FUNCTION
-    void operator() (volatile value_type& dst, const volatile value_type& src) const {
-      dst+=src;
-    }
-  };
-
-}
-}
-#endif
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Helper operations on a reduction value: obtain a pointer from a reference,
-// view raw storage as the value, and copy one value onto another.
-// 'T' defaults to the traits' reference_type; the specializations match it
-// as 'T &' (single value) or 'T *' (array of values).
-template< class FunctorType , class ArgTag
-        , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type >
-struct FunctorValueOps ;
-
-/* Value operations for a single value */
-template< class FunctorType , class ArgTag , class T >
-struct FunctorValueOps< FunctorType , ArgTag , T & >
-{
-  // Address of the referenced value.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  T * pointer( T & r )
-    {
-      T * const address = & r ;
-      return address ;
-    }
-
-  // View the storage at 'p' as a value of type T.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  T & reference( void * p )
-    {
-      T * const typed = (T*)p ;
-      return *typed ;
-    }
-
-  // Assign the value at 'rhs' to the value at 'lhs'; the functor is not used.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void copy( const FunctorType & , void * const lhs , const void * const rhs )
-    {
-      T & dst = *((T*)lhs) ;
-      const T & src = *((const T*)rhs) ;
-      dst = src ;
-    }
-};
-
-/* Value operations for an array of values */
-template< class FunctorType , class ArgTag , class T >
-struct FunctorValueOps< FunctorType , ArgTag , T * >
-{
-  // The array 'reference' already is a pointer: return it unchanged.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  T * pointer( T * p )
-    {
-      return p ;
-    }
-
-  // View the storage at 'p' as a pointer to the first element.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  T * reference( void * p )
-    {
-      T * const typed = (T*)p ;
-      return typed ;
-    }
-
-  // Assign each of the value_count(f) elements at 'rhs' to the matching
-  // element at 'lhs'.
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void copy( const FunctorType & f , void * const lhs , const void * const rhs )
-    {
-      const int count = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
-
-      T * const dst = (T*)lhs ;
-      const T * const src = (const T*)rhs ;
-
-      for ( int i = 0 ; i < count ; ++i ) {
-        dst[i] = src[i] ;
-      }
-    }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Compatible functions for 'final' function and value_type not an array
-//
-// The 'enable_if' overloads are declared but never defined; each one names
-// an accepted signature for a functor's 'final' function.
-// NOTE(review): presumably named inside decltype to detect a compatible
-// FunctorType::final, like the init/join counterparts - the use is not
-// visible here, confirm.
-// 'IsArray' defaults to true when the traits report StaticValueSize == 0.
-template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize >
-struct FunctorFinalFunction {
-
-  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
-
-  // Value taken by non-const reference: const member, non-const member,
-  // or free/static function; tag by value or by const reference.
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type & ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type & ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type & ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type & ) );
-
-  // Volatile-qualified variants are intentionally left disabled:
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile & ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type volatile & ) );
-
-  // Value taken by const reference: same six forms as above.
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const & ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const & ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const & ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const & ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const & ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const & ) );
-
-  // Const-volatile variants are intentionally left disabled:
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile & ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile & ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const volatile & ) );
-};
-
-/* Compatible functions for 'final' function and value_type is an array.
- *
- * Specialization for IsArray == true.  It mirrors the primary template but
- * every accepted signature takes the value as a pointer ( value_type * or
- * value_type const * ) instead of a reference.  The volatile variants are
- * present only as commented-out declarations.
- */
-template< class FunctorType , class ArgTag >
-struct FunctorFinalFunction< FunctorType , ArgTag , true > {
-
-  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type * ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type * ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type * ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type * ) );
-
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile * ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type volatile * ) );
-
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const * ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const * ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const * ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const * ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const * ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const * ) );
-
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile * ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile * ) const );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const volatile * ) );
-};
-/* Specialization for an untagged functor ( ArgTag == void ) whose value is
- * not an array.  The accepted 'final' signatures take only the value, by
- * reference or by const reference, as a const member, a non-const member,
- * or a non-member function pointer.
- */
-template< class FunctorType >
-struct FunctorFinalFunction< FunctorType , void , false > {
-
-  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type & ) );
-
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type & ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type & ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( const value_type & ) );
-};
-/* Specialization for an untagged functor ( ArgTag == void ) whose value is
- * an array.  The accepted 'final' signatures take only the value, as a
- * pointer or a pointer to const, as a const member, a non-const member,
- * or a non-member function pointer.
- */
-template< class FunctorType >
-struct FunctorFinalFunction< FunctorType , void , true > {
-
-  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type * ) );
-
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type * ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type * ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( const value_type * ) );
-};
-
-/* No 'final' function provided.
- *
- * Primary template: used whenever neither partial specialization below is
- * viable.  Its final() accepts the functor and the value pointer and does
- * nothing.  ResultType defaults to the functor's reference_type and Enable
- * defaults to void, which is the type the specializations must produce.
- */
-template< class FunctorType , class ArgTag
-        , class ResultType = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
-        , class Enable = void >
-struct FunctorFinal
-{
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void final( const FunctorType & , void * ) {}
-};
-
-/* 'final' function provided.
- *
- * Partial specialization for a result type of the form T &.  It is viable
- * only when & FunctorType::final is well formed; with KOKKOS_HAVE_CXX11 the
- * member must also match one of the FunctorFinalFunction enable_if
- * signatures, otherwise only its existence is tested through sizeof.
- * final() treats p as a T * and passes the referenced value to f.final.
- * NOTE(review): f.final is called with the value only; no ArgTag argument
- * is passed here even though the accepted signatures may be tagged --
- * confirm against the callers.
- */
-template< class FunctorType , class ArgTag , class T >
-struct FunctorFinal
-  < FunctorType
-  , ArgTag
-  , T &
-    // First  substitution failure when FunctorType::final does not exist.
-#if defined( KOKKOS_HAVE_CXX11 )
-    // Second substitution failure when enable_if( & Functor::final ) does not exist
-  , decltype( FunctorFinalFunction< FunctorType , ArgTag >::enable_if( & FunctorType::final ) )
-#else
-  , typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type
-#endif
-  >
-{
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void final( const FunctorType & f , void * p ) { f.final( *((T*)p) ); }
-
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void final( FunctorType & f , void * p ) { f.final( *((T*)p) ); }
-};
-
-/* 'final' function provided for array value.
- *
- * Partial specialization for a result type of the form T *.  The enabling
- * condition is the same as for the T & specialization: & FunctorType::final
- * must be well formed and, with KOKKOS_HAVE_CXX11, must match one of the
- * FunctorFinalFunction enable_if signatures.
- * final() casts p to T * and passes the pointer itself to f.final.
- */
-template< class FunctorType , class ArgTag , class T >
-struct FunctorFinal
-  < FunctorType
-  , ArgTag
-  , T *
-    // First  substitution failure when FunctorType::final does not exist.
-#if defined( KOKKOS_HAVE_CXX11 )
-    // Second substitution failure when enable_if( & Functor::final ) does not exist
-  , decltype( FunctorFinalFunction< FunctorType , ArgTag >::enable_if( & FunctorType::final ) )
-#else
-  , typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type
-#endif
-  >
-{
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void final( const FunctorType & f , void * p ) { f.final( (T*)p ); }
-
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void final( FunctorType & f , void * p ) { f.final( (T*)p ); }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-/* Signature catalogue for a functor's tagged 'apply' function.
- * Each declared-only enable_if overload accepts one permitted form: a const
- * or non-const member function, or a non-member function pointer, taking
- * the tag by value or by const reference followed by ReferenceType.
- * ReferenceType defaults to the functor's reference_type.
- */
-template< class FunctorType , class ArgTag
-        , class ReferenceType = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type >
-struct FunctorApplyFunction {
-
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , ReferenceType ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , ReferenceType ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , ReferenceType ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , ReferenceType ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , ReferenceType ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , ReferenceType ) );
-};
-/* Specialization for an untagged functor ( ArgTag == void ): the accepted
- * 'apply' signatures take only ReferenceType, as a const member, a
- * non-const member, or a non-member function pointer.
- */
-template< class FunctorType , class ReferenceType >
-struct FunctorApplyFunction< FunctorType , void , ReferenceType > {
-
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ReferenceType ) const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ReferenceType ) );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ReferenceType ) );
-};
-/* Specialization for an untagged functor with no value ( ArgTag == void ,
- * ReferenceType == void ): only member functions taking no arguments are
- * accepted, const or non-const.  No non-member overload is declared.
- */
-template< class FunctorType >
-struct FunctorApplyFunction< FunctorType , void , void > {
-
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)() const );
-  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)() );
-};
-/* No 'apply' function provided.
- * Primary template: used whenever no partial specialization is viable.
- * Its apply() accepts the functor and a value pointer and does nothing.
- */
-template< class FunctorType , class ArgTag , class ReferenceType
-        , class Enable = void >
-struct FunctorApply
-{
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void apply( const FunctorType & , void * ) {}
-};
-
-/* 'apply' function provided for void value.
- *
- * Partial specialization for ReferenceType == void.  It is viable only when
- * & FunctorType::apply is well formed; with KOKKOS_HAVE_CXX11 the member
- * must also match a FunctorApplyFunction< FunctorType , ArgTag , void >
- * enable_if signature.  apply() simply calls f.apply().
- * NOTE(review): these overloads take only the functor, whereas the primary
- * template's apply() also takes a void * -- callers must use the matching
- * arity; confirm against the call sites.
- */
-template< class FunctorType , class ArgTag >
-struct FunctorApply
-  < FunctorType
-  , ArgTag
-  , void
-    // First  substitution failure when FunctorType::apply does not exist.
-#if defined( KOKKOS_HAVE_CXX11 )
-    // Second substitution failure when enable_if( & Functor::apply ) does not exist
-  , decltype( FunctorApplyFunction< FunctorType , ArgTag , void >::enable_if( & FunctorType::apply ) )
-#else
-  , typename Impl::enable_if< 0 < sizeof( & FunctorType::apply ) >::type
-#endif
-  >
-{
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void apply( FunctorType & f ) { f.apply(); }
-
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void apply( const FunctorType & f ) { f.apply(); }
-};
-
-/* 'apply' function provided for single value.
- *
- * Partial specialization for a reference type of the form T &.  It is
- * viable only when & FunctorType::apply is well formed; with
- * KOKKOS_HAVE_CXX11 the member must also match a FunctorApplyFunction
- * enable_if signature (using that template's default ReferenceType).
- * apply() treats p as a T * and passes the referenced value to f.apply.
- */
-template< class FunctorType , class ArgTag , class T >
-struct FunctorApply
-  < FunctorType
-  , ArgTag
-  , T &
-    // First  substitution failure when FunctorType::apply does not exist.
-#if defined( KOKKOS_HAVE_CXX11 )
-    // Second substitution failure when enable_if( & Functor::apply ) does not exist
-  , decltype( FunctorApplyFunction< FunctorType , ArgTag >::enable_if( & FunctorType::apply ) )
-#else
-  , typename Impl::enable_if< 0 < sizeof( & FunctorType::apply ) >::type
-#endif
-  >
-{
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void apply( const FunctorType & f , void * p ) { f.apply( *((T*)p) ); }
-
-  KOKKOS_FORCEINLINE_FUNCTION static
-  void apply( FunctorType & f , void * p ) { f.apply( *((T*)p) ); }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* KOKKOS_FUNCTORADAPTER_HPP */
-
--- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
@ -1,271 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <memory.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <iostream>
-#include <sstream>
-#include <cstring>
-
-#include <Kokkos_HostSpace.hpp>
-#include <impl/Kokkos_MemoryTracking.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-/* Returns the file-local MemoryTracking instance used by the host space.
- * The instance is a function-local static constructed with the label
- * shown below, so it is created the first time this function is called.
- */
-Impl::MemoryTracking<> & host_space_singleton()
-{
-  static Impl::MemoryTracking<> self("Kokkos::HostSpace");
-  return self ;
-}
-
-} // namespace <blank>
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-/* Allocate 'size' bytes of host memory aligned to MEMORY_ALIGNMENT and
- * register the allocation under 'label' with the host space tracker.
- *
- * Returns the aligned pointer, or 0 when size is 0 (nothing is allocated
- * or tracked in that case).  Calls Kokkos::Impl::throw_runtime_exception
- * when the allocation is null or the resulting pointer is not aligned.
- * The tracker is given ptr_alloc, the pointer returned by the underlying
- * allocator, together with size_padded, not the aligned pointer.
- */
-void * host_allocate_not_thread_safe( const std::string & label , const size_t size )
-{
-  void * ptr = 0 ;
-
-  if ( size ) {
-    size_t size_padded = size ;
-    void * ptr_alloc = 0 ;
-
-#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
-    // Intel compiler without CUDA: the allocator returns an aligned pointer.
-    ptr = ptr_alloc = _mm_malloc( size , MEMORY_ALIGNMENT );
-
-#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
-      ( defined( _XOPEN_SOURCE )   && _XOPEN_SOURCE   >= 600 )
-    // NOTE(review): the posix_memalign return code is ignored; a failure is only caught by the check below, which relies on ptr_alloc still being 0 -- confirm.
-    posix_memalign( & ptr_alloc , MEMORY_ALIGNMENT , size );
-    ptr = ptr_alloc ;
-
-#else
-
-    {
-      // Over-allocate and round up to guarantee proper alignment.
-
-      size_padded = ( size + MEMORY_ALIGNMENT - 1 );
-
-      ptr_alloc = malloc( size_padded );
-
-      const size_t rem = reinterpret_cast<ptrdiff_t>(ptr_alloc) % MEMORY_ALIGNMENT ;
-
-      ptr = static_cast<unsigned char *>(ptr_alloc) + ( rem ? MEMORY_ALIGNMENT - rem : 0 );
-    }
-
-#endif
-    // Accept only a non-null allocation whose returned pointer is aligned and not below the allocation start.
-    if ( ptr_alloc && ptr_alloc <= ptr &&
-         0 == ( reinterpret_cast<ptrdiff_t>(ptr) % MEMORY_ALIGNMENT ) ) {
-      // Insert allocated pointer and allocation count
-      Impl::host_space_singleton().insert( label , ptr_alloc , size_padded );
-    }
-    else {
-      std::ostringstream msg ;
-      msg << "Kokkos::Impl::host_allocate_not_thread_safe( "
-          << label
-          << " , " << size
-          << " ) FAILED aligned memory allocation" ;
-      Kokkos::Impl::throw_runtime_exception( msg.str() );
-    }
-  }
-
-  return ptr ;
-}
-/* Decrement the tracked reference count of the allocation containing 'ptr'
- * and release the memory when the tracker hands back a non-null pointer.
- * The release routine is chosen with the same preprocessor condition that
- * selects _mm_malloc in host_allocate_not_thread_safe.
- */
-void host_decrement_not_thread_safe( const void * ptr )
-{
-  void * ptr_alloc = Impl::host_space_singleton().decrement( ptr );
-
-  if ( ptr_alloc ) {
-#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
-     _mm_free( ptr_alloc );
-#else
-     free( ptr_alloc );
-#endif
-  }
-}
-/* Host-to-host deep copy: constructing the object copies n bytes from src
- * to dst with memcpy.
- */
-DeepCopy<HostSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
-{
-  memcpy( dst , src , n );
-}
-
-}
-}
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace {
-/* File-local table of registered "in parallel" query functions. */
-static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ; // capacity of the table below
-
-typedef int (* QuerySpaceInParallelPtr )(); // query callback: takes no arguments, returns int
-
-QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ;
-int s_in_parallel_query_count = 0 ; // number of table slots currently in use
-
-} // namespace <empty>
-/* Register a query function that reports whether a device is currently
- * executing in parallel.
- *
- * Throws (via Kokkos::Impl::throw_runtime_exception) when the pointer is
- * null, when the given query or any already registered query reports a
- * nonzero value, or when the table already holds
- * QUERY_SPACE_IN_PARALLEL_MAX entries.  A function that is already in the
- * table is not added a second time.
- * NOTE(review): the capacity check runs before the duplicate search, so a
- * full table appears to reject even an already registered function --
- * confirm this is intended.
- */
-void HostSpace::register_in_parallel( int (*device_in_parallel)() )
-{
-  if ( 0 == device_in_parallel ) {
-    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) );
-  }
-
-  int i = -1 ; // stays -1 (always below the count) when the given query itself reports in-parallel
-
-  if ( ! (device_in_parallel)() ) {
-    for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
-  }
-
-  if ( i < s_in_parallel_query_count ) {
-    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) );
-
-  }
-  // Past the check above, i equals s_in_parallel_query_count.
-  if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) {
-    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
-
-  }
-  // Look for an existing registration of the same function.
-  for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
-
-  if ( i == s_in_parallel_query_count ) {
-    s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
-  }
-}
-/* Returns 1 when at least one registered query function returns a nonzero
- * value and 0 otherwise.  Queries are called in registration order and the
- * scan stops at the first one that returns nonzero.
- */
-int HostSpace::in_parallel()
-{
-  const int n = s_in_parallel_query_count ;
-
-  int i = 0 ;
-
-  while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; }
-
-  return i < n ;
-}
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-/* Allocate tracked host memory of 'size' bytes labelled 'label'.
- * Forwards to Impl::host_allocate_not_thread_safe when in_parallel() is
- * false; otherwise calls Kokkos::Impl::throw_runtime_exception.
- */
-void * HostSpace::allocate( const std::string & label , const size_t size )
-{
-  void * ptr = 0 ;
-
-  if ( ! HostSpace::in_parallel() ) {
-    ptr = Impl::host_allocate_not_thread_safe( label , size );
-  }
-  else {
-    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::allocate called within a parallel functor") );
-  }
-
-  return ptr ;
-}
-/* Increment the tracking count of the allocation containing 'ptr'.
- * Forwards to the host space tracker when in_parallel() is false;
- * otherwise calls Kokkos::Impl::throw_runtime_exception.
- */
-void HostSpace::increment( const void * ptr )
-{
-  if ( ! HostSpace::in_parallel() ) {
-    Impl::host_space_singleton().increment( ptr );
-  }
-  else {
-    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::increment called within a parallel functor") );
-  }
-}
-/* Decrement the tracking count of the allocation containing 'ptr'.
- * Forwards to Impl::host_decrement_not_thread_safe when in_parallel() is
- * false; otherwise calls Kokkos::Impl::throw_runtime_exception.
- */
-void HostSpace::decrement( const void * ptr )
-{
-  if ( ! HostSpace::in_parallel() ) {
-    Impl::host_decrement_not_thread_safe( ptr );
-  }
-  else {
-    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::decrement called within a parallel functor") );
-  }
-}
-/* Returns the tracking count of the allocation containing 'ptr', or 0 when
- * the tracker has no entry for it.  When in_parallel() is true it calls
- * Kokkos::Impl::throw_runtime_exception; the trailing return of -1 is only
- * reached if that call returns.
- */
-int HostSpace::count( const void * ptr ) {
-  if ( ! HostSpace::in_parallel() ) {
-    Impl::MemoryTracking<>::Entry * const entry =
-        Impl::host_space_singleton().query(ptr);
-    return entry != NULL?entry->count():0;
-  }
-  else {
-    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::count called within a parallel functor") );
-    return -1;
-  }
-}
-/* Print every tracked host allocation to 'o', one per line, each line
- * prefixed with two spaces.  No in_parallel() check is made here.
- */
-void HostSpace::print_memory_view( std::ostream & o )
-{
-  Impl::host_space_singleton().print( o , std::string("  ") );
-}
-/* Returns the label of the tracked allocation containing 'p', or the
- * placeholder string below when the tracker has no entry for it.
- */
-std::string HostSpace::query_label( const void * p )
-{
-  Impl::MemoryTracking<>::Entry * const entry = Impl::host_space_singleton().query(p);
-  return std::string( entry ? entry->label() : "<NOT ALLOCATED>" );
-}
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
--- a/lib/kokkos/core/src/impl/Kokkos_MemoryTracking.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_MemoryTracking.hpp
@ -1,374 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_MEMORY_TRACKING_HPP
-#define KOKKOS_MEMORY_TRACKING_HPP
-
-#include <cstddef>
-#include <cstring>
-#include <limits>
-#include <utility>
-#include <vector>
-#include <string>
-#include <sstream>
-#include <iostream>
-
-#include <impl/Kokkos_Error.hpp>
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-// Fast search for the index i such that begin[i-1] <= value < begin[i].
-// Requires the last element to be an upper bound greater than 'value'.
-// Starts with a binary search until the search range is no longer than
-// LINEAR_LIMIT, then switches to a linear search.
-/* 'begin' points at 'length' values which the search assumes to be in
- * ascending order.  The return value is the offset from 'begin' of the
- * first element strictly greater than 'value'.  The final linear scan has
- * no explicit end test; it stops only because of the sentinel required by
- * the precondition.
- */
-int memory_tracking_upper_bound( const ptrdiff_t * const begin
-                               , unsigned length
-                               , const ptrdiff_t value )
-{
-  enum { LINEAR_LIMIT = 32 };
-
-  // precondition: begin[length-1] == std::numeric_limits<ptrdiff_t>::max()
-
-  const ptrdiff_t * first = begin ;
-
-  while ( LINEAR_LIMIT < length ) {
-    unsigned          half   = length >> 1 ;
-    const ptrdiff_t * middle = first + half ;
-
-    if ( value < *middle ) {
-      length = half ; // answer is at or before 'middle': keep the lower part
-    }
-    else {
-      first   = ++middle ; // answer is after 'middle': continue just past it
-      length -= ++half ;
-    }
-  }
-
-  for ( ; ! ( value < *first ) ; ++first ) {}
-
-  return first - begin ;
-}
-/* Reference-counted registry of allocated memory ranges.
- *
- * Two parallel vectors are kept in the same order: m_tracking holds one
- * Entry pointer per range and m_tracking_end holds the matching end
- * address ( start + size ).  Both always finish with a sentinel whose
- * address is std::numeric_limits<ptrdiff_t>::max(), which is what lets
- * memory_tracking_upper_bound terminate.  Lookups find the first range
- * whose end is greater than the queried address and then check that the
- * range starts at or before it.  Empty vectors are reported in error
- * messages as " called after return from main()"; the destructor is what
- * empties them.  AttributeType is stored per entry in m_attribute.
- */
-template< class AttributeType = size_t >
-class MemoryTracking {
-public:
-  /* One tracked range: label, reference count, start pointer, size and attribute.  Only MemoryTracking can create or destroy entries. */
-  class Entry {
-  private:
-
-    friend class MemoryTracking ;
-
-    enum { LABEL_LENGTH = 128 };
-
-    Entry( const Entry & );
-    Entry & operator = ( const Entry & );
-
-    ~Entry() {}
-
-    Entry() // sentinel entry: start address is the largest ptrdiff_t value
-     : m_count(0)
-     , m_alloc_ptr( reinterpret_cast<void*>( std::numeric_limits<ptrdiff_t>::max() ) )
-     , m_alloc_size(0)
-     , m_attribute()
-     { strcpy( m_label , "sentinel" ); }
-
-    Entry( const std::string & arg_label 
-         , void * const        arg_alloc_ptr 
-         , size_t const        arg_alloc_size )
-      : m_count( 0 )
-      , m_alloc_ptr( arg_alloc_ptr )
-      , m_alloc_size( arg_alloc_size )
-      , m_attribute()
-      {
-        strncpy( m_label , arg_label.c_str() , LABEL_LENGTH );
-        m_label[ LABEL_LENGTH - 1 ] = 0 ; // labels of LABEL_LENGTH or more characters are truncated and terminated here
-      }
-
-    char    m_label[ LABEL_LENGTH ] ;
-    size_t  m_count ;
-
-  public:
-
-    void * const   m_alloc_ptr ;
-    size_t const   m_alloc_size ;
-    AttributeType  m_attribute ;
-
-    size_t       count() const { return m_count ; }
-    const char * label() const { return m_label ; }
-
-    void print( std::ostream & oss ) const
-      {
-        oss << "{ \"" << m_label
-            << "\" count(" << m_count
-            << ") memory[ " << m_alloc_ptr
-            << " + " << m_alloc_size
-            << " ]" ;
-      }
-  };
-
-  //------------------------------------------------------------
-  /** \brief  Track a memory range [ arg_alloc_ptr , arg_alloc_ptr + arg_alloc_size ).
-   *          Return the newly created entry, with a count of one, for success.
-   *          Throw exception for failure.
-   */
-  Entry * insert( const std::string & arg_label
-                , void * const  arg_alloc_ptr
-                , size_t const  arg_alloc_size
-                )
-    {
-      Entry * result = 0 ;
-
-      const ptrdiff_t alloc_begin = reinterpret_cast<ptrdiff_t>(arg_alloc_ptr);
-      const ptrdiff_t alloc_end   = alloc_begin + arg_alloc_size ;
-
-      const bool ok_exist = ! m_tracking_end.empty(); 
-
-      const bool ok_input =
-        ok_exist &&
-        ( 0 < alloc_begin ) &&
-            ( alloc_begin < alloc_end ) &&
-                          ( alloc_end < std::numeric_limits<ptrdiff_t>::max() ); 
-
-      const int i = ok_input
-                  ? memory_tracking_upper_bound( & m_tracking_end[0] , m_tracking_end.size() , alloc_end )
-                  : -1 ;
-      // NOTE(review): only the entry at i (first end above alloc_end) is compared; overlap with the entry before it does not appear to be checked -- confirm.
-      const bool ok_range = ( 0 <= i ) && ( alloc_end <= reinterpret_cast<ptrdiff_t>( m_tracking[i]->m_alloc_ptr ) );
-
-      // allocate the new entry only if the vector inserts succeed.
-      const bool ok_insert =
-        ok_range &&
-        ( alloc_end == *m_tracking_end.insert(m_tracking_end.begin()+i,alloc_end) ) &&
-        ( 0 == *m_tracking.insert(m_tracking.begin()+i,0) ) &&
-        ( 0 != ( result = new Entry(arg_label,arg_alloc_ptr,arg_alloc_size) ) );
-
-      if ( ok_insert ) {
-        result->m_count = 1 ;
-        m_tracking[i] = result ;
-      }
-      else {
-        std::ostringstream msg ;
-        msg << m_space
-            << "::insert( " << arg_label
-            << " , " << arg_alloc_ptr
-            << " , " << arg_alloc_size
-            << " ) ERROR : " ;
-        if ( ! ok_exist ) {
-          msg << " called after return from main()" ;
-        }
-        else if ( ! ok_input ) {
-          msg << " bad allocation range" ;
-        }
-        else if ( ! ok_range ) {
-          msg << " overlapping memory range with"
-              << " { " << m_tracking[i]->m_label
-              << " , " << m_tracking[i]->m_alloc_ptr
-              << " , " << m_tracking[i]->m_alloc_size
-              << " }" ;
-        }
-        else {
-          msg << " internal allocation error" ;
-        }
-        Kokkos::Impl::throw_runtime_exception( msg.str() );
-      }
-
-      return result ;
-    }
-
-  /** \brief  Decrement the tracked memory range.
-   *          If the count is zero then return the originally inserted pointer.
-   *          If the count is non zero then return zero.
-   *          A null pointer is ignored; an untracked pointer is reported
-   *          on std::cerr and zero is returned.
-   */
-  void * decrement( void const * const ptr )
-    {
-      void * result = 0 ;
-
-      if ( ptr ) {
-        const bool ok_exist = ! m_tracking_end.empty();
-
-        const int i = ok_exist
-                    ? memory_tracking_upper_bound( & m_tracking_end[0] , m_tracking_end.size() , reinterpret_cast<ptrdiff_t>(ptr) )
-                    : -1 ;
-
-        const bool ok_found = ( 0 <= i ) && ( reinterpret_cast<ptrdiff_t>( m_tracking[i]->m_alloc_ptr ) <=
-                                              reinterpret_cast<ptrdiff_t>(ptr) );
-
-        if ( ok_found ) {
-          if ( 0 == --( m_tracking[i]->m_count ) ) {
-            result = m_tracking[i]->m_alloc_ptr ;          
-            delete m_tracking[i] ;
-            m_tracking.erase(     m_tracking.begin() + i );
-            m_tracking_end.erase( m_tracking_end.begin() + i );
-          }
-        }
-        else {
-          // Don't throw as this is likely called from within a destructor.
-          std::cerr << m_space
-                    << "::decrement( " << ptr << " ) ERROR : " 
-                    << ( ! ok_exist ? " called after return from main()" 
-                                    : " memory not being tracked" )
-                    << std::endl ;
-          std::cerr.flush();
-        }
-      }
-      return result ;
-    }
-
-  /** \brief  Increment the tracking count.
-   *          A null pointer is ignored; an untracked pointer throws.
-   */
-  void increment( void const * const ptr )
-    {
-      if ( ptr ) {
-        const bool ok_exist = ! m_tracking_end.empty();
-
-        const int i = ok_exist
-                    ? memory_tracking_upper_bound( & m_tracking_end[0] , m_tracking_end.size() , reinterpret_cast<ptrdiff_t>(ptr) )
-                    : -1 ;
-
-        const bool ok_found = ( 0 <= i ) && ( reinterpret_cast<ptrdiff_t>( m_tracking[i]->m_alloc_ptr ) <=
-                                              reinterpret_cast<ptrdiff_t>(ptr) );
-
-        if ( ok_found ) {
-          ++( m_tracking[i]->m_count );
-        }
-        else {
-          std::ostringstream msg ;
-          msg << m_space
-              << "::increment( " << ptr << " ) ERROR : "
-              << ( ! ok_exist ? " called after return from main()" 
-                              : " memory not being tracked" )
-              << std::endl ;
-          Kokkos::Impl::throw_runtime_exception( msg.str() );
-        }
-      }
-    }
-
-  /** \brief  Query a tracked memory range.
-   *          Return zero for not found.
-   */
-  Entry * query( void const * const ptr ) const
-    {
-      const bool ok_exist = ! m_tracking_end.empty();
-
-      const int i = ( ok_exist && ptr )
-                  ? memory_tracking_upper_bound( & m_tracking_end[0] , m_tracking_end.size() , reinterpret_cast<ptrdiff_t>(ptr) )
-                  : -1 ;
-
-      const bool ok_found = ( 0 <= i ) && ( reinterpret_cast<ptrdiff_t>( m_tracking[i]->m_alloc_ptr ) <=
-                                            reinterpret_cast<ptrdiff_t>(ptr) );
-
-      return ok_found ? m_tracking[i] : (Entry *) 0 ;
-    }
-
-  /** \brief  Call the 'print' method on all entries except the trailing sentinel. */
-  void print( std::ostream & oss , const std::string & lead ) const
-    {
-      const size_t n = m_tracking.empty() ? 0 : m_tracking.size() - 1 ;
-      for ( size_t i = 0 ; i < n ; ++i ) {
-        oss << lead ;
-        m_tracking[i]->print( oss );
-        oss << std::endl ;
-      }
-    }
-
-  size_t size() const { return m_tracking.size(); } // this count includes the sentinel entry
-  // NOTE(review): declared to return MemoryTracking & but the expression is an Entry; this looks like it cannot compile if instantiated -- confirm.
-  template< typename iType >
-  MemoryTracking & operator[]( const iType & i ) const
-    { return *m_tracking[i]; }
-
-  /** \brief Construct with a name for error messages */
-  explicit MemoryTracking( const std::string & space_name )
-    : m_space( space_name )
-    , m_tracking()
-    , m_tracking_end()
-    , m_sentinel()
-    {
-      m_tracking.reserve( 512 );
-      m_tracking_end.reserve( 512 );
-      m_tracking.push_back( & m_sentinel );
-      m_tracking_end.push_back( reinterpret_cast<ptrdiff_t>( m_sentinel.m_alloc_ptr ) );
-    }
-
-  /** \brief  Print memory leak warning for all entries.
-   *          Remaining entries are reported but not deleted here; the
-   *          vectors are then replaced by empty ones.
-   */
-  ~MemoryTracking()
-    {
-      try {
-        const ptrdiff_t max = std::numeric_limits<ptrdiff_t>::max();
-
-        if ( 1 < m_tracking.size() ) {
-          std::cerr << m_space << " destroyed with memory leaks:" ;
-          print( std::cerr , std::string("  ") );
-        }
-        else if ( m_tracking.empty() || max != m_tracking_end.back() ) {
-          std::cerr << m_space << " corrupted data structure" << std::endl ;
-        }
-
-        m_space = std::string();
-        m_tracking = std::vector<Entry*>();
-        m_tracking_end = std::vector<ptrdiff_t>();
-      }
-      catch( ... ) {}
-    }
-
-  const std::string & label() const { return m_space ; }
-
-private:
-  MemoryTracking();
-  MemoryTracking( const MemoryTracking & );
-  MemoryTracking & operator = ( const MemoryTracking & );
-
-  std::string             m_space ;
-  std::vector<Entry*>     m_tracking ;
-  std::vector<ptrdiff_t>  m_tracking_end ;
-  Entry                   m_sentinel ;
-};
-
-} /* namespace */
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-#endif
-
--- a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@ -1,73 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE )
-#define KOKKOS_MEMORY_FENCE
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-KOKKOS_FORCEINLINE_FUNCTION
-void memory_fence() /* Issue a memory fence using the primitive that matches
- *  the atomics backend selected through the KOKKOS_ATOMICS_USE_* macros.
- *  Compilation stops with an #error when no recognized backend macro is
- *  defined.
- */
-{
-#if defined( KOKKOS_ATOMICS_USE_CUDA )
-  __threadfence(); // CUDA atomics backend
-#elif defined( KOKKOS_ATOMICS_USE_GCC ) || \
-      ( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ATOMICS_USE_INTEL ) )
-  __sync_synchronize(); // GCC backend; also taken for the Intel backend when compiling with NVCC
-#elif defined( KOKKOS_ATOMICS_USE_INTEL )
-  _mm_mfence(); // Intel backend
-#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
-  #pragma omp flush
-
-#else
- #error "Error: memory_fence() not defined"
-#endif
-}
-
-} // namespace kokkos
-
-#endif
-
-
--- a/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
@ -1,84 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP
-#define KOKKOS_PHYSICAL_LAYOUT_HPP
-
-
-#include <Kokkos_View.hpp>
-namespace Kokkos {
-namespace Impl {
-
-
-
-/** \brief  Run-time record of a View's layout kind, rank, and strides. */
-struct PhysicalLayout {
-
-  enum LayoutType { Left , Right , Scalar , Error };
-
-  LayoutType     layout_type ;
-  int            rank ;
-  long long int  stride[8] ; // distance between two neighboring elements in a given dimension
-
-  /** \brief  Record the layout of a View with the default specialization.
-   *
-   *  layout_type is Left or Right when the View's array_layout is
-   *  LayoutLeft or LayoutRight respectively, and Error for anything else.
-   */
-  template< class T , class L , class D , class M >
-  PhysicalLayout( const View<T,L,D,M,ViewDefault> & view )
-    : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft  >::value ? Left
-                 : is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right
-                 : Error )
-    , rank( view.Rank )
-    {
-      // Clear every slot before handing the array to the view
-      // (presumably view.stride() only fills the slots it reports).
-      for ( int d = 0 ; d < 8 ; ++d ) { stride[d] = 0 ; }
-      view.stride( stride );
-    }
-
-  #ifdef KOKKOS_HAVE_CUDA
-  /** \brief  Same as above, for a View with the CUDA texture specialization. */
-  template< class T , class L , class D , class M >
-  PhysicalLayout( const View<T,L,D,M,ViewCudaTexture> & view )
-    : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft  >::value ? Left
-                 : is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right
-                 : Error )
-    , rank( view.Rank )
-    {
-      for ( int d = 0 ; d < 8 ; ++d ) { stride[d] = 0 ; }
-      view.stride( stride );
-    }
-  #endif
-};
-
-}
-}
-#endif
--- a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
@ -1,119 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <stdlib.h>
-#include <sstream>
-#include <stdexcept>
-#include <Kokkos_Serial.hpp>
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-#if defined( KOKKOS_HAVE_SERIAL )
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-namespace SerialImpl {
-
-/** \brief  Start with no scratch buffer and zero recorded extents. */
-Sentinel::Sentinel()
-  : m_scratch(0)
-  , m_reduce_end(0)
-  , m_shared_end(0)
-{}
-
-/** \brief  Free the scratch buffer, if any, and reset all bookkeeping to zero. */
-Sentinel::~Sentinel()
-{
-  if ( 0 != m_scratch ) {
-    free( m_scratch );
-    m_scratch = 0 ;
-  }
-
-  m_shared_end = 0 ;
-  m_reduce_end = 0 ;
-}
-
-/** \brief  Access the single function-local static Sentinel instance. */
-Sentinel & Sentinel::singleton()
-{
-  static Sentinel instance ;
-
-  return instance ;
-}
-
-/** \brief  Round n up to the next multiple of 256. */
-inline
-unsigned align( unsigned n )
-{
-  const unsigned alignment = 0x0100 ; /* 256 */
-  const unsigned mask      = alignment - 1 ;
-
-  return ( n + mask ) & ~mask ;
-}
-
-} // namespace
-
-/** \brief  Construct a team member whose shared space starts in the
- *          singleton scratch buffer at byte offset m_reduce_end and
- *          spans arg_shared_size.
- */
-SerialTeamMember::SerialTeamMember( int arg_league_rank
-                                  , int arg_league_size
-                                  , int arg_shared_size
-                                  )
-  : m_space( static_cast< char * >( SerialImpl::Sentinel::singleton().m_scratch )
-               + SerialImpl::Sentinel::singleton().m_reduce_end
-           , arg_shared_size
-           )
-  , m_league_rank( arg_league_rank )
-  , m_league_size( arg_league_size )
-{
-}
-
-} // namespace Impl
-
-/** \brief  Make sure the singleton scratch buffer can hold the requested
- *          reduction and shared sizes, and return the buffer.
- *
- *  Both sizes are rounded up with Impl::SerialImpl::align().  The buffer
- *  only ever grows: when either request exceeds what is recorded, the old
- *  buffer is freed (its contents are not preserved) and a new one of
- *  m_shared_end bytes is allocated.
- *
- *  \param reduce_size  bytes requested for the reduction region
- *  \param shared_size  bytes requested for the shared region that follows it
- *  \return pointer to the start of the scratch buffer
- *          (null if nothing has ever been requested)
- *  \throws std::runtime_error if the allocation fails; the recorded sizes
- *          are reset first so that a later call retries the allocation
- *          instead of silently returning a null buffer.
- */
-void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
-{
-  static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton();
-
-  reduce_size = Impl::SerialImpl::align( reduce_size );
-  shared_size = Impl::SerialImpl::align( shared_size );
-
-  if ( ( s.m_reduce_end < reduce_size ) ||
-       ( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
-
-    // Drop the old buffer and forget the pointer right away so that it
-    // can never be observed dangling.
-    if ( s.m_scratch ) { free( s.m_scratch ); s.m_scratch = 0 ; }
-
-    if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
-    if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
-
-    // The size is non-zero here: this branch is only entered when at
-    // least one aligned request exceeds the recorded extent.
-    s.m_scratch = malloc( s.m_shared_end );
-
-    if ( 0 == s.m_scratch ) {
-      std::ostringstream msg ;
-      msg << "Kokkos::Serial::scratch_memory_resize ERROR: failed to allocate "
-          << s.m_shared_end << " bytes" ;
-
-      // Keep the bookkeeping consistent with "no buffer".
-      s.m_reduce_end = 0 ;
-      s.m_shared_end = 0 ;
-
-      throw std::runtime_error( msg.str() );
-    }
-  }
-
-  return s.m_scratch ;
-}
-
-} // namespace Kokkos
-
-#endif // defined( KOKKOS_HAVE_SERIAL )
-
-
--- a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
@ -1,324 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-// Experimental unified task-data parallel manycore LDRD
-
-#include <impl/Kokkos_Serial_TaskPolicy.hpp>
-
-#if defined( KOKKOS_HAVE_SERIAL )
-#include <stdlib.h>
-#include <stdexcept>
-#include <iostream>
-#include <sstream>
-#include <string>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-typedef TaskMember<  Kokkos::Serial , void , void > Task ;
-
-//----------------------------------------------------------------------------
-
-namespace {
-
-/** \brief  Round sizeof_derived up to a multiple of sizeof(Task*). */
-inline
-unsigned padded_sizeof_derived( unsigned sizeof_derived )
-{
-  const unsigned remainder = sizeof_derived % sizeof(Task*) ;
-
-  return sizeof_derived + ( remainder ? sizeof(Task*) - remainder : 0 );
-}
-
-} // namespace
-
-/** \brief  Release raw task storage with free(). */
-void Task::deallocate( void * ptr )
-{ free( ptr ); }
-
-/** \brief  Allocate raw storage for a derived task followed by its
- *          array of dependence pointers.
- *
- *  \param arg_sizeof_derived       sizeof the derived task type; padded up
- *                                  to a multiple of sizeof(Task*)
- *  \param arg_dependence_capacity  number of Task* slots placed after it
- *  \return pointer to uninitialized storage obtained from malloc()
- *  \throws std::runtime_error if malloc() fails, so that the result is
- *          never handed to placement new as a null pointer
- */
-void * Task::allocate( const unsigned arg_sizeof_derived
-                     , const unsigned arg_dependence_capacity )
-{
-  const size_t nbytes =
-    padded_sizeof_derived( arg_sizeof_derived ) + arg_dependence_capacity * sizeof(Task*) ;
-
-  void * const ptr = malloc( nbytes );
-
-  // malloc(0) may legitimately return null; only a failed non-empty
-  // request is an error.
-  if ( 0 == ptr && 0 != nbytes ) {
-    throw std::runtime_error("TaskMember< Serial >::allocate ERROR: malloc failed");
-  }
-
-  return ptr ;
-}
-
-/** \brief  Nothing to release here: the destructor body is empty. */
-Task::~TaskMember()
-{}
-
-/** \brief  Construct a task with an explicit result-type verification function.
- *
- *  The dependence array is located directly behind the derived task
- *  object, at the padded size of that object.  Every dependence slot
- *  starts out null and the task starts in TASK_STATE_CONSTRUCTING.
- */
-Task::TaskMember( const Task::function_verify_type   arg_verify
-                , const Task::function_dealloc_type  arg_dealloc
-                , const Task::function_apply_type    arg_apply
-                , const unsigned                     arg_sizeof_derived
-                , const unsigned                     arg_dependence_capacity
-                )
-  : m_dealloc( arg_dealloc )
-  , m_verify(  arg_verify )
-  , m_apply(   arg_apply )
-  , m_dep( reinterpret_cast< Task ** >(
-             reinterpret_cast< unsigned char * >( this ) + padded_sizeof_derived( arg_sizeof_derived ) ) )
-  , m_wait( 0 )
-  , m_next( 0 )
-  , m_dep_capacity( arg_dependence_capacity )
-  , m_dep_size( 0 )
-  , m_ref_count( 0 )
-  , m_state( TASK_STATE_CONSTRUCTING )
-{
-  // Null out every dependence slot.
-  Task ** const dep_end = m_dep + arg_dependence_capacity ;
-  for ( Task ** slot = m_dep ; slot != dep_end ; ++slot ) { *slot = 0 ; }
-}
-
-/** \brief  Construct a task without a result value.
- *
- *  Identical to the other constructor except that the verification
- *  function is fixed to Task::verify_type<void>.
- */
-Task::TaskMember( const Task::function_dealloc_type  arg_dealloc
-                , const Task::function_apply_type    arg_apply
-                , const unsigned                     arg_sizeof_derived
-                , const unsigned                     arg_dependence_capacity
-                )
-  : m_dealloc( arg_dealloc )
-  , m_verify(  & Task::verify_type<void> )
-  , m_apply(   arg_apply )
-  , m_dep( reinterpret_cast< Task ** >(
-             reinterpret_cast< unsigned char * >( this ) + padded_sizeof_derived( arg_sizeof_derived ) ) )
-  , m_wait( 0 )
-  , m_next( 0 )
-  , m_dep_capacity( arg_dependence_capacity )
-  , m_dep_size( 0 )
-  , m_ref_count( 0 )
-  , m_state( TASK_STATE_CONSTRUCTING )
-{
-  // Null out every dependence slot.
-  Task ** const dep_end = m_dep + arg_dependence_capacity ;
-  for ( Task ** slot = m_dep ; slot != dep_end ; ++slot ) { *slot = 0 ; }
-}
-
-//----------------------------------------------------------------------------
-
-/** \brief  Report a failed add_dependence(): print the task's state and
- *          dependence count to std::cerr, then throw std::runtime_error.
- */
-void Task::throw_error_add_dependence() const
-{
-  static const char what[] = "TaskMember< Serial >::add_dependence ERROR" ;
-
-  std::cerr << what ;
-  std::cerr << " state(" << m_state << ")" ;
-  std::cerr << " dep_size(" << m_dep_size << ")" ;
-  std::cerr << std::endl ;
-
-  throw std::runtime_error( what );
-}
-
-/** \brief  Throw std::runtime_error for a failed result-type verification. */
-void Task::throw_error_verify_type()
-{
-  const std::string what("TaskMember< Serial >::verify_type ERROR");
-
-  throw std::runtime_error( what );
-}
-
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-
-void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw ) /* Reference-counted assignment *lhs = rhs.
- *
- *  Decrements the reference count of the task currently held in *lhs (if
- *  any); when that count reaches zero the task's m_dealloc function is
- *  applied to it, provided the task is complete and has no remaining
- *  dependences.  Afterwards rhs's count is incremented (if rhs is non-null
- *  and no error was flagged) and rhs is stored into *lhs.
- *
- *  Detected errors -- negative reference count, destroying a task that is
- *  not complete or still has dependences, or an exception caught
- *  internally -- are written to std::cerr when no_throw is true and are
- *  thrown as std::runtime_error otherwise.
- */
-{
-  static const char msg_error_header[]      = "Kokkos::Impl::TaskManager<Kokkos::Serial>::assign ERROR" ;
-  static const char msg_error_count[]       = ": negative reference count" ;
-  static const char msg_error_complete[]    = ": destroy task that is not complete" ;
-  static const char msg_error_dependences[] = ": destroy task that has dependences" ;
-  static const char msg_error_exception[]   = ": caught internal exception" ;
-
-  const char * msg_error = 0 ; // stays null as long as no error has been detected
-
-  try {
-
-    if ( *lhs ) {
-
-      const int count = --((**lhs).m_ref_count);
-
-      if ( 0 == count ) {
-
-        // Reference count at zero, delete it
-
-        // Should only be deallocating a completed task
-        if ( (**lhs).m_state == Kokkos::TASK_STATE_COMPLETE ) {
-
-          // A completed task should not have dependences...
-          for ( int i = 0 ; i < (**lhs).m_dep_size && 0 == msg_error ; ++i ) {
-            if ( (**lhs).m_dep[i] ) msg_error = msg_error_dependences ;
-          }
-        }
-        else {
-          msg_error = msg_error_complete ;
-        }
-
-        if ( 0 == msg_error ) {
-          // Get deletion function and apply it
-          const Task::function_dealloc_type d = (**lhs).m_dealloc ;
-
-          (*d)( *lhs );
-        }
-      }
-      else if ( count <= 0 ) { // zero was handled above, so this branch means count < 0
-        msg_error = msg_error_count ;
-      }
-    }
-
-    if ( 0 == msg_error && rhs ) { ++( rhs->m_ref_count ); }
-
-    *lhs = rhs ; // executed even when an error was flagged above; in that case rhs's count was not incremented
-  }
-  catch( ... ) {
-    if ( 0 == msg_error ) msg_error = msg_error_exception ;
-  }
-
-  if ( 0 != msg_error ) {
-    if ( no_throw ) {
-      std::cerr << msg_error_header << msg_error << std::endl ;
-      std::cerr.flush();
-    }
-    else {
-      std::string msg(msg_error_header);
-      msg.append(msg_error);
-      throw std::runtime_error( msg );
-    }
-  }
-}
-#endif
-
-namespace {
-
-Task * s_ready = 0 ; // head of the list of ready tasks, linked through m_next (see schedule / execute_ready_tasks)
-Task * s_denied = reinterpret_cast<Task*>( ~((unsigned long)0) ); // all-ones marker stored in a completed task's m_wait so that schedule() no longer adds waiters to it
-
-}
-
-void Task::schedule() /* Move this task to the waiting state and link it into a list.
- *
- *  All tasks currently on the ready list are executed first.  Then, unless
- *  this task is already complete or already linked into a list, it is
- *  marked waiting and pushed onto the m_wait list of the first dependence
- *  whose m_wait is not the s_denied marker; if no such dependence exists
- *  it is pushed onto the ready list instead.
- *
- *  Throws std::runtime_error when the task is complete or m_next is set.
- */
-{
-  // Execute ready tasks in case the task being scheduled
-  // is dependent upon a waiting and ready task.
-
-  Task::execute_ready_tasks();
-
-  // spawning   : Constructing -> Waiting
-  // respawning : Executing    -> Waiting
-  // updating   : Waiting      -> Waiting
-
-  // Must not be in a dependence linked list:  0 == t->m_next
-
-  const bool ok_state = TASK_STATE_COMPLETE != m_state ;
-  const bool ok_list  = 0 == m_next ;
-
-  if ( ok_state && ok_list ) {
-
-    // Will be waiting for execution upon return from this function
-
-    m_state = Kokkos::TASK_STATE_WAITING ;
-
-    // Insert this task into another dependence that is not complete
-
-    int i = 0 ;
-    for ( ; i < m_dep_size ; ++i ) {
-      Task * const y = m_dep[i] ;
-      if ( y && s_denied != ( m_next = y->m_wait ) ) { // m_next is assigned as a side effect of the test, even when the test fails
-        y->m_wait = this ; // CAS( & y->m_wait , m_next , this );
-        break ;
-      }
-    }
-    if ( i == m_dep_size ) { // loop ran to completion: no dependence accepted this task as a waiter
-      // All dependences are complete, insert into the ready list
-      m_next  = s_ready ; // also overwrites any s_denied value left in m_next by the loop above
-      s_ready = this ; // CAS( & s_ready , m_next = s_ready , this );
-    }
-  }
-  else {
-    throw std::runtime_error(std::string("Kokkos::Impl::Task spawn or respawn state error"));
-  }
-}
-
-void Task::execute_ready_tasks() /* Run tasks from the ready list until the list is empty.
- *
- *  Each task is popped from the head of s_ready, marked executing, and
- *  run through its m_apply function.  If the task is still in the
- *  executing state afterwards (it did not respawn itself) it is marked
- *  complete, its dependences are released, its m_wait list is closed with
- *  the s_denied marker, and every task that was waiting on it is
- *  rescheduled.
- */
-{
-  while ( s_ready ) {
-
-    // Remove this task from the ready list
-
-    // Task * task ;
-    // while ( ! CAS( & s_ready , task = s_ready , s_ready->m_next ) );
-
-    Task * const task = s_ready ;
-    s_ready = task->m_next ;
-
-    task->m_next = 0 ;
-
-    // precondition: task->m_state = TASK_STATE_WAITING
-    // precondition: task->m_dep[i]->m_state == TASK_STATE_COMPLETE  for all i
-    // precondition: does not exist T such that T->m_wait = task
-    // precondition: does not exist T such that T->m_next = task
-
-    task->m_state = Kokkos::TASK_STATE_EXECUTING ;
-
-    (*task->m_apply)( task );
-
-    if ( task->m_state == Kokkos::TASK_STATE_EXECUTING ) {
-      // task did not respawn itself
-      task->m_state = Kokkos::TASK_STATE_COMPLETE ;
-
-      // release dependences:
-      for ( int i = 0 ; i < task->m_dep_size ; ++i ) {
-        assign( task->m_dep + i , 0 ); // drops one reference on each dependence; m_dep_size itself is left unchanged
-      }
-
-      // Stop other tasks from adding themselves to 'task->m_wait' ;
-
-      Task * x ;
-      // CAS( & task->m_wait , x = task->m_wait , s_denied );
-      x = task->m_wait ; task->m_wait = s_denied ;
-
-      // update tasks waiting on this task
-      while ( x ) {
-        Task * const next = x->m_next ; // saved first because x->m_next is cleared below
-
-        x->m_next = 0 ; // schedule() requires 0 == m_next
-
-        x->schedule(); // could happen concurrently
-
-        x = next ;
-      }
-    }
-  }
-}
-
-/** \brief  Wait by running every task on the ready list; the future
- *          argument itself is not inspected.
- */
-void Task::wait( const Future< void , Kokkos::Serial > & )
-{
-  execute_ready_tasks();
-}
-
-} // namespace Impl
-} // namespace Kokkos
-
-#endif // defined( KOKKOS_HAVE_SERIAL )
--- a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp
@ -1,763 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-// Experimental unified task-data parallel manycore LDRD
-
-#ifndef KOKKOS_SERIAL_TASKPOLICY_HPP
-#define KOKKOS_SERIAL_TASKPOLICY_HPP
-
-#include <Kokkos_Macros.hpp>
-#if defined( KOKKOS_HAVE_SERIAL )
-
-#include <string>
-#include <typeinfo>
-#include <stdexcept>
-
-#include <Kokkos_Serial.hpp>
-#include <Kokkos_TaskPolicy.hpp>
-#include <Kokkos_View.hpp>
-
-#include <impl/Kokkos_FunctorAdapter.hpp>
-
-//----------------------------------------------------------------------------
-/*  Inheritance structure to allow static_cast from the task root type
- *  and a task's FunctorType.
- *
- *    task_root_type == TaskMember< Space , void , void >
- *
- *    TaskMember< PolicyType , ResultType , FunctorType >
- *      : TaskMember< PolicyType::Space , ResultType , FunctorType >
- *      { ... };
- *
- *    TaskMember< Space , ResultType , FunctorType >
- *      : TaskMember< Space , ResultType , void >
- *      , FunctorType
- *      { ... };
- *
- *  when ResultType != void
- *
- *    TaskMember< Space , ResultType , void >
- *      : TaskMember< Space , void , void >
- *      { ... };
- *
- */
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Base class for all tasks in the Serial execution space */
-template<>
-class TaskMember< Kokkos::Serial , void , void >
-{
-public:
-
-  typedef void         (* function_apply_type)  ( TaskMember * ); // type of m_apply
-  typedef void         (* function_dealloc_type)( TaskMember * ); // type of m_dealloc
-  typedef TaskMember * (* function_verify_type) ( TaskMember * ); // type of m_verify
-
-private:
-
-  const function_dealloc_type  m_dealloc ; ///< Deallocation
-  const function_verify_type   m_verify ;  ///< Result type verification
-  const function_apply_type    m_apply ;   ///< Apply function
-  TaskMember ** const          m_dep ;     ///< Dependences
-  TaskMember *                 m_wait ;    ///< Linked list of tasks waiting on this task
-  TaskMember *                 m_next ;    ///< Linked list of tasks waiting on a different task
-  const int                    m_dep_capacity ; ///< Capacity of dependences
-  int                          m_dep_size ;     ///< Actual count of dependences
-  int                          m_ref_count ;    ///< Reference count
-  int                          m_state ;        ///< State of the task
-
-  // size = 6 Pointers + 4 ints
-
-  TaskMember() /* = delete */ ;
-  TaskMember( const TaskMember & ) /* = delete */ ;
-  TaskMember & operator = ( const TaskMember & ) /* = delete */ ;
-
-  static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity ); // raw storage for a derived task plus its dependence slots
-  static void   deallocate( void * ); // releases raw task storage
-
-  void throw_error_add_dependence() const ;
-  static void throw_error_verify_type();
-
-  template < class DerivedTaskType >
-  static
-  void deallocate( TaskMember * t ) /* Destroy *t as a DerivedTaskType, then release its storage. */
-    {
-      DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t);
-      ptr->~DerivedTaskType();
-      deallocate( (void *) ptr );
-    }
-
-protected :
-
-  ~TaskMember();
-
-  // Used by TaskMember< Serial , ResultType , void >
-  TaskMember( const function_verify_type   arg_verify
-            , const function_dealloc_type  arg_dealloc
-            , const function_apply_type    arg_apply
-            , const unsigned               arg_sizeof_derived
-            , const unsigned               arg_dependence_capacity
-            );
-
-  // Used for TaskMember< Serial , void , void >
-  TaskMember( const function_dealloc_type  arg_dealloc
-            , const function_apply_type    arg_apply
-            , const unsigned               arg_sizeof_derived
-            , const unsigned               arg_dependence_capacity
-            );
-
-public:
-
-  template< typename ResultType >
-  KOKKOS_FUNCTION static
-  TaskMember * verify_type( TaskMember * t ) /* Returns t when ResultType is void, t is null, or t was
-    * constructed with this same verify_type< ResultType > instantiation;
-    * otherwise returns null (and, on the host, throws first).
-    */
-    {
-      enum { check_type = ! Impl::is_same< ResultType , void >::value };
-
-      if ( check_type && t != 0 ) {
-
-        // Verify that t->m_verify is this function
-        const function_verify_type self = & TaskMember::template verify_type< ResultType > ;
-
-        if ( t->m_verify != self ) {
-          t = 0 ;
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-          throw_error_verify_type();
-#endif
-        }
-      }
-      return t ;
-    }
-
-  //----------------------------------------
-  /*  Inheritance Requirements on task types:
-   *    typedef  FunctorType::value_type  value_type ;
-   *    class DerivedTaskType
-   *      : public TaskMember< Serial , value_type , FunctorType >
-   *      { ... };
-   *    class TaskMember< Serial , value_type , FunctorType >
-   *      : public TaskMember< Serial , value_type , void >
-   *      , public Functor
-   *      { ... };
-   *  If value_type != void
-   *    class TaskMember< Serial , value_type , void >
-   *      : public TaskMember< Serial , void , void >
-   *
-   *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
-   *
-   */
-
-  /** \brief  Allocate and construct a single-thread task */
-  template< class DerivedTaskType >
-  static
-  TaskMember * create( const typename DerivedTaskType::functor_type &  arg_functor
-                     , const unsigned                                  arg_dependence_capacity )
-    {
-      typedef typename DerivedTaskType::functor_type  functor_type ;
-      typedef typename functor_type::value_type       value_type ;
-
-      DerivedTaskType * const task =
-        new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) ) // placement new into storage from allocate()
-          DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
-                         , & TaskMember::template apply_single< functor_type , value_type >
-                         , sizeof(DerivedTaskType)
-                         , arg_dependence_capacity
-                         , arg_functor );
-
-      return static_cast< TaskMember * >( task );
-    }
-
-  /** \brief  Allocate and construct a data parallel task */
-  template< class DerivedTaskType >
-  static
-  TaskMember * create( const typename DerivedTaskType::policy_type &   arg_policy
-                     , const typename DerivedTaskType::functor_type &  arg_functor
-                     , const unsigned                                  arg_dependence_capacity )
-    {
-      DerivedTaskType * const task =
-        new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) ) // placement new into storage from allocate()
-          DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
-                         , sizeof(DerivedTaskType)
-                         , arg_dependence_capacity
-                         , arg_policy
-                         , arg_functor
-                         );
-
-      return static_cast< TaskMember * >( task );
-    }
-
-  void schedule();
-  static void execute_ready_tasks();
-  static void wait( const Future< void , Kokkos::Serial > & );
-
-  //----------------------------------------
-
-  typedef FutureValueTypeIsVoidError get_result_type ; // a task without a result value has nothing meaningful to return from get()
-
-  KOKKOS_INLINE_FUNCTION
-  get_result_type get() const { return get_result_type() ; }
-
-  KOKKOS_INLINE_FUNCTION
-  Kokkos::TaskState get_state() const { return Kokkos::TaskState( m_state ); }
-
-  //----------------------------------------
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-  static
-  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false );
-#else
-  KOKKOS_INLINE_FUNCTION static
-  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {} // empty body when the host execution space is not active
-#endif
-
-  KOKKOS_INLINE_FUNCTION
-  TaskMember * get_dependence( int i ) const /* Dependence i, or null unless the task is executing and i is in [0,m_dep_size). */
-    { return ( Kokkos::TASK_STATE_EXECUTING == m_state && 0 <= i && i < m_dep_size ) ? m_dep[i] : (TaskMember*) 0 ; }
-
-  KOKKOS_INLINE_FUNCTION
-  int get_dependence() const /* Number of dependences currently recorded. */
-    { return m_dep_size ; }
-
-  KOKKOS_INLINE_FUNCTION
-  void clear_dependence() /* Release every recorded dependence through assign() and reset the count. */
-    {
-      for ( int i = 0 ; i < m_dep_size ; ++i ) assign( m_dep + i , 0 );
-      m_dep_size = 0 ;
-    }
-
-  KOKKOS_INLINE_FUNCTION
-  void add_dependence( TaskMember * before ) /* Append 'before' to the dependences.  Only permitted while the
-    * task is constructing or executing and capacity remains; otherwise
-    * throw_error_add_dependence() is called.
-    */
-    {
-      if ( ( Kokkos::TASK_STATE_CONSTRUCTING == m_state ||
-             Kokkos::TASK_STATE_EXECUTING    == m_state ) &&
-           m_dep_size < m_dep_capacity ) {
-        assign( m_dep + m_dep_size , before );
-        ++m_dep_size ;
-      }
-      else {
-        throw_error_add_dependence();
-      }
-    }
-
-  //----------------------------------------
-
-  template< class FunctorType , class ResultType >
-  KOKKOS_INLINE_FUNCTION static
-  void apply_single( typename Impl::enable_if< ! Impl::is_same< ResultType , void >::value , TaskMember * >::type t ) /* Overload for a non-void ResultType: applies the functor with the address of m_result. */
-    {
-      typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
-
-      // TaskMember< Kokkos::Serial , ResultType , FunctorType >
-      //   : public TaskMember< Kokkos::Serial , ResultType , void >
-      //   , public FunctorType
-      //   { ... };
-
-      derived_type & m = * static_cast< derived_type * >( t );
-
-      Impl::FunctorApply< FunctorType , void , ResultType & >::apply( (FunctorType &) m , & m.m_result );
-    }
-
-  template< class FunctorType , class ResultType >
-  KOKKOS_INLINE_FUNCTION static
-  void apply_single( typename Impl::enable_if< Impl::is_same< ResultType , void >::value , TaskMember * >::type t ) /* Overload for a void ResultType: applies the functor without a result argument. */
-    {
-      typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
-
-      // TaskMember< Kokkos::Serial , ResultType , FunctorType >
-      //   : public TaskMember< Kokkos::Serial , ResultType , void >
-      //   , public FunctorType
-      //   { ... };
-
-      derived_type & m = * static_cast< derived_type * >( t );
-
-      Impl::FunctorApply< FunctorType , void , void >::apply( (FunctorType &) m );
-    }
-};
-
-//----------------------------------------------------------------------------
-/** \brief  Base class for tasks with a result value in the Serial execution space.
- *
- *  The FunctorType must be void because this class is accessed by the
- *  Future class for the task and result value.
- *
- *  Must be derived from TaskMember<S,void,void> 'root class' so the Future class
- *  can correctly static_cast from the 'root class' to this class.
- */
-template < class ResultType >
-class TaskMember< Kokkos::Serial , ResultType , void >
-  : public TaskMember< Kokkos::Serial , void , void >
-{
-public:
-
-  ResultType  m_result ;
-
-  typedef const ResultType & get_result_type ;
-
-  KOKKOS_INLINE_FUNCTION
-  get_result_type get() const { return m_result ; }
-
-protected:
-
-  typedef TaskMember< Kokkos::Serial , void , void >  task_root_type ;
-  typedef task_root_type::function_dealloc_type       function_dealloc_type ;
-  typedef task_root_type::function_apply_type         function_apply_type ;
-
-  inline
-  TaskMember( const function_dealloc_type  arg_dealloc
-            , const function_apply_type    arg_apply
-            , const unsigned               arg_sizeof_derived
-            , const unsigned               arg_dependence_capacity
-            )
-    : task_root_type( & task_root_type::template verify_type< ResultType >
-                    , arg_dealloc
-                    , arg_apply
-                    , arg_sizeof_derived
-                    , arg_dependence_capacity )
-    , m_result()
-    {}
-
-};
-
-template< class ResultType , class FunctorType >
-class TaskMember< Kokkos::Serial , ResultType , FunctorType >
-  : public TaskMember< Kokkos::Serial , ResultType , void >
-  , public FunctorType
-{
-public:
-
-  typedef FunctorType  functor_type ;
-
-  typedef TaskMember< Kokkos::Serial , void , void >        task_root_type ;
-  typedef TaskMember< Kokkos::Serial , ResultType , void >  task_base_type ;
-  typedef task_root_type::function_dealloc_type             function_dealloc_type ;
-  typedef task_root_type::function_apply_type               function_apply_type ;
-
-  inline
-  TaskMember( const function_dealloc_type  arg_dealloc
-            , const function_apply_type    arg_apply
-            , const unsigned               arg_sizeof_derived
-            , const unsigned               arg_dependence_capacity
-            , const functor_type &         arg_functor
-            )
-    : task_base_type( arg_dealloc , arg_apply , arg_sizeof_derived , arg_dependence_capacity )
-    , functor_type( arg_functor )
-    {}
-};
-
-//----------------------------------------------------------------------------
-/** \brief  ForEach task in the Serial execution space
- *
- *  Derived from TaskMember< Kokkos::Serial , ResultType , FunctorType >
- *  so that Functor can be cast to task root type without knowing policy.
- */
-template< class Arg0 , class Arg1 , class Arg2 , class ResultType , class FunctorType >
-class TaskForEach< Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
-                 , ResultType
-                 , FunctorType >
-  : TaskMember< Kokkos::Serial , ResultType , FunctorType >
-{
-public:
-
-  typedef FunctorType                                              functor_type ;
-  typedef RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >       policy_type ;
-
-private:
-
-  friend class Kokkos::TaskPolicy< Kokkos::Serial > ;
-  friend class Kokkos::Impl::TaskMember< Kokkos::Serial , void , void > ;
-
-  typedef TaskMember< Kokkos::Serial , void , void >               task_root_type ;
-  typedef TaskMember< Kokkos::Serial , ResultType , FunctorType >  task_base_type ;
-  typedef task_root_type::function_dealloc_type                    function_dealloc_type ;
-
-  policy_type  m_policy ;
-
-  template< class Tag >
-  inline
-  typename Impl::enable_if< Impl::is_same<Tag,void>::value >::type
-    apply_policy() const
-    {
-      const typename policy_type::member_type e = m_policy.end();
-      for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
-        functor_type::operator()(i);
-      }
-    }
-
-  template< class Tag >
-  inline
-  typename Impl::enable_if< ! Impl::is_same<Tag,void>::value >::type
-    apply_policy() const
-    {
-      const Tag tag ;
-      const typename policy_type::member_type e = m_policy.end();
-      for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
-        functor_type::operator()(tag,i);
-      }
-    }
-
-  static
-  void apply_parallel( task_root_type * t )
-    {
-      static_cast<TaskForEach*>(t)->template apply_policy< typename policy_type::work_tag >();
-
-      task_root_type::template apply_single< functor_type , ResultType >( t );
-    }
-
-  TaskForEach( const function_dealloc_type  arg_dealloc
-             , const int                    arg_sizeof_derived
-             , const int                    arg_dependence_capacity
-             , const policy_type &          arg_policy
-             , const functor_type &         arg_functor
-             )
-    : task_base_type( arg_dealloc
-                    , & apply_parallel
-                    , arg_sizeof_derived
-                    , arg_dependence_capacity
-                    , arg_functor )
-    , m_policy( arg_policy )
-    {}
-
-  TaskForEach() /* = delete */ ;
-  TaskForEach( const TaskForEach & ) /* = delete */ ;
-  TaskForEach & operator = ( const TaskForEach & ) /* = delete */ ;
-};
-
-//----------------------------------------------------------------------------
-/** \brief  Reduce task in the Serial execution space
- *
- *  Derived from TaskMember< Kokkos::Serial , ResultType , FunctorType >
- *  so that Functor can be cast to task root type without knowing policy.
- */
-template< class Arg0 , class Arg1 , class Arg2 , class ResultType , class FunctorType >
-class TaskReduce< Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
-                , ResultType
-                , FunctorType >
-  : TaskMember< Kokkos::Serial , ResultType , FunctorType >
-{
-public:
-
-  typedef FunctorType                                              functor_type ;
-  typedef RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >       policy_type ;
-
-private:
-
-  friend class Kokkos::TaskPolicy< Kokkos::Serial > ;
-  friend class Kokkos::Impl::TaskMember< Kokkos::Serial , void , void > ;
-
-  typedef TaskMember< Kokkos::Serial , void , void >               task_root_type ;
-  typedef TaskMember< Kokkos::Serial , ResultType , FunctorType >  task_base_type ;
-  typedef task_root_type::function_dealloc_type                    function_dealloc_type ;
-
-  policy_type  m_policy ;
-
-  template< class Tag >
-  inline
-  void apply_policy( typename Impl::enable_if< Impl::is_same<Tag,void>::value , ResultType & >::type result ) const
-    {
-      Impl::FunctorValueInit< functor_type , Tag >::init( *this , & result );
-      const typename policy_type::member_type e = m_policy.end();
-      for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
-        functor_type::operator()( i, result );
-      }
-    }
-
-  template< class Tag >
-  inline
-  void apply_policy( typename Impl::enable_if< ! Impl::is_same<Tag,void>::value , ResultType & >::type result ) const
-    {
-      Impl::FunctorValueInit< functor_type , Tag >::init( *this , & result );
-      const Tag tag ;
-      const typename policy_type::member_type e = m_policy.end();
-      for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
-        functor_type::operator()( tag, i, result );
-      }
-    }
-
-  static
-  void apply_parallel( task_root_type * t )
-    {
-      TaskReduce * const task = static_cast<TaskReduce*>(t);
-
-      task->template apply_policy< typename policy_type::work_tag >( task->task_base_type::m_result );
-
-      task_root_type::template apply_single< functor_type , ResultType >( t );
-    }
-
-  TaskReduce( const function_dealloc_type  arg_dealloc
-            , const int                    arg_sizeof_derived
-            , const int                    arg_dependence_capacity
-            , const policy_type &          arg_policy
-            , const functor_type &         arg_functor
-            )
-    : task_base_type( arg_dealloc
-                    , & apply_parallel
-                    , arg_sizeof_derived
-                    , arg_dependence_capacity
-                    , arg_functor )
-    , m_policy( arg_policy )
-    {}
-
-  TaskReduce() /* = delete */ ;
-  TaskReduce( const TaskReduce & ) /* = delete */ ;
-  TaskReduce & operator = ( const TaskReduce & ) /* = delete */ ;
-};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-template<>
-class TaskPolicy< Kokkos::Serial >
-{
-public:
-
-  typedef Kokkos::Serial execution_space ;
-
-private:
-
-  typedef Impl::TaskMember< execution_space , void , void > task_root_type ;
-
-  TaskPolicy & operator = ( const TaskPolicy & ) /* = delete */ ;
-
-  template< class FunctorType >
-  static inline
-  const task_root_type * get_task_root( const FunctorType * f )
-    {
-      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
-      return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
-    }
-
-  template< class FunctorType >
-  static inline
-  task_root_type * get_task_root( FunctorType * f )
-    {
-      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
-      return static_cast< task_root_type * >( static_cast< task_type * >(f) );
-    }
-
-  const unsigned m_default_dependence_capacity ;
-
-public:
-
-  KOKKOS_INLINE_FUNCTION
-  TaskPolicy() : m_default_dependence_capacity(4) {}
-
-  KOKKOS_INLINE_FUNCTION
-  TaskPolicy( const TaskPolicy & rhs ) : m_default_dependence_capacity( rhs.m_default_dependence_capacity ) {}
-
-  KOKKOS_INLINE_FUNCTION
-  explicit
-  TaskPolicy( const unsigned arg_default_dependence_capacity )
-    : m_default_dependence_capacity( arg_default_dependence_capacity ) {}
-
-  KOKKOS_INLINE_FUNCTION
-  TaskPolicy( const TaskPolicy &
-            , const unsigned arg_default_dependence_capacity )
-    : m_default_dependence_capacity( arg_default_dependence_capacity ) {}
-
-  //----------------------------------------
-
-  template< class ValueType >
-  KOKKOS_INLINE_FUNCTION
-  const Future< ValueType , execution_space > &
-    spawn( const Future< ValueType , execution_space > & f ) const
-      {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-        f.m_task->schedule();
-#endif
-        return f ;
-      }
-
-  // Create single-thread task
-
-  template< class FunctorType >
-  KOKKOS_INLINE_FUNCTION
-  Future< typename FunctorType::value_type , execution_space >
-  create( const FunctorType & functor
-        , const unsigned dependence_capacity = ~0u ) const
-    {
-      typedef typename FunctorType::value_type value_type ;
-      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
-      return Future< value_type , execution_space >(
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-        task_root_type::create< task_type >(
-          functor , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
-#endif
-        );
-    }
-
-  // Create parallel foreach task
-
-  template< class PolicyType , class FunctorType >
-  KOKKOS_INLINE_FUNCTION
-  Future< typename FunctorType::value_type , execution_space >
-  create_foreach( const PolicyType  & policy
-                , const FunctorType & functor
-                , const unsigned      dependence_capacity = ~0u ) const
-    {
-      typedef typename FunctorType::value_type value_type ;
-      typedef Impl::TaskForEach< PolicyType , value_type , FunctorType > task_type ;
-      return Future< value_type , execution_space >(
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-        task_root_type::create< task_type >( policy , functor ,
-          ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
-#endif
-       );
-    }
-
-  // Create parallel reduce task
-
-  template< class PolicyType , class FunctorType >
-  KOKKOS_INLINE_FUNCTION
-  Future< typename FunctorType::value_type , execution_space >
-  create_reduce( const PolicyType  & policy
-               , const FunctorType & functor
-               , const unsigned      dependence_capacity = ~0u ) const
-    {
-      typedef typename FunctorType::value_type value_type ;
-      typedef Impl::TaskReduce< PolicyType , value_type , FunctorType > task_type ;
-      return Future< value_type , execution_space >(
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-        task_root_type::create< task_type >( policy , functor ,
-          ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
-#endif
-        );
-    }
-
-  // Add dependence
-  template< class A1 , class A2 , class A3 , class A4 >
-  KOKKOS_INLINE_FUNCTION
-  void add_dependence( const Future<A1,A2> & after
-                     , const Future<A3,A4> & before
-                     , typename Impl::enable_if
-                        < Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
-                          &&
-                          Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
-                        >::type * = 0
-                      ) const
-    {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      after.m_task->add_dependence( before.m_task );
-#endif
-    }
-
-  //----------------------------------------
-  // Functions for an executing task functor to query dependences,
-  // set new dependences, and respawn itself.
-
-  template< class FunctorType >
-  KOKKOS_INLINE_FUNCTION
-  Future< void , execution_space >
-  get_dependence( const FunctorType * task_functor , int i ) const
-    {
-      return Future<void,execution_space>(
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-        get_task_root(task_functor)->get_dependence(i)
-#endif
-        );
-    }
-
-  template< class FunctorType >
-  KOKKOS_INLINE_FUNCTION
-  int get_dependence( const FunctorType * task_functor ) const
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return get_task_root(task_functor)->get_dependence(); }
-#else
-    { return 0 ; }
-#endif
-
-  template< class FunctorType >
-  KOKKOS_INLINE_FUNCTION
-  void clear_dependence( FunctorType * task_functor ) const
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { get_task_root(task_functor)->clear_dependence(); }
-#else
-    {}
-#endif
-
-  template< class FunctorType , class A3 , class A4 >
-  KOKKOS_INLINE_FUNCTION
-  void add_dependence( FunctorType * task_functor
-                     , const Future<A3,A4> & before
-                     , typename Impl::enable_if
-                        < Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
-                        >::type * = 0
-                      ) const
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { get_task_root(task_functor)->add_dependence( before.m_task ); }
-#else
-    {}
-#endif
-
-  template< class FunctorType >
-  KOKKOS_INLINE_FUNCTION
-  void respawn( FunctorType * task_functor ) const
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { get_task_root(task_functor)->schedule(); }
-#else
-    {}
-#endif
-};
-
-inline
-void wait( TaskPolicy< Kokkos::Serial > & )
-{ Impl::TaskMember< Kokkos::Serial , void , void >::execute_ready_tasks(); }
-
-inline
-void wait( const Future< void , Kokkos::Serial > & future )
-{ Impl::TaskMember< Kokkos::Serial , void , void >::wait( future ); }
-
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-
-#endif /* defined( KOKKOS_HAVE_SERIAL ) */
-#endif /* #define KOKKOS_SERIAL_TASK_HPP */
-
--- a/lib/kokkos/core/src/impl/Kokkos_Shape.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Shape.cpp
@ -1,178 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-
-#include <sstream>
-#include <impl/Kokkos_Error.hpp>
-#include <impl/Kokkos_Shape.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-void assert_counts_are_equal_throw(
-  const size_t x_count ,
-  const size_t y_count )
-{
-  std::ostringstream msg ;
-
-  msg << "Kokkos::Impl::assert_counts_are_equal_throw( "
-      << x_count << " != " << y_count << " )" ;
-
-  throw_runtime_exception( msg.str() );
-}
-
-void assert_shapes_are_equal_throw(
-  const unsigned x_scalar_size ,
-  const unsigned x_rank ,
-  const size_t   x_N0 , const unsigned x_N1 ,
-  const unsigned x_N2 , const unsigned x_N3 ,
-  const unsigned x_N4 , const unsigned x_N5 ,
-  const unsigned x_N6 , const unsigned x_N7 ,
-
-  const unsigned y_scalar_size ,
-  const unsigned y_rank ,
-  const size_t   y_N0 , const unsigned y_N1 ,
-  const unsigned y_N2 , const unsigned y_N3 ,
-  const unsigned y_N4 , const unsigned y_N5 ,
-  const unsigned y_N6 , const unsigned y_N7 )
-{
-  std::ostringstream msg ;
-
-  msg << "Kokkos::Impl::assert_shape_are_equal_throw( {"
-      << " scalar_size(" << x_scalar_size
-      << ") rank(" << x_rank
-      << ") dimension(" ;
-  if ( 0 < x_rank ) { msg << " " << x_N0 ; }
-  if ( 1 < x_rank ) { msg << " " << x_N1 ; }
-  if ( 2 < x_rank ) { msg << " " << x_N2 ; }
-  if ( 3 < x_rank ) { msg << " " << x_N3 ; }
-  if ( 4 < x_rank ) { msg << " " << x_N4 ; }
-  if ( 5 < x_rank ) { msg << " " << x_N5 ; }
-  if ( 6 < x_rank ) { msg << " " << x_N6 ; }
-  if ( 7 < x_rank ) { msg << " " << x_N7 ; }
-  msg << " ) } != { "
-      << " scalar_size(" << y_scalar_size
-      << ") rank(" << y_rank
-      << ") dimension(" ;
-  if ( 0 < y_rank ) { msg << " " << y_N0 ; }
-  if ( 1 < y_rank ) { msg << " " << y_N1 ; }
-  if ( 2 < y_rank ) { msg << " " << y_N2 ; }
-  if ( 3 < y_rank ) { msg << " " << y_N3 ; }
-  if ( 4 < y_rank ) { msg << " " << y_N4 ; }
-  if ( 5 < y_rank ) { msg << " " << y_N5 ; }
-  if ( 6 < y_rank ) { msg << " " << y_N6 ; }
-  if ( 7 < y_rank ) { msg << " " << y_N7 ; }
-  msg << " ) } )" ;
-
-  throw_runtime_exception( msg.str() );
-}
-
-void AssertShapeBoundsAbort< Kokkos::HostSpace >::apply(
-  const size_t rank ,
-  const size_t n0 , const size_t n1 , 
-  const size_t n2 , const size_t n3 ,
-  const size_t n4 , const size_t n5 ,
-  const size_t n6 , const size_t n7 ,
-
-  const size_t arg_rank ,
-  const size_t i0 , const size_t i1 ,
-  const size_t i2 , const size_t i3 ,
-  const size_t i4 , const size_t i5 ,
-  const size_t i6 , const size_t i7 )
-{
-  std::ostringstream msg ;
-  msg << "Kokkos::Impl::AssertShapeBoundsAbort( shape = {" ;
-  if ( 0 < rank ) { msg << " " << n0 ; }
-  if ( 1 < rank ) { msg << " " << n1 ; }
-  if ( 2 < rank ) { msg << " " << n2 ; }
-  if ( 3 < rank ) { msg << " " << n3 ; }
-  if ( 4 < rank ) { msg << " " << n4 ; }
-  if ( 5 < rank ) { msg << " " << n5 ; }
-  if ( 6 < rank ) { msg << " " << n6 ; }
-  if ( 7 < rank ) { msg << " " << n7 ; }
-  msg << " } index = {" ;
-  if ( 0 < arg_rank ) { msg << " " << i0 ; }
-  if ( 1 < arg_rank ) { msg << " " << i1 ; }
-  if ( 2 < arg_rank ) { msg << " " << i2 ; }
-  if ( 3 < arg_rank ) { msg << " " << i3 ; }
-  if ( 4 < arg_rank ) { msg << " " << i4 ; }
-  if ( 5 < arg_rank ) { msg << " " << i5 ; }
-  if ( 6 < arg_rank ) { msg << " " << i6 ; }
-  if ( 7 < arg_rank ) { msg << " " << i7 ; }
-  msg << " } )" ;
-
-  throw_runtime_exception( msg.str() );
-}
-
-void assert_shape_effective_rank1_at_leastN_throw(
-  const size_t x_rank , const size_t x_N0 ,
-  const size_t x_N1 ,   const size_t x_N2 ,
-  const size_t x_N3 ,   const size_t x_N4 ,
-  const size_t x_N5 ,   const size_t x_N6 ,
-  const size_t x_N7 ,
-  const size_t N0 )
-{
-  std::ostringstream msg ;
-
-  msg << "Kokkos::Impl::assert_shape_effective_rank1_at_leastN_throw( shape = {" ;
-  if ( 0 < x_rank ) { msg << " " << x_N0 ; }
-  if ( 1 < x_rank ) { msg << " " << x_N1 ; }
-  if ( 2 < x_rank ) { msg << " " << x_N2 ; }
-  if ( 3 < x_rank ) { msg << " " << x_N3 ; }
-  if ( 4 < x_rank ) { msg << " " << x_N4 ; }
-  if ( 5 < x_rank ) { msg << " " << x_N5 ; }
-  if ( 6 < x_rank ) { msg << " " << x_N6 ; }
-  if ( 7 < x_rank ) { msg << " " << x_N7 ; }
-  msg << " } N = " << N0 << " )" ;
-
-  throw_runtime_exception( msg.str() );
-}
-
-
-
-}
-}
-
--- a/lib/kokkos/core/src/impl/Kokkos_Shape.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Shape.hpp
@ -1,917 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_SHAPE_HPP
-#define KOKKOS_SHAPE_HPP
-
-#include <typeinfo>
-#include <utility>
-#include <Kokkos_Core_fwd.hpp>
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_StaticAssert.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-/** \brief  The shape of a Kokkos with dynamic and static dimensions.
- *          Dynamic dimensions are member values and static dimensions are
- *          'static const' values.
- *
- *  The upper bound on the array rank is eight.
- */
-template< unsigned ScalarSize ,
-          unsigned Rank ,
-          unsigned s0  = 1 ,
-          unsigned s1  = 1 ,
-          unsigned s2  = 1 ,
-          unsigned s3  = 1 ,
-          unsigned s4  = 1 ,
-          unsigned s5  = 1 ,
-          unsigned s6  = 1 ,
-          unsigned s7  = 1 >
-struct Shape ;
-
-//----------------------------------------------------------------------------
-/** \brief  Shape equality if the value type, layout, and dimensions
- *          are equal.
- */
-template< unsigned xSize , unsigned xRank ,
-          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
-          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
-
-          unsigned ySize , unsigned yRank ,
-          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
-          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
-KOKKOS_INLINE_FUNCTION
-bool operator == ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
-                   const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
-{
-  enum { same_size = xSize == ySize };
-  enum { same_rank = xRank == yRank };
-
-  return same_size && same_rank &&
-         size_t( x.N0 )   == size_t( y.N0 ) &&
-         unsigned( x.N1 ) == unsigned( y.N1 ) &&
-         unsigned( x.N2 ) == unsigned( y.N2 ) &&
-         unsigned( x.N3 ) == unsigned( y.N3 ) &&
-         unsigned( x.N4 ) == unsigned( y.N4 ) &&
-         unsigned( x.N5 ) == unsigned( y.N5 ) &&
-         unsigned( x.N6 ) == unsigned( y.N6 ) &&
-         unsigned( x.N7 ) == unsigned( y.N7 ) ;
-}
-
-template< unsigned xSize , unsigned xRank ,
-          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
-          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
-
-          unsigned ySize ,unsigned yRank ,
-          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
-          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
-KOKKOS_INLINE_FUNCTION
-bool operator != ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
-                   const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
-{ return ! operator == ( x , y ); }
-
-//----------------------------------------------------------------------------
-
-void assert_counts_are_equal_throw(
-  const size_t x_count ,
-  const size_t y_count );
-
-inline
-void assert_counts_are_equal(
-  const size_t x_count ,
-  const size_t y_count )
-{
-  if ( x_count != y_count ) {
-    assert_counts_are_equal_throw( x_count , y_count );
-  }
-}
-
-void assert_shapes_are_equal_throw(
-  const unsigned x_scalar_size ,
-  const unsigned x_rank ,
-  const size_t   x_N0 , const unsigned x_N1 ,
-  const unsigned x_N2 , const unsigned x_N3 ,
-  const unsigned x_N4 , const unsigned x_N5 ,
-  const unsigned x_N6 , const unsigned x_N7 ,
-
-  const unsigned y_scalar_size ,
-  const unsigned y_rank ,
-  const size_t   y_N0 , const unsigned y_N1 ,
-  const unsigned y_N2 , const unsigned y_N3 ,
-  const unsigned y_N4 , const unsigned y_N5 ,
-  const unsigned y_N6 , const unsigned y_N7 );
-
-template< unsigned xSize , unsigned xRank ,
-          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
-          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
-
-          unsigned ySize , unsigned yRank ,
-          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
-          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
-inline
-void assert_shapes_are_equal(
-  const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
-  const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
-{
-  typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> x_type ;
-  typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> y_type ;
-
-  if ( x != y ) {
-    assert_shapes_are_equal_throw(
-      x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7,
-      y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 );
-  }
-}
-
-template< unsigned xSize , unsigned xRank ,
-          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
-          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
-
-          unsigned ySize , unsigned yRank ,
-          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
-          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
-void assert_shapes_equal_dimension(
-  const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
-  const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
-{
-  typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> x_type ;
-  typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> y_type ;
-
-  // Omit comparison of scalar_size.
-  if ( unsigned( x.rank ) != unsigned( y.rank ) ||
-       size_t( x.N0 )   != size_t( y.N0 ) || 
-       unsigned( x.N1 ) != unsigned( y.N1 ) || 
-       unsigned( x.N2 ) != unsigned( y.N2 ) || 
-       unsigned( x.N3 ) != unsigned( y.N3 ) ||
-       unsigned( x.N4 ) != unsigned( y.N4 ) || 
-       unsigned( x.N5 ) != unsigned( y.N5 ) || 
-       unsigned( x.N6 ) != unsigned( y.N6 ) || 
-       unsigned( x.N7 ) != unsigned( y.N7 ) ) {
-    assert_shapes_are_equal_throw(
-      x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7,
-      y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 );
-  }
-}
-
-//----------------------------------------------------------------------------
-/** \brief  Compile-time rank assertions on a Shape type.
- *
- *  The primary templates are only declared.  The definitions visible
- *  here cover Shape<Size,0> (rank zero) and Shape<Size,1,s0> (rank one),
- *  both deriving from true_type.
- *  NOTE(review): presumably using either assertion with any other shape
- *  is meant to fail to compile because the primary template is
- *  incomplete; confirm no further specializations exist elsewhere.
- */
-template< class ShapeType > struct assert_shape_is_rank_zero ;
-template< class ShapeType > struct assert_shape_is_rank_one ;
-
-template< unsigned Size >
-struct assert_shape_is_rank_zero< Shape<Size,0> >
-  : public true_type {};
-
-template< unsigned Size , unsigned s0 >
-struct assert_shape_is_rank_one< Shape<Size,1,s0> >
-  : public true_type {};
-
-//----------------------------------------------------------------------------
-
-/** \brief  Array bounds assertion templated on the execution space
- *          to allow device-specific abort code.
- *
- *  apply() receives the shape's rank and eight extents (n0..n7)
- *  followed by the number of supplied indices (arg_rank) and the
- *  eight index values (i0..i7).
- */
-template< class Space >
-struct AssertShapeBoundsAbort ;
-/** \brief  Kokkos::HostSpace specialization.
- *
- *  apply() is only declared here, without KOKKOS_INLINE_FUNCTION;
- *  its definition is not part of this header section.
- */
-template<>
-struct AssertShapeBoundsAbort< Kokkos::HostSpace >
-{
-  static void apply( const size_t rank ,
-                     const size_t n0 , const size_t n1 ,
-                     const size_t n2 , const size_t n3 ,
-                     const size_t n4 , const size_t n5 ,
-                     const size_t n6 , const size_t n7 ,
-                     const size_t arg_rank ,
-                     const size_t i0 , const size_t i1 ,
-                     const size_t i2 , const size_t i3 ,
-                     const size_t i4 , const size_t i5 ,
-                     const size_t i6 , const size_t i7 );
-};
-
-/** \brief  Generic definition used for every space without its own
- *          specialization: forwards all arguments, unchanged and in
- *          order, to the Kokkos::HostSpace specialization.
- */
-template< class ExecutionSpace >
-struct AssertShapeBoundsAbort
-{
-  KOKKOS_INLINE_FUNCTION
-  static void apply( const size_t rank ,
-                     const size_t n0 , const size_t n1 ,
-                     const size_t n2 , const size_t n3 ,
-                     const size_t n4 , const size_t n5 ,
-                     const size_t n6 , const size_t n7 ,
-                     const size_t arg_rank ,
-                     const size_t i0 , const size_t i1 ,
-                     const size_t i2 , const size_t i3 ,
-                     const size_t i4 , const size_t i5 ,
-                     const size_t i6 , const size_t i7 )
-    {
-      typedef AssertShapeBoundsAbort< Kokkos::HostSpace > host_abort ;
-
-      host_abort::apply( rank ,
-                         n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 ,
-                         arg_rank ,
-                         i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
-    }
-};
-
-/** \brief  Bounds check of up to eight indices against a shape.
- *
- *  \param shape     shape providing rank and the extents N0..N7
- *  \param arg_rank  number of indices the caller supplied
- *  \param i0..i7    index values; i1..i7 default to 0
- *
- *  If fewer indices than the shape's rank were supplied, or any index
- *  is not strictly below its extent, AssertShapeBoundsAbort::apply is
- *  invoked for Kokkos::Impl::ActiveExecutionMemorySpace.
- */
-template< class ShapeType >
-KOKKOS_INLINE_FUNCTION
-void assert_shape_bounds( const ShapeType & shape ,
-                          const size_t arg_rank ,
-                          const size_t i0 ,
-                          const size_t i1 = 0 ,
-                          const size_t i2 = 0 ,
-                          const size_t i3 = 0 ,
-                          const size_t i4 = 0 ,
-                          const size_t i5 = 0 ,
-                          const size_t i6 = 0 ,
-                          const size_t i7 = 0 )
-{
-  // A violation is either too few indices for the shape's rank
-  // or an index that reaches or exceeds its extent.
-  const bool violation = arg_rank < ShapeType::rank ||
-                         shape.N0 <= i0 ||
-                         shape.N1 <= i1 ||
-                         shape.N2 <= i2 ||
-                         shape.N3 <= i3 ||
-                         shape.N4 <= i4 ||
-                         shape.N5 <= i5 ||
-                         shape.N6 <= i6 ||
-                         shape.N7 <= i7 ;
-
-  if ( violation ) {
-    typedef AssertShapeBoundsAbort< Kokkos::Impl::ActiveExecutionMemorySpace >
-      abort_type ;
-
-    abort_type::apply( ShapeType::rank ,
-                       shape.N0 , shape.N1 , shape.N2 , shape.N3 ,
-                       shape.N4 , shape.N5 , shape.N6 , shape.N7 ,
-                       arg_rank , i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
-  }
-}
-/* Bounds-check macros, one per index count N = 1..8.
- *
- * With KOKKOS_EXPRESSION_CHECK defined, KOKKOS_ASSERT_SHAPE_BOUNDS_N(S,I0,...)
- * expands to assert_shape_bounds(S,N,I0,...) including a trailing
- * semicolon.  Otherwise each macro expands to an empty comment, so the
- * index expressions are not evaluated at all.
- */
-#if defined( KOKKOS_EXPRESSION_CHECK )
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) assert_shape_bounds(S,1,I0);
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) assert_shape_bounds(S,2,I0,I1);
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) assert_shape_bounds(S,3,I0,I1,I2);
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) assert_shape_bounds(S,4,I0,I1,I2,I3);
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) assert_shape_bounds(S,5,I0,I1,I2,I3,I4);
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) assert_shape_bounds(S,6,I0,I1,I2,I3,I4,I5);
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) assert_shape_bounds(S,7,I0,I1,I2,I3,I4,I5,I6);
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) assert_shape_bounds(S,8,I0,I1,I2,I3,I4,I5,I6,I7);
-#else
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) /* */
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) /* */
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) /* */
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) /* */
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) /* */
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) /* */
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) /* */
-#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) /* */
-#endif
-
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-// Specialization and optimization for the Rank 0 shape.
-/** \brief  Shape of rank 0: matches Rank == 0 with all eight extent
- *          arguments equal to 1.
- *
- *  Every member is a compile-time enum (rank, rank_dynamic and N0..N7),
- *  so the type has no data members.  assign() accepts up to eight
- *  unnamed extents, each defaulting to 0, and ignores all of them.
- */
-template < unsigned ScalarSize >
-struct Shape< ScalarSize , 0, 1,1,1,1, 1,1,1,1 >
-{
-  enum { scalar_size   = ScalarSize };
-  enum { rank_dynamic = 0 };
-  enum { rank         = 0 };
-
-  enum { N0 = 1 };
-  enum { N1 = 1 };
-  enum { N2 = 1 };
-  enum { N3 = 1 };
-  enum { N4 = 1 };
-  enum { N5 = 1 };
-  enum { N6 = 1 };
-  enum { N7 = 1 };
-
-  KOKKOS_INLINE_FUNCTION
-  static
-  void assign( Shape & ,
-               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
-               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
-  {}
-};
-
-//----------------------------------------------------------------------------
-/** \brief  Set a single runtime extent of a shape through a constructor.
- *
- *  The macro below generates assign_shape_dimension<R> for R = 0..7 and
- *  is undefined again afterwards.  The templated constructor takes a
- *  shape and a value n and executes shape.N<R> = n.  The type of n is
- *  spelled through Impl::enable_if with the condition
- *  R < ShapeType::rank_dynamic and the result type size_t.
- *  NOTE(review): presumably Impl::enable_if behaves like std::enable_if,
- *  making the constructor unusable for extents that are not runtime
- *  members of ShapeType; confirm against its definition.
- */
-template< unsigned R > struct assign_shape_dimension ;
-
-#define KOKKOS_ASSIGN_SHAPE_DIMENSION( R ) \
-template<> \
-struct assign_shape_dimension< R > \
-{ \
-  template< class ShapeType > \
-  KOKKOS_INLINE_FUNCTION \
-  assign_shape_dimension( ShapeType & shape \
-                        , typename Impl::enable_if<( R < ShapeType::rank_dynamic ), size_t >::type n \
-                        ) { shape.N ## R = n ; } \
-};
-
-KOKKOS_ASSIGN_SHAPE_DIMENSION(0)
-KOKKOS_ASSIGN_SHAPE_DIMENSION(1)
-KOKKOS_ASSIGN_SHAPE_DIMENSION(2)
-KOKKOS_ASSIGN_SHAPE_DIMENSION(3)
-KOKKOS_ASSIGN_SHAPE_DIMENSION(4)
-KOKKOS_ASSIGN_SHAPE_DIMENSION(5)
-KOKKOS_ASSIGN_SHAPE_DIMENSION(6)
-KOKKOS_ASSIGN_SHAPE_DIMENSION(7)
-
-#undef KOKKOS_ASSIGN_SHAPE_DIMENSION
-
-//----------------------------------------------------------------------------
-// All-static dimension array
-/** \brief  Primary Shape definition: every extent is a compile-time
- *          value taken from the template arguments s0..s7.
- *
- *  rank_dynamic is 0 and the type has no data members.  assign()
- *  accepts up to eight unnamed extents, each defaulting to 0, and
- *  ignores all of them.
- */
-template < unsigned ScalarSize ,
-           unsigned Rank ,
-           unsigned s0 ,
-           unsigned s1 ,
-           unsigned s2 ,
-           unsigned s3 ,
-           unsigned s4 ,
-           unsigned s5 ,
-           unsigned s6 ,
-           unsigned s7 >
-struct Shape {
-
-  enum { scalar_size   = ScalarSize };
-  enum { rank_dynamic = 0 };
-  enum { rank         = Rank };
-
-  enum { N0 = s0 };
-  enum { N1 = s1 };
-  enum { N2 = s2 };
-  enum { N3 = s3 };
-  enum { N4 = s4 };
-  enum { N5 = s5 };
-  enum { N6 = s6 };
-  enum { N7 = s7 };
-
-  KOKKOS_INLINE_FUNCTION
-  static
-  void assign( Shape & ,
-               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
-               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
-  {}
-};
-/** \brief  Shape with one runtime extent: matches a leading extent
- *          argument of 0.
- *
- *  N0 is a size_t data member; N1..N7 are compile-time enums taken
- *  from s1..s7.  assign() requires n0, stores it into s.N0 and ignores
- *  the remaining unnamed arguments.
- */
-// 1 == dynamic_rank <= rank <= 8
-template < unsigned ScalarSize ,
-           unsigned Rank ,
-           unsigned s1 ,
-           unsigned s2 ,
-           unsigned s3 ,
-           unsigned s4 ,
-           unsigned s5 ,
-           unsigned s6 ,
-           unsigned s7 >
-struct Shape< ScalarSize , Rank , 0,s1,s2,s3, s4,s5,s6,s7 >
-{
-  enum { scalar_size   = ScalarSize };
-  enum { rank_dynamic = 1 };
-  enum { rank         = Rank };
-
-  size_t N0 ; // For 1 == dynamic_rank allow  N0 > 2^32
-
-  enum { N1 = s1 };
-  enum { N2 = s2 };
-  enum { N3 = s3 };
-  enum { N4 = s4 };
-  enum { N5 = s5 };
-  enum { N6 = s6 };
-  enum { N7 = s7 };
-
-  KOKKOS_INLINE_FUNCTION
-  static
-  void assign( Shape & s ,
-               size_t n0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
-               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
-  { s.N0 = n0 ; }
-};
-/** \brief  Shape with two runtime extents: matches extent arguments
- *          that start with 0,0.
- *
- *  N0 and N1 are unsigned data members; N2..N7 are compile-time enums
- *  taken from s2..s7.  assign() requires n0 and n1, stores them and
- *  ignores the remaining unnamed arguments.
- */
-// 2 == dynamic_rank <= rank <= 8
-template < unsigned ScalarSize , unsigned Rank ,
-           unsigned s2 ,
-           unsigned s3 ,
-           unsigned s4 ,
-           unsigned s5 ,
-           unsigned s6 ,
-           unsigned s7 >
-struct Shape< ScalarSize , Rank , 0,0,s2,s3, s4,s5,s6,s7 >
-{
-  enum { scalar_size   = ScalarSize };
-  enum { rank_dynamic = 2 };
-  enum { rank         = Rank };
-
-  unsigned N0 ;
-  unsigned N1 ;
-
-  enum { N2 = s2 };
-  enum { N3 = s3 };
-  enum { N4 = s4 };
-  enum { N5 = s5 };
-  enum { N6 = s6 };
-  enum { N7 = s7 };
-
-  KOKKOS_INLINE_FUNCTION
-  static
-  void assign( Shape & s ,
-               unsigned n0 , unsigned n1 , unsigned = 0 , unsigned = 0 ,
-               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
-  { s.N0 = n0 ; s.N1 = n1 ; }
-};
-
-// 3 == dynamic_rank <= rank <= 8
-/** \brief  Shape with three runtime extents: matches extent arguments
- *          that start with 0,0,0.
- *
- *  N0..N2 are unsigned data members; N3..N7 are compile-time enums
- *  taken from s3..s7.  assign() requires n0..n2, stores them and
- *  ignores the remaining unnamed arguments.
- *
- *  The template parameters are listed as ScalarSize, Rank to agree
- *  with the sibling specializations; the order of a partial
- *  specialization's own parameter list does not influence matching.
- */
-template < unsigned ScalarSize , unsigned Rank ,
-           unsigned s3 ,
-           unsigned s4 ,
-           unsigned s5 ,
-           unsigned s6 ,
-           unsigned s7 >
-struct Shape< ScalarSize , Rank , 0,0,0,s3, s4,s5,s6,s7>
-{
-  enum { scalar_size   = ScalarSize };
-  enum { rank_dynamic = 3 };
-  enum { rank         = Rank };
-
-  unsigned N0 ;
-  unsigned N1 ;
-  unsigned N2 ;
-
-  enum { N3 = s3 };
-  enum { N4 = s4 };
-  enum { N5 = s5 };
-  enum { N6 = s6 };
-  enum { N7 = s7 };
-
-  KOKKOS_INLINE_FUNCTION
-  static
-  void assign( Shape & s ,
-               unsigned n0 , unsigned n1 , unsigned n2 , unsigned = 0 ,
-               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
-  { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; }
-};
-/** \brief  Shape with four runtime extents: matches extent arguments
- *          that start with 0,0,0,0.
- *
- *  N0..N3 are unsigned data members; N4..N7 are compile-time enums
- *  taken from s4..s7.  assign() requires n0..n3, stores them and
- *  ignores the remaining unnamed arguments.
- */
-// 4 == dynamic_rank <= rank <= 8
-template < unsigned ScalarSize , unsigned Rank ,
-           unsigned s4 ,
-           unsigned s5 ,
-           unsigned s6 ,
-           unsigned s7 >
-struct Shape< ScalarSize , Rank, 0,0,0,0, s4,s5,s6,s7 >
-{
-  enum { scalar_size   = ScalarSize };
-  enum { rank_dynamic = 4 };
-  enum { rank         = Rank };
-
-  unsigned N0 ;
-  unsigned N1 ;
-  unsigned N2 ;
-  unsigned N3 ;
-
-  enum { N4 = s4 };
-  enum { N5 = s5 };
-  enum { N6 = s6 };
-  enum { N7 = s7 };
-
-  KOKKOS_INLINE_FUNCTION
-  static
-  void assign( Shape & s ,
-               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
-               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
-  { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; }
-};
-/** \brief  Shape with five runtime extents: matches extent arguments
- *          that start with five zeros.
- *
- *  N0..N4 are unsigned data members; N5..N7 are compile-time enums
- *  taken from s5..s7.  assign() requires n0..n4, stores them and
- *  ignores the remaining unnamed arguments.
- */
-// 5 == dynamic_rank <= rank <= 8
-template < unsigned ScalarSize , unsigned Rank ,
-           unsigned s5 ,
-           unsigned s6 ,
-           unsigned s7 >
-struct Shape< ScalarSize , Rank , 0,0,0,0, 0,s5,s6,s7 >
-{
-  enum { scalar_size   = ScalarSize };
-  enum { rank_dynamic = 5 };
-  enum { rank         = Rank };
-
-  unsigned N0 ;
-  unsigned N1 ;
-  unsigned N2 ;
-  unsigned N3 ;
-  unsigned N4 ;
-
-  enum { N5 = s5 };
-  enum { N6 = s6 };
-  enum { N7 = s7 };
-
-  KOKKOS_INLINE_FUNCTION
-  static
-  void assign( Shape & s ,
-               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
-               unsigned n4 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
-  { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; s.N4 = n4 ; }
-};
-/** \brief  Shape with six runtime extents: matches extent arguments
- *          that start with six zeros.
- *
- *  N0..N5 are unsigned data members; N6 and N7 are compile-time enums
- *  taken from s6 and s7.  assign() stores n0..n5 and ignores the two
- *  trailing unnamed arguments.
- *  NOTE(review): n5 carries a default of 0 although N5 is a runtime
- *  extent here, whereas the sibling specializations require every
- *  runtime extent.  Confirm whether any caller relies on this default.
- */
-// 6 == dynamic_rank <= rank <= 8
-template < unsigned ScalarSize , unsigned Rank ,
-           unsigned s6 ,
-           unsigned s7 >
-struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,s6,s7 >
-{
-  enum { scalar_size   = ScalarSize };
-  enum { rank_dynamic = 6 };
-  enum { rank         = Rank };
-
-  unsigned N0 ;
-  unsigned N1 ;
-  unsigned N2 ;
-  unsigned N3 ;
-  unsigned N4 ;
-  unsigned N5 ;
-
-  enum { N6 = s6 };
-  enum { N7 = s7 };
-
-  KOKKOS_INLINE_FUNCTION
-  static
-  void assign( Shape & s ,
-               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
-               unsigned n4 , unsigned n5 = 0 , unsigned = 0 , unsigned = 0 )
-  {
-    s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
-    s.N4 = n4 ; s.N5 = n5 ;
-  }
-};
-/** \brief  Shape with seven runtime extents: matches extent arguments
- *          that start with seven zeros.
- *
- *  N0..N6 are unsigned data members; N7 is a compile-time enum taken
- *  from s7.  assign() requires n0..n6, stores them and ignores the
- *  final unnamed argument.
- */
-// 7 == dynamic_rank <= rank <= 8
-template < unsigned ScalarSize , unsigned Rank ,
-           unsigned s7 >
-struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,0,s7 >
-{
-  enum { scalar_size   = ScalarSize };
-  enum { rank_dynamic = 7 };
-  enum { rank         = Rank };
-
-  unsigned N0 ;
-  unsigned N1 ;
-  unsigned N2 ;
-  unsigned N3 ;
-  unsigned N4 ;
-  unsigned N5 ;
-  unsigned N6 ;
-
-  enum { N7 = s7 };
-
-  KOKKOS_INLINE_FUNCTION
-  static
-  void assign( Shape & s ,
-               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
-               unsigned n4 , unsigned n5 , unsigned n6 , unsigned = 0 )
-  {
-    s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
-    s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ;
-  }
-};
-/** \brief  Shape with eight runtime extents: matches Rank == 8 with
- *          all eight extent arguments equal to 0.
- *
- *  N0..N7 are all unsigned data members.  assign() requires all eight
- *  values (no defaults) and stores each of them.
- */
-// 8 == dynamic_rank <= rank <= 8
-template < unsigned ScalarSize >
-struct Shape< ScalarSize , 8 , 0,0,0,0, 0,0,0,0 >
-{
-  enum { scalar_size   = ScalarSize };
-  enum { rank_dynamic = 8 };
-  enum { rank         = 8 };
-
-  unsigned N0 ;
-  unsigned N1 ;
-  unsigned N2 ;
-  unsigned N3 ;
-  unsigned N4 ;
-  unsigned N5 ;
-  unsigned N6 ;
-  unsigned N7 ;
-
-  KOKKOS_INLINE_FUNCTION
-  static
-  void assign( Shape & s ,
-               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
-               unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 )
-  {
-    s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
-    s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ; s.N7 = n7 ;
-  }
-};
-
-//----------------------------------------------------------------------------
-/** \brief  Compute the Shape type obtained by inserting one extra
- *          compile-time extent N into ShapeType.
- *
- *  The third parameter R defaults to ShapeType::rank_dynamic and picks
- *  the specialization; specializations exist for R = 0..7 only.  In the
- *  resulting ::type:
- *    - scalar_size is unchanged and rank is ShapeType::rank + 1;
- *    - the first R extent arguments are 0;
- *    - N occupies extent position R;
- *    - ShapeType's extents N<R>..N6 follow, each moved up by one
- *      position, so ShapeType::N7 is not carried over.
- */
-template< class ShapeType , unsigned N ,
-          unsigned R = ShapeType::rank_dynamic >
-struct ShapeInsert ;
-
-template< class ShapeType , unsigned N >
-struct ShapeInsert< ShapeType , N , 0 >
-{
-  typedef Shape< ShapeType::scalar_size ,
-                 ShapeType::rank + 1 ,
-                 N ,
-                 ShapeType::N0 ,
-                 ShapeType::N1 ,
-                 ShapeType::N2 ,
-                 ShapeType::N3 ,
-                 ShapeType::N4 ,
-                 ShapeType::N5 ,
-                 ShapeType::N6 > type ;
-};
-
-template< class ShapeType , unsigned N >
-struct ShapeInsert< ShapeType , N , 1 >
-{
-  typedef Shape< ShapeType::scalar_size ,
-                 ShapeType::rank + 1 ,
-                 0 ,
-                 N ,
-                 ShapeType::N1 ,
-                 ShapeType::N2 ,
-                 ShapeType::N3 ,
-                 ShapeType::N4 ,
-                 ShapeType::N5 ,
-                 ShapeType::N6 > type ;
-};
-
-template< class ShapeType , unsigned N >
-struct ShapeInsert< ShapeType , N , 2 >
-{
-  typedef Shape< ShapeType::scalar_size ,
-                 ShapeType::rank + 1 ,
-                 0 ,
-                 0 ,
-                 N ,
-                 ShapeType::N2 ,
-                 ShapeType::N3 ,
-                 ShapeType::N4 ,
-                 ShapeType::N5 ,
-                 ShapeType::N6 > type ;
-};
-
-template< class ShapeType , unsigned N >
-struct ShapeInsert< ShapeType , N , 3 >
-{
-  typedef Shape< ShapeType::scalar_size ,
-                 ShapeType::rank + 1 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 N ,
-                 ShapeType::N3 ,
-                 ShapeType::N4 ,
-                 ShapeType::N5 ,
-                 ShapeType::N6 > type ;
-};
-
-template< class ShapeType , unsigned N >
-struct ShapeInsert< ShapeType , N , 4 >
-{
-  typedef Shape< ShapeType::scalar_size ,
-                 ShapeType::rank + 1 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 N ,
-                 ShapeType::N4 ,
-                 ShapeType::N5 ,
-                 ShapeType::N6 > type ;
-};
-
-template< class ShapeType , unsigned N >
-struct ShapeInsert< ShapeType , N , 5 >
-{
-  typedef Shape< ShapeType::scalar_size ,
-                 ShapeType::rank + 1 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 N ,
-                 ShapeType::N5 ,
-                 ShapeType::N6 > type ;
-};
-
-template< class ShapeType , unsigned N >
-struct ShapeInsert< ShapeType , N , 6 >
-{
-  typedef Shape< ShapeType::scalar_size ,
-                 ShapeType::rank + 1 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 N ,
-                 ShapeType::N6 > type ;
-};
-
-template< class ShapeType , unsigned N >
-struct ShapeInsert< ShapeType , N , 7 >
-{
-  typedef Shape< ShapeType::scalar_size ,
-                 ShapeType::rank + 1 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 0 ,
-                 N > type ;
-};
-
-//----------------------------------------------------------------------------
-/** \brief  Compile-time test whether SrcShape is compatible with
- *          DstShape.
- *
- *  The primary template yields value = false.  It is the one used when
- *  DstRankDynamicOK is false, i.e. when DstShape::rank_dynamic is
- *  smaller than SrcShape::rank_dynamic.
- *
- *  For DstRankDynamicOK == true there is one specialization per
- *  DstRankDynamic = 0..8.  Each yields true when the scalar_size values
- *  are equal and the extents from N<DstRankDynamic> through N7 (the
- *  ones that are compile-time in DstShape) are equal, all compared as
- *  unsigned.  The rank members themselves are not compared.
- */
-template< class DstShape , class SrcShape ,
-          unsigned DstRankDynamic   = DstShape::rank_dynamic ,
-          bool     DstRankDynamicOK = unsigned(DstShape::rank_dynamic) >= unsigned(SrcShape::rank_dynamic) >
-struct ShapeCompatible { enum { value = false }; };
-
-template< class DstShape , class SrcShape >
-struct ShapeCompatible< DstShape , SrcShape , 8 , true >
-{
-  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) };
-};
-
-template< class DstShape , class SrcShape >
-struct ShapeCompatible< DstShape , SrcShape , 7 , true >
-{
-  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
-                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
-};
-
-template< class DstShape , class SrcShape >
-struct ShapeCompatible< DstShape , SrcShape , 6 , true >
-{
-  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
-                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
-                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
-};
-
-template< class DstShape , class SrcShape >
-struct ShapeCompatible< DstShape , SrcShape , 5 , true >
-{
-  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
-                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
-                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
-                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
-};
-
-template< class DstShape , class SrcShape >
-struct ShapeCompatible< DstShape , SrcShape , 4 , true >
-{
-  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
-                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
-                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
-                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
-                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
-};
-
-template< class DstShape , class SrcShape >
-struct ShapeCompatible< DstShape , SrcShape , 3 , true >
-{
-  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
-                 unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
-                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
-                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
-                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
-                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
-};
-
-template< class DstShape , class SrcShape >
-struct ShapeCompatible< DstShape , SrcShape , 2 , true >
-{
-  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
-                 unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
-                 unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
-                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
-                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
-                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
-                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
-};
-
-template< class DstShape , class SrcShape >
-struct ShapeCompatible< DstShape , SrcShape , 1 , true >
-{
-  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
-                 unsigned(DstShape::N1) == unsigned(SrcShape::N1) &&
-                 unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
-                 unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
-                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
-                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
-                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
-                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
-};
-
-template< class DstShape , class SrcShape >
-struct ShapeCompatible< DstShape , SrcShape , 0 , true >
-{
-  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
-                 unsigned(DstShape::N0) == unsigned(SrcShape::N0) &&
-                 unsigned(DstShape::N1) == unsigned(SrcShape::N1) &&
-                 unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
-                 unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
-                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
-                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
-                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
-                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
-};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Extent of dimension r of a shape.
- *
- *  \param shape  shape whose extents N0..N7 are queried
- *  \param r      dimension ordinal, compared against 0..7
- *  \return the matching extent, or 1 when r equals none of 0..7
- */
-template< unsigned ScalarSize , unsigned Rank ,
-          unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
-          unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 ,
-          typename iType >
-KOKKOS_INLINE_FUNCTION
-size_t dimension(
-  const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape ,
-  const iType & r )
-{
-  // Ordinals are tested in ascending order; the first match wins.
-  if ( 0 == r ) { return shape.N0 ; }
-  if ( 1 == r ) { return shape.N1 ; }
-  if ( 2 == r ) { return shape.N2 ; }
-  if ( 3 == r ) { return shape.N3 ; }
-  if ( 4 == r ) { return shape.N4 ; }
-  if ( 5 == r ) { return shape.N5 ; }
-  if ( 6 == r ) { return shape.N6 ; }
-  if ( 7 == r ) { return shape.N7 ; }
-
-  return 1 ;
-}
-
-/** \brief  Product of all eight extents N0..N7 of a shape,
- *          accumulated in size_t.
- */
-template< unsigned ScalarSize , unsigned Rank ,
-          unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
-          unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 >
-KOKKOS_INLINE_FUNCTION
-size_t cardinality_count(
-  const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape )
-{
-  // Start from N0 widened to size_t so every multiplication below
-  // is carried out in size_t.
-  size_t count = size_t(shape.N0) ;
-
-  count *= shape.N1 ;
-  count *= shape.N2 ;
-  count *= shape.N3 ;
-  count *= shape.N4 ;
-  count *= shape.N5 ;
-  count *= shape.N6 ;
-  count *= shape.N7 ;
-
-  return count ;
-}
-
-//----------------------------------------------------------------------------
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-#endif /* #ifndef KOKKOS_CORESHAPE_HPP */
-
--- a/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp
@ -1,79 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_STATICASSERT_HPP
-#define KOKKOS_STATICASSERT_HPP
-
-namespace Kokkos {
-namespace Impl {
-/** \brief  Compile-time assertion on a boolean condition.
- *
- *  Only the specialization for true is defined; it exposes 'type'
- *  (T, void by default) and 'value' (true).  The primary template is
- *  declared but not defined in this header, so StaticAssert<false,T>
- *  is an incomplete type here.
- */
-template < bool , class T = void >
-struct StaticAssert ;
-
-template< class T >
-struct StaticAssert< true , T > {
-  typedef T type ;
-  static const bool value = true ;
-};
-/** \brief  Compile-time assertion that two types are identical.
- *
- *  Only StaticAssertSame<A,A> is defined, exposing 'type' = A; the
- *  primary template is declared but not defined in this header.
- */
-template < class A , class B >
-struct StaticAssertSame ;
-
-template < class A >
-struct StaticAssertSame<A,A> { typedef A type ; };
-/** \brief  Compile-time assertion on a pair of types <A,B>.
- *
- *  Defined only for <A,A> (type = A) and for <const A,A>
- *  (type = const A); the primary template is declared but not defined
- *  in this header.
- */
-template < class A , class B >
-struct StaticAssertAssignable ;
-
-template < class A >
-struct StaticAssertAssignable<A,A> { typedef A type ; };
-
-template < class A >
-struct StaticAssertAssignable< const A , A > { typedef const A type ; };
-
-} // namespace Impl
-} // namespace Kokkos
-
-#endif /* KOKKOS_STATICASSERT_HPP */
-
-
--- a/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
@ -1,131 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                             Kokkos
-//         Manycore Performance-Portable Multidimensional Arrays
-//
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TAGS_HPP
-#define KOKKOS_TAGS_HPP
-
-#include <impl/Kokkos_Traits.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-/* Empty tag types, one per concept category (layout, memory space,
- * memory traits, execution policy, execution space).  Nothing in this
- * header refers to them; their users are not visible here.
- */
-struct LayoutTag {};
-
-struct MemorySpaceTag {};
-struct MemoryTraitsTag {};
-
-struct ExecutionPolicyTag {};
-struct ExecutionSpaceTag {};
-/** \brief  Detection traits for memory space, execution space,
- *          execution policy, array layout and memory traits classes.
- *
- *  Each primary template yields false.  The partial specializations
- *  further below are selected through Impl::enable_if_type on the
- *  corresponding nested typedef of C (memory_space, execution_space,
- *  execution_policy, array_layout, memory_traits) and yield true only
- *  when that typedef names C itself.
- *  NOTE(review): assumes Impl::enable_if_type<T>::type is void for any
- *  well-formed T so that it matches the defaulted Enable argument;
- *  confirm in impl/Kokkos_Traits.hpp.
- */
-
-template< class C , class Enable = void >
-struct is_memory_space : public bool_< false > {};
-
-template< class C , class Enable = void >
-struct is_execution_space : public bool_< false > {};
-
-template< class C , class Enable = void >
-struct is_execution_policy : public bool_< false > {};
-
-template< class C , class Enable = void >
-struct is_array_layout : public Impl::false_type {};
-
-template< class C , class Enable = void >
-struct is_memory_traits : public Impl::false_type {};
-
-
-template< class C >
-struct is_memory_space< C , typename Impl::enable_if_type< typename C::memory_space >::type >
-  : public bool_< Impl::is_same< C , typename C::memory_space >::value > {};
-
-template< class C >
-struct is_execution_space< C , typename Impl::enable_if_type< typename C::execution_space >::type >
-  : public bool_< Impl::is_same< C , typename C::execution_space >::value > {};
-
-template< class C >
-struct is_execution_policy< C , typename Impl::enable_if_type< typename C::execution_policy >::type >
-  : public bool_< Impl::is_same< C , typename C::execution_policy >::value > {};
-
-template< class C >
-struct is_array_layout< C , typename Impl::enable_if_type< typename C::array_layout >::type >
-  : public bool_< Impl::is_same< C , typename C::array_layout >::value > {};
-
-template< class C >
-struct is_memory_traits< C , typename Impl::enable_if_type< typename C::memory_traits >::type >
-  : public bool_< Impl::is_same< C , typename C::memory_traits >::value > {};
-
-//----------------------------------------------------------------------------
-/** \brief  Detect whether C is an execution space or a memory space.
- *
- *  The primary template yields false.  The specialization is enabled
- *  when C is the same type as its own nested execution_space or its
- *  own nested memory_space; the condition names both typedefs, so C
- *  must provide both.  It yields true and re-exports execution_space
- *  and memory_space, plus the host_mirror_space described below.
- */
-template< class C , class Enable = void >
-struct is_space : public Impl::false_type {};
-
-template< class C >
-struct is_space< C
-                 , typename Impl::enable_if<(
-                     Impl::is_same< C , typename C::execution_space >::value ||
-                     Impl::is_same< C , typename C::memory_space    >::value
-                   )>::type
-                 >
-  : public Impl::true_type
-{
-  typedef typename C::execution_space  execution_space ;
-  typedef typename C::memory_space     memory_space ;
-
-  // The host_mirror_space defines a space with host-resident memory.
-  // If the execution space's memory space is HostSpace then use that execution space.
-  // Else use the HostSpace.
-  typedef
-    typename Impl::if_c< Impl::is_same< typename execution_space::memory_space , HostSpace >::value , execution_space ,
-    HostSpace >::type
-      host_mirror_space ;
-};
-
-}
-}
-
-#endif
--- a/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
@ -1,115 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov) 
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_IMPLWALLTIME_HPP
-#define KOKKOS_IMPLWALLTIME_HPP
-
-#include <stddef.h>
-
-#ifdef _MSC_VER
-#undef KOKKOS_USE_LIBRT
-#include <gettimeofday.c>
-#else
-#ifdef KOKKOS_USE_LIBRT
-#include <ctime>
-#else
-#include <sys/time.h>
-#endif
-#endif
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Elapsed time since construction or the most recent reset().
- *
- *  With KOKKOS_USE_LIBRT the clock is read with clock_gettime().  The
- *  monotonic clock is used whenever the platform defines CLOCK_MONOTONIC so
- *  that wall-clock adjustments (NTP, manual changes) cannot make seconds()
- *  jump or become negative; otherwise CLOCK_REALTIME is used.
- *  Without KOKKOS_USE_LIBRT the clock is read with gettimeofday().
- *
- *  The class is non-copyable (copy operations are declared but not defined).
- */
-
-class Timer {
-private:
-  #ifdef KOKKOS_USE_LIBRT
-  typedef struct timespec time_type ;
-  #else
-  typedef struct timeval time_type ;
-  #endif
-
-  time_type m_old ; ///< Clock reading taken by the last reset()
-
-  Timer( const Timer & );
-  Timer & operator = ( const Timer & );
-
-  /** \brief  Single place where the clock is read, shared by reset() and seconds(). */
-  static inline
-  void now( time_type & t ) {
-    #ifdef KOKKOS_USE_LIBRT
-      #if defined( CLOCK_MONOTONIC )
-      clock_gettime(CLOCK_MONOTONIC, &t);
-      #else
-      clock_gettime(CLOCK_REALTIME, &t);
-      #endif
-    #else
-      ::gettimeofday( & t , ((struct timezone *) NULL ) );
-    #endif
-  }
-
-public:
-
-  /** \brief  Restart the measurement from the current clock reading. */
-  inline
-  void reset() { now( m_old ); }
-
-  inline
-  ~Timer() {}
-
-  inline
-  Timer() { reset(); }
-
-  /** \brief  Seconds elapsed since construction or the last reset(). */
-  inline
-  double seconds() const
-  {
-    time_type m_new ;
-
-    now( m_new );
-
-    #ifdef KOKKOS_USE_LIBRT
-      return ( (double) ( m_new.tv_sec  - m_old.tv_sec ) ) +
-             ( (double) ( m_new.tv_nsec - m_old.tv_nsec ) * 1.0e-9 );
-    #else
-      return ( (double) ( m_new.tv_sec  - m_old.tv_sec ) ) +
-             ( (double) ( m_new.tv_usec - m_old.tv_usec ) * 1.0e-6 );
-    #endif
-  }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-#endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */
-
--- a/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
@ -1,370 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOSTRAITS_HPP
-#define KOKKOSTRAITS_HPP
-
-#include <stddef.h>
-#include <stdint.h>
-#include <Kokkos_Macros.hpp>
-
-namespace Kokkos {
-namespace Impl {
-
-/* C++11 conformal compile-time type traits utilities.
- * Prefer to use C++11 when portably available.
- */
-//----------------------------------------------------------------------------
-// C++11 Helpers:
-
-/** \brief  Compile-time constant 'v' of type 'T', modelled on C++11
- *          std::integral_constant.
- *
- *  'value' is an enumerator rather than a static data member (see the note
- *  below), so it needs no out-of-class definition.
- */
-template < class T , T v >
-struct integral_constant
-{
-  // Declaration of 'static const' causes an unresolved linker symbol in debug
-  // static const T value = v ;
-  enum { value = T(v) };
-  typedef T value_type;
-  typedef integral_constant<T,v> type;
-  // Const-qualified so that const objects (and temporaries bound to const
-  // references) can be converted too, as with std::integral_constant.
-  KOKKOS_INLINE_FUNCTION operator T() const { return v ; }
-};
-
-// Boolean constants used as base classes by the traits in this file.
-typedef integral_constant<bool,false> false_type ;
-typedef integral_constant<bool,true>  true_type ;
-
-//----------------------------------------------------------------------------
-// C++11 Type relationships:
-
-// is_same<X,Y>: true_type only when X and Y are the identical type.
-template< class X , class Y > struct is_same : public false_type {};
-template< class X >           struct is_same<X,X> : public true_type {};
-
-//----------------------------------------------------------------------------
-// C++11 Type properties:
-
-// is_const<T>: true for 'const T' and also for a reference to const
-// ('const T &').
-// NOTE(review): std::is_const reports false for reference types, so the
-// 'const T &' case is an extension of the C++11 trait -- confirm callers
-// rely on it before replacing this with the standard trait.
-template <typename T> struct is_const : public false_type {};
-template <typename T> struct is_const<const T> : public true_type {};
-template <typename T> struct is_const<const T & > : public true_type {};
-
-// is_array<T>: true for arrays of unknown bound (T[]) and of known bound (T[N]).
-// The bound is deduced as size_t, the type array bounds actually have, so
-// extents that do not fit in 'unsigned' are still recognized.
-template <typename T> struct is_array : public false_type {};
-template <typename T> struct is_array< T[] > : public true_type {};
-template <typename T, size_t N > struct is_array< T[N] > : public true_type {};
-
-//----------------------------------------------------------------------------
-// C++11 Type transformations:
-
-// remove_const<T>: strips a top-level const; for 'const T &' it also strips
-// the const of the referenced type and yields 'T &'.
-// NOTE(review): std::remove_const leaves reference types untouched, so the
-// reference case is an extension -- confirm before swapping in the standard trait.
-template <typename T> struct remove_const { typedef T type; };
-template <typename T> struct remove_const<const T> { typedef T type; };
-template <typename T> struct remove_const<const T & > { typedef T & type; };
-
-// add_const<T>: adds const to T; for a reference, adds const to the
-// referenced type and keeps the reference.  Already-const inputs are unchanged.
-template <typename T> struct add_const { typedef const T type; };
-template <typename T> struct add_const<T & > { typedef const T & type; };
-template <typename T> struct add_const<const T> { typedef const T type; };
-template <typename T> struct add_const<const T & > { typedef const T & type; };
-
-// remove_reference<T>: strips an lvalue reference, preserving the const
-// qualification of the referenced type.
-template <typename T> struct remove_reference { typedef T type ; };
-template <typename T> struct remove_reference< T & > { typedef T type ; };
-template <typename T> struct remove_reference< const T & > { typedef const T type ; };
-
-// remove_extent<T>: strips the outermost array extent (T[] or T[N]) and
-// leaves non-array types unchanged.
-// The bound is deduced as size_t, the type array bounds actually have, so
-// extents that do not fit in 'unsigned' are still matched.
-template <typename T> struct remove_extent { typedef T type ; };
-template <typename T> struct remove_extent<T[]> { typedef T type ; };
-template <typename T, size_t N > struct remove_extent<T[N]> { typedef T type ; };
-
-//----------------------------------------------------------------------------
-// C++11 Other type generators:
-
-// condition<B,T,F>::type is T when B is true and F otherwise.
-template< bool , class T , class F >
-struct condition { typedef F type ; };
-
-template< class T , class F >
-struct condition<true,T,F> { typedef T type ; };
-
-// enable_if<B,T>: only the 'true' case is defined, so naming
-// enable_if<false,T>::type is a substitution failure.  T defaults to void.
-template< bool , class = void >
-struct enable_if ;
-
-template< class T >
-struct enable_if< true , T > { typedef T type ; };
-
-//----------------------------------------------------------------------------
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-// Other traits
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-
-// enable_if_type<X,T>::type is always T (void by default).  The first
-// argument is ignored; it only has to be a well-formed type, which makes
-// this usable to detect whether a nested type exists.
-template< class , class T = void >
-struct enable_if_type { typedef T type ; };
-
-//----------------------------------------------------------------------------
-
-// Shorthands for integral_constant of a fixed value type.
-template< bool B >
-struct bool_ : public integral_constant<bool,B> {};
-
-template< unsigned I >
-struct unsigned_ : public integral_constant<unsigned,I> {};
-
-template< int I >
-struct int_ : public integral_constant<int,I> {};
-
-// Note: these are distinct types from true_type / false_type (they derive
-// from them rather than alias them).
-typedef bool_<true> true_;
-typedef bool_<false> false_;
-//----------------------------------------------------------------------------
-// if_
-
-// if_c<Cond,TrueType,FalseType>: compile-time selection of a type and of a
-// matching run-time value.  This primary template handles the 'false' case
-// (the 'true' case is a partial specialization), so 'type' is FalseType.
-template < bool Cond , typename TrueType , typename FalseType>
-struct if_c
-{
-  enum { value = Cond };
-
-  typedef FalseType type;
-
-
-  // Selected type with any reference and const removed.
-  typedef typename remove_const<
-          typename remove_reference<type>::type >::type value_type ;
-
-  typedef typename add_const<value_type>::type const_value_type ;
-
-  // One-argument select: pass through a value of the selected type.
-  static KOKKOS_INLINE_FUNCTION
-  const_value_type & select( const_value_type & v ) { return v ; }
-
-  static KOKKOS_INLINE_FUNCTION
-  value_type & select( value_type & v ) { return v ; }
-
-  // One-argument select given some other type: returns a reference formed by
-  // dereferencing a null pointer.  NOTE(review): presumably present only so
-  // such calls compile; the result must never be used -- confirm with callers.
-  template< class T >
-  static KOKKOS_INLINE_FUNCTION
-  value_type & select( const T & ) { value_type * ptr(0); return *ptr ; }
-
-
-  // Two-argument select: condition is false, so the second argument is returned.
-  template< class T >
-  static KOKKOS_INLINE_FUNCTION
-  const_value_type & select( const T & , const_value_type & v ) { return v ; }
-
-  template< class T >
-  static KOKKOS_INLINE_FUNCTION
-  value_type & select( const T & , value_type & v ) { return v ; }
-};
-
-// if_c specialization for a true condition: 'type' is TrueType and the
-// two-argument select returns its first argument.
-template <typename TrueType, typename FalseType>
-struct if_c< true , TrueType , FalseType >
-{
-  enum { value = true };
-
-  typedef TrueType type;
-
-
-  // Selected type with any reference and const removed.
-  typedef typename remove_const<
-          typename remove_reference<type>::type >::type value_type ;
-
-  typedef typename add_const<value_type>::type const_value_type ;
-
-  // One-argument select: pass through a value of the selected type.
-  static KOKKOS_INLINE_FUNCTION
-  const_value_type & select( const_value_type & v ) { return v ; }
-
-  static KOKKOS_INLINE_FUNCTION
-  value_type & select( value_type & v ) { return v ; }
-
-  // One-argument select given some other type: returns a reference formed by
-  // dereferencing a null pointer.  NOTE(review): presumably present only so
-  // such calls compile; the result must never be used -- confirm with callers.
-  template< class T >
-  static KOKKOS_INLINE_FUNCTION
-  value_type & select( const T & ) { value_type * ptr(0); return *ptr ; }
-
-
-  // Two-argument select: condition is true, so the first argument is returned.
-  template< class F >
-  static KOKKOS_INLINE_FUNCTION
-  const_value_type & select( const_value_type & v , const F & ) { return v ; }
-
-  template< class F >
-  static KOKKOS_INLINE_FUNCTION
-  value_type & select( value_type & v , const F & ) { return v ; }
-};
-
-// if_c specializations for the cases where the selected type is void:
-// they expose only 'value', 'type' and 'value_type' and have no select()
-// members (references to void cannot be formed).
-template< typename TrueType >
-struct if_c< false , TrueType , void >
-{
-  enum { value = false };
-
-  typedef void type ;
-  typedef void value_type ;
-};
-
-template< typename FalseType >
-struct if_c< true , void , FalseType >
-{
-  enum { value = true };
-
-  typedef void type ;
-  typedef void value_type ;
-};
-
-// if_<Cond,...>: same as if_c, taking the condition from Cond::value.
-template <typename Cond, typename TrueType, typename FalseType>
-struct if_ : public if_c<Cond::value, TrueType, FalseType> {};
-
-//----------------------------------------------------------------------------
-
-// is_integral<T>: true when T is exactly one of the integer types listed.
-// Allows aliased types: the fixed-width typedefs are listed in addition to
-// the built-in spellings, and duplicates are harmless inside the '||' chain.
-// Only exact matches count: cv-qualified types, references, and types not
-// listed here (for example bool) yield false.
-template< typename T >
-struct is_integral : public integral_constant< bool ,
-  (
-    Impl::is_same< T ,          char >::value ||
-    Impl::is_same< T , unsigned char >::value ||
-    Impl::is_same< T ,          short int >::value ||
-    Impl::is_same< T , unsigned short int >::value ||
-    Impl::is_same< T ,          int >::value ||
-    Impl::is_same< T , unsigned int >::value ||
-    Impl::is_same< T ,          long int >::value ||
-    Impl::is_same< T , unsigned long int >::value ||
-    Impl::is_same< T ,          long long int >::value ||
-    Impl::is_same< T , unsigned long long int >::value ||
-
-    Impl::is_same< T , int8_t   >::value ||
-    Impl::is_same< T , int16_t  >::value ||
-    Impl::is_same< T , int32_t  >::value ||
-    Impl::is_same< T , int64_t  >::value ||
-    Impl::is_same< T , uint8_t  >::value ||
-    Impl::is_same< T , uint16_t >::value ||
-    Impl::is_same< T , uint32_t >::value ||
-    Impl::is_same< T , uint64_t >::value 
-  )>
-{};
-
-//----------------------------------------------------------------------------
-
-
-// is_power_of_two<N>::value is nonzero when N is positive and has exactly
-// one bit set.
-template < size_t N >
-struct is_power_of_two
-{
-  enum type { value = (N > 0) && !(N & (N-1)) };
-};
-
-// power_of_two<N>::value is the exponent p with N == 2^p.  Only the
-// 'OK == true' case is defined, so using it with an N that is not a power
-// of two is a compile-time error (incomplete type).
-template < size_t N , bool OK = is_power_of_two<N>::value >
-struct power_of_two ;
-
-// Recursive step: one more than the exponent of N/2.
-template < size_t N >
-struct power_of_two<N,true>
-{
-  enum type { value = 1+ power_of_two<(N>>1),true>::value };
-};
-
-// Terminating cases of the recursion.
-template <>
-struct power_of_two<2,true>
-{
-  enum type { value = 1 };
-};
-
-template <>
-struct power_of_two<1,true>
-{
-  enum type { value = 0 };
-};
-
-/** \brief  Exponent p such that N == 2^p when N is an exact power of two,
- *          otherwise ~0u.
- *
- *  The exponent is obtained from a find-first-set intrinsic where one is
- *  available for the compiler, with a portable bit-scan loop as fallback.
- */
-static KOKKOS_FORCEINLINE_FUNCTION
-unsigned power_of_two_if_valid( const unsigned N )
-{
-  // Exactly one bit set <=> nonzero and clearing the lowest bit leaves zero.
-  const bool single_bit = ( N != 0u ) && ( ( N & ( N - 1 ) ) == 0u );
-
-  if ( ! single_bit ) { return ~0u ; }
-
-#if defined( __CUDA_ARCH__ )
-  // __ffs is one-based.
-  return __ffs(N) - 1 ;
-#elif defined( __GNUC__ ) || defined( __GNUG__ )
-  // __builtin_ffs is one-based.
-  return __builtin_ffs(N) - 1 ;
-#elif defined( __INTEL_COMPILER )
-  return _bit_scan_forward(N);
-#else
-  unsigned exponent = 0 ;
-  for ( unsigned bit = 1 ; ! ( N & bit ) ; bit <<= 1 ) { ++exponent ; }
-  return exponent ;
-#endif
-}
-
-//----------------------------------------------------------------------------
-
-// integral_nonzero_constant<T,v>: when v is nonzero the value is a
-// compile-time enumerator and the constructor argument is ignored; when v is
-// zero (specialization below) the value is stored at run time from the
-// constructor argument.
-template< typename T , T v , bool NonZero = ( v != T(0) ) >
-struct integral_nonzero_constant
-{
-  // Declaration of 'static const' causes an unresolved linker symbol in debug
-  // static const T value = v ;
-  enum { value = T(v) };
-  typedef T value_type ;
-  typedef integral_nonzero_constant<T,v> type ;
-  // Argument intentionally unused: the value is fixed at compile time.
-  KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & ) {}
-};
-
-// Zero case: 'value' is a const data member set by the constructor, which
-// also makes objects of this type non-assignable.
-template< typename T , T zero >
-struct integral_nonzero_constant<T,zero,false>
-{
-  const T value ;
-  typedef T value_type ;
-  typedef integral_nonzero_constant<T,0> type ;
-  KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & v ) : value(v) {}
-};
-
-//----------------------------------------------------------------------------
-
-// is_integral_constant<C>: false for every type except an exact
-// integral_constant<T,v>; the false case reports 'void' and 0.
-// Only the exact template matches: types merely derived from
-// integral_constant (such as bool_<B>) select this primary template.
-template < class C > struct is_integral_constant : public false_
-{
-  typedef void integral_type ;
-  enum { integral_value = 0 };
-};
-
-// Exact integral_constant<T,v>: exposes its value type and value.
-template < typename T , T v >
-struct is_integral_constant< integral_constant<T,v> > : public true_
-{
-  typedef T integral_type ;
-  enum { integral_value = v };
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOSTRAITS_HPP */
-
--- a/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp
--- a/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp
--- a/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp
@ -1,541 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_VIEWSUPPORT_HPP
-#define KOKKOS_VIEWSUPPORT_HPP
-
-#include <Kokkos_ExecPolicy.hpp>
-#include <impl/Kokkos_Shape.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Evaluate if LHS = RHS view assignment is allowed.
- *
- *  'value' is nonzero only when both the value/memory-space/management
- *  conditions (assignable_value) and the shape/layout conditions
- *  (assignable_shape) hold.
- */
-template< class ViewLHS , class ViewRHS >
-struct ViewAssignable
-{
-  // Same memory space.
-  // Same value type.
-  // Compatible 'const' qualifier
-  // Cannot assign managed = unmanaged
-  enum { assignable_value =
-    ( is_same< typename ViewLHS::value_type ,
-               typename ViewRHS::value_type >::value
-      ||
-      is_same< typename ViewLHS::value_type ,
-               typename ViewRHS::const_value_type >::value )
-    &&
-    is_same< typename ViewLHS::memory_space ,
-             typename ViewRHS::memory_space >::value
-    &&
-    ( ! ( ViewLHS::is_managed && ! ViewRHS::is_managed ) )
-  };
-
-  // Any one of the four alternatives below is sufficient.
-  enum { assignable_shape =
-    // Compatible shape and matching layout:
-    ( ShapeCompatible< typename ViewLHS::shape_type ,
-                       typename ViewRHS::shape_type >::value
-      &&
-      is_same< typename ViewLHS::array_layout ,
-               typename ViewRHS::array_layout >::value )
-    ||
-    // Matching layout, same rank, and LHS dynamic rank
-    ( is_same< typename ViewLHS::array_layout ,
-               typename ViewRHS::array_layout >::value
-      &&
-      int(ViewLHS::rank) == int(ViewRHS::rank)
-      &&
-      int(ViewLHS::rank) == int(ViewLHS::rank_dynamic) )
-    ||
-    // Both rank-0, any shape and layout
-    ( int(ViewLHS::rank) == 0 && int(ViewRHS::rank) == 0 )
-    ||
-    // Both rank-1 and LHS is dynamic rank-1, any shape and layout
-    ( int(ViewLHS::rank) == 1 && int(ViewRHS::rank) == 1 &&
-      int(ViewLHS::rank_dynamic) == 1 )
-    };
-
-  enum { value = assignable_value && assignable_shape };
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Primary template: constructing it does nothing.  It is what gets used
-// when Initialize is false, since only the Initialize == true case is
-// specialized.
-template< class ExecSpace , class Type , bool Initialize >
-struct ViewDefaultConstruct
-{ ViewDefaultConstruct( Type * , size_t ) {} };
-
-
-/** \brief  ViewDataHandle provides the type of the 'data handle' which the view
- *          uses to access data with the [] operator.
- *          It defines the enum ReturnTypeIsReference, which
- *          specifies whether references/pointers to elements can be taken, and a
- *          'return_type' which is what the view operators will give back.
- *          This default uses a plain pointer to value_type as the handle and
- *          a reference to value_type as the return type.
- *          Specialisation of this object allows three things depending
- *          on ViewTraits and compiler options:
- *          (i)   Use special allocator (e.g. huge pages/small pages and pinned memory)
- *          (ii)  Use special data handle type (e.g. add Cuda Texture Object)
- *          (iii) Use special access intrinsics (e.g. texture fetch and non-caching loads)
- *          Allocation itself is provided by ViewDataManagement::allocate.
- */
-template< class StaticViewTraits , class Enable = void >
-struct ViewDataHandle {
-
-  enum { ReturnTypeIsReference = true };
-
-  typedef typename StaticViewTraits::value_type * handle_type;
-  typedef typename StaticViewTraits::value_type & return_type;
-};
-
-/** \brief  Per-view bookkeeping of whether the data is managed and whether it
- *          is contiguous, plus forwarding of increment / decrement calls to
- *          the memory space and allocation of new data.
- *
- *  A compile-time tag (StaticManagementTag) selects between overloads that
- *  may call into the memory space and overloads that are no-ops.
- *  NOTE(review): memory_space::increment / decrement presumably maintain a
- *  reference count -- their definitions are not visible here.
- */
-template< class StaticViewTraits , class Enable = void >
-class ViewDataManagement : public ViewDataHandle< StaticViewTraits > {
-private:
-
-  template< class , class > friend class ViewDataManagement ;
-
-  // Tag types used to pick the overloads below at compile time.
-  struct PotentiallyManaged  {};
-  struct StaticallyUnmanaged {};
-
-  /* Statically unmanaged if traits or not executing in host-accessible memory space */
-  typedef typename
-    Impl::if_c< StaticViewTraits::is_managed &&
-                Impl::is_same< Kokkos::HostSpace
-                             , Kokkos::Impl::ActiveExecutionMemorySpace >::value
-              , PotentiallyManaged
-              , StaticallyUnmanaged
-              >::type StaticManagementTag ;
-
-  // Bit flags stored in m_traits.
-  enum { Unmanaged     = 0x01
-       , Noncontiguous = 0x02
-       };
-
-  // Statically unmanaged views start with the Unmanaged bit already set.
-  enum { DefaultTraits = Impl::is_same< StaticManagementTag , StaticallyUnmanaged >::value ? Unmanaged : 0 };
-
-  unsigned m_traits ; ///< Runtime traits
-
-
-  // Traits to adopt when copying from 'rhs' into a potentially managed view:
-  // rhs's traits, with Unmanaged added when rhs is managed and the copy is
-  // made while HostSpace::in_parallel() reports true.
-  template< class T >
-  inline static
-  unsigned assign( const ViewDataManagement<T> & rhs , const PotentiallyManaged & )
-    { return rhs.m_traits | ( rhs.is_managed() && Kokkos::HostSpace::in_parallel() ? unsigned(Unmanaged) : 0u ); }
-
-  // Traits to adopt when copying into a statically unmanaged view:
-  // rhs's traits with Unmanaged always set.
-  template< class T >
-  KOKKOS_INLINE_FUNCTION static
-  unsigned assign( const ViewDataManagement<T> & rhs , const StaticallyUnmanaged & )
-    { return rhs.m_traits | Unmanaged ; }
-
-  // Potentially managed: forward to the memory space only if managed at run time.
-  inline
-  void increment( const void * ptr , const PotentiallyManaged & ) const
-    { if ( is_managed() ) StaticViewTraits::memory_space::increment( ptr ); }
-  
-  inline
-  void decrement( const void * ptr , const PotentiallyManaged & ) const
-    { if ( is_managed() ) StaticViewTraits::memory_space::decrement( ptr ); }
-  
-  // Statically unmanaged: nothing to do.
-  KOKKOS_INLINE_FUNCTION
-  void increment( const void * , const StaticallyUnmanaged & ) const {}
-  
-  KOKKOS_INLINE_FUNCTION
-  void decrement( const void * , const StaticallyUnmanaged & ) const {}
-
-public:
-
-  typedef typename ViewDataHandle< StaticViewTraits >::handle_type handle_type;
-
-  KOKKOS_INLINE_FUNCTION
-  ViewDataManagement() : m_traits( DefaultTraits ) {}
-
-  // Copy construction / assignment go through assign() so that the
-  // Unmanaged bit is applied according to StaticManagementTag.
-  KOKKOS_INLINE_FUNCTION
-  ViewDataManagement( const ViewDataManagement & rhs )
-    : m_traits( assign( rhs , StaticManagementTag() ) ) {}
-
-  KOKKOS_INLINE_FUNCTION
-  ViewDataManagement & operator = ( const ViewDataManagement & rhs )
-    { m_traits = assign( rhs , StaticManagementTag() ); return *this ; }
-
-  // Same, from a ViewDataManagement of different view traits.
-  template< class SVT >
-  KOKKOS_INLINE_FUNCTION
-  ViewDataManagement( const ViewDataManagement<SVT> & rhs )
-    : m_traits( assign( rhs , StaticManagementTag() ) ) {}
-
-  template< class SVT >
-  KOKKOS_INLINE_FUNCTION
-  ViewDataManagement & operator = ( const ViewDataManagement<SVT> & rhs )
-    { m_traits = assign( rhs , StaticManagementTag() ); return *this ; }
-
-  // Run-time queries of the trait bits.
-  KOKKOS_INLINE_FUNCTION
-  bool is_managed() const { return ! ( m_traits & Unmanaged ); }
-
-  KOKKOS_INLINE_FUNCTION
-  bool is_contiguous() const { return ! ( m_traits & Noncontiguous ); }
-
-  // The bits can only be set, never cleared, through this interface.
-  KOKKOS_INLINE_FUNCTION
-  void set_unmanaged() { m_traits |= Unmanaged ; }
-
-  KOKKOS_INLINE_FUNCTION
-  void set_noncontiguous() { m_traits |= Noncontiguous ; }
-
-
-  // Handle overloads: convert the handle to a raw value_type pointer and
-  // dispatch on StaticManagementTag.
-  KOKKOS_INLINE_FUNCTION
-  void increment( handle_type handle ) const
-    { increment( ( typename StaticViewTraits::value_type *) handle , StaticManagementTag() ); }
-
-  KOKKOS_INLINE_FUNCTION
-  void decrement( handle_type handle ) const
-    { decrement( ( typename StaticViewTraits::value_type *) handle , StaticManagementTag() ); }
-
-
-  // Raw-pointer overloads: dispatch on StaticManagementTag.
-  KOKKOS_INLINE_FUNCTION
-  void increment( const void * ptr ) const
-    { increment( ptr , StaticManagementTag() ); }
-
-  KOKKOS_INLINE_FUNCTION
-  void decrement( const void * ptr ) const
-    { decrement( ptr , StaticManagementTag() ); }
-
-
-  /** \brief  Allocate storage for offset_map.capacity() elements from the
-   *          view's memory space under the given label and, when Initialize
-   *          is true, default-construct the elements.
-   *
-   *  \return The new allocation wrapped as a handle_type.
-   */
-  template< bool Initialize >
-  static
-  handle_type allocate( const std::string & label
-                      , const Impl::ViewOffset< typename StaticViewTraits::shape_type
-                                              , typename StaticViewTraits::array_layout > & offset_map )
-    {
-      typedef typename StaticViewTraits::execution_space  execution_space ;
-      typedef typename StaticViewTraits::memory_space     memory_space ;
-      typedef typename StaticViewTraits::value_type       value_type ;
-
-      const size_t count = offset_map.capacity();
-
-      value_type * ptr = (value_type*) memory_space::allocate( label , sizeof(value_type) * count );
-
-        // Default construct within the view's execution space.
-      (void) ViewDefaultConstruct< execution_space , value_type , Initialize >( ptr , count );
-
-      return typename ViewDataHandle< StaticViewTraits >::handle_type(ptr);
-    }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Functor that copies the overlapping region of 'input' into
- *          'output', element by element.
- *
- *  Constructing the object performs the copy: it records, per dimension, the
- *  smaller of the two views' extents and launches a parallel_for over the
- *  leading dimension in the output view's execution space.
- *  NOTE(review): unlike ViewFill, the constructor does not fence after the
- *  parallel_for -- confirm callers do not need the copy to be complete on return.
- */
-template< class OutputView , class InputView  , unsigned Rank = OutputView::Rank >
-struct ViewRemap
-{
-  typedef typename OutputView::size_type   size_type ;
-
-  const OutputView output ;
-  const InputView  input ;
-  // Per-dimension extent of the region common to both views.
-  const size_type n0 ;
-  const size_type n1 ;
-  const size_type n2 ;
-  const size_type n3 ;
-  const size_type n4 ;
-  const size_type n5 ;
-  const size_type n6 ;
-  const size_type n7 ;
-
-  ViewRemap( const OutputView & arg_out , const InputView & arg_in )
-    : output( arg_out ), input( arg_in )
-    , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) )
-    , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) )
-    , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) )
-    , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) )
-    , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) )
-    , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) )
-    , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) )
-    , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) )
-    {
-      typedef typename OutputView::execution_space execution_space ;
-      Kokkos::RangePolicy< execution_space > range( 0 , n0 );
-      parallel_for( range , *this );
-    }
-
-  // One call per index of the leading dimension; the remaining seven
-  // dimensions are traversed serially.
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type i0 ) const
-  {
-    for ( size_type i1 = 0 ; i1 < n1 ; ++i1 ) {
-    for ( size_type i2 = 0 ; i2 < n2 ; ++i2 ) {
-    for ( size_type i3 = 0 ; i3 < n3 ; ++i3 ) {
-    for ( size_type i4 = 0 ; i4 < n4 ; ++i4 ) {
-    for ( size_type i5 = 0 ; i5 < n5 ; ++i5 ) {
-    for ( size_type i6 = 0 ; i6 < n6 ; ++i6 ) {
-    for ( size_type i7 = 0 ; i7 < n7 ; ++i7 ) {
-      output.at(i0,i1,i2,i3,i4,i5,i6,i7) = input.at(i0,i1,i2,i3,i4,i5,i6,i7);
-    }}}}}}}
-  }
-};
-
-// Rank-0 case: there is a single value, so the remap is one DeepCopy of
-// sizeof(value_type) bytes from the input view's memory space to the
-// output view's memory space.
-template< class OutputView , class InputView  >
-struct ViewRemap< OutputView ,  InputView , 0 >
-{
-  typedef typename OutputView::value_type   value_type ;
-  typedef typename OutputView::memory_space dst_space ;
-  typedef typename InputView ::memory_space src_space ;
-
-  ViewRemap( const OutputView & arg_out , const InputView & arg_in )
-  {
-    DeepCopy< dst_space , src_space >( arg_out.ptr_on_device() ,
-                                       arg_in.ptr_on_device() ,
-                                       sizeof(value_type) );
-  }
-};
-
-//----------------------------------------------------------------------------
-
-/** \brief  Initialize == true case: default-constructs 'capacity' objects of
- *          'Type' in place, starting at 'pointer'.
- *
- *  Constructing the object does the work: it launches a parallel_for over
- *  [0,capacity) in ExecSpace with itself as the functor and then fences.
- */
-template< class ExecSpace , class Type >
-struct ViewDefaultConstruct< ExecSpace , Type , true >
-{
-  Type * const m_ptr ;
-
-  // Placement-new a value-initialized Type into slot i.
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const typename ExecSpace::size_type i ) const
-    { new( m_ptr + i ) Type(); }
-
-  ViewDefaultConstruct( Type * pointer , size_t capacity )
-    : m_ptr( pointer )
-    {
-      Kokkos::RangePolicy< ExecSpace > range( 0 , capacity );
-      parallel_for( range , *this );
-      ExecSpace::fence();
-    }
-};
-
-/** \brief  Functor that assigns one value to every element of 'output'.
- *
- *  Constructing the object performs the fill: it keeps a copy of the value,
- *  launches a parallel_for over the leading dimension in the output view's
- *  execution space, and fences before returning.
- */
-template< class OutputView , unsigned Rank = OutputView::Rank >
-struct ViewFill
-{
-  typedef typename OutputView::const_value_type  const_value_type ;
-  typedef typename OutputView::size_type         size_type ;
-
-  const OutputView output ;
-  const_value_type input ;  // held by value, so the functor owns its own copy
-
-  ViewFill( const OutputView & arg_out , const_value_type & arg_in )
-    : output( arg_out ), input( arg_in )
-    {
-      typedef typename OutputView::execution_space execution_space ;
-      Kokkos::RangePolicy< execution_space > range( 0 , output.dimension_0() );
-      parallel_for( range , *this );
-      execution_space::fence();
-    }
-
-  // One call per index of the leading dimension; the remaining seven
-  // dimensions are traversed serially.
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type i0 ) const
-  {
-    for ( size_type i1 = 0 ; i1 < output.dimension_1() ; ++i1 ) {
-    for ( size_type i2 = 0 ; i2 < output.dimension_2() ; ++i2 ) {
-    for ( size_type i3 = 0 ; i3 < output.dimension_3() ; ++i3 ) {
-    for ( size_type i4 = 0 ; i4 < output.dimension_4() ; ++i4 ) {
-    for ( size_type i5 = 0 ; i5 < output.dimension_5() ; ++i5 ) {
-    for ( size_type i6 = 0 ; i6 < output.dimension_6() ; ++i6 ) {
-    for ( size_type i7 = 0 ; i7 < output.dimension_7() ; ++i7 ) {
-      output.at(i0,i1,i2,i3,i4,i5,i6,i7) = input ;
-    }}}}}}}
-  }
-};
-
-// Rank-0 case: there is a single value, so the fill is one DeepCopy of
-// sizeof(const_value_type) bytes from the address of 'arg_in' to the view's data.
-// NOTE(review): the source address is that of the caller's argument, yet the
-// copy is declared as dst_space -> dst_space.  When dst_space is not
-// host-accessible this depends on DeepCopy accepting a host source pointer
-// for that pairing -- confirm, or use a host-space source in DeepCopy.
-template< class OutputView >
-struct ViewFill< OutputView , 0 >
-{
-  typedef typename OutputView::const_value_type  const_value_type ;
-  typedef typename OutputView::memory_space      dst_space ;
-
-  ViewFill( const OutputView & arg_out , const_value_type & arg_in )
-  {
-    DeepCopy< dst_space , dst_space >( arg_out.ptr_on_device() , & arg_in ,
-                                       sizeof(const_value_type) );
-  }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-/** \brief  Allocation property carrying a label; ViewAllocProp maps it to
- *          Initialize = false.
- *
- *  A null C-string label is treated as an empty label instead of being
- *  passed to std::string, which would be undefined behaviour.
- */
-struct ViewAllocateWithoutInitializing {
-
-  const std::string label ;
-
-  ViewAllocateWithoutInitializing() : label() {}
-  ViewAllocateWithoutInitializing( const std::string & arg_label ) : label( arg_label ) {}
-  ViewAllocateWithoutInitializing( const char * const  arg_label ) : label( arg_label ? arg_label : "" ) {}
-};
-
-/** \brief  Allocation property carrying a label; ViewAllocProp maps it to
- *          Initialize = true.
- *
- *  A null C-string label is treated as an empty label instead of being
- *  passed to std::string, which would be undefined behaviour.
- */
-struct ViewAllocate {
-
-  const std::string  label ;
-
-  ViewAllocate() : label() {}
-  ViewAllocate( const std::string & arg_label ) : label( arg_label ) {}
-  ViewAllocate( const char * const  arg_label ) : label( arg_label ? arg_label : "" ) {}
-};
-
-}
-
-namespace Kokkos {
-namespace Impl {
-
-// ViewAllocProp<Traits,Prop>: false_type unless Prop is one of the
-// allocation-property types specialized below AND the view is managed with a
-// non-const value type.  Each specialization states whether to initialize
-// the data and how to obtain the label from the property.
-template< class Traits , class AllocationProperties , class Enable = void >
-struct ViewAllocProp : public Kokkos::Impl::false_type {};
-
-// Property is a Kokkos::ViewAllocate: initialize, label taken from the property.
-template< class Traits >
-struct ViewAllocProp< Traits , Kokkos::ViewAllocate
-  , typename Kokkos::Impl::enable_if<(
-      Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value
-    )>::type >
-  : public Kokkos::Impl::true_type
-{
-  typedef size_t               size_type ;
-  typedef const ViewAllocate & property_type ;
-
-  enum { Initialize = true };
-  enum { AllowPadding = false };
-
-  // Returns a reference into 'p'; valid only while the property object lives.
-  inline
-  static const std::string & label( property_type p ) { return p.label ; }
-};
-
-// Property is a std::string: the string itself is the label; initialize.
-// Enabled only for managed views with a non-const value type.
-template< class Traits >
-struct ViewAllocProp< Traits , std::string
-  , typename Kokkos::Impl::enable_if<(
-      Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value
-    )>::type >
-  : public Kokkos::Impl::true_type
-{
-  typedef size_t              size_type ;
-  typedef const std::string & property_type ;
-
-  enum { Initialize = true };
-  enum { AllowPadding = false };
-
-  // Returns the argument itself; valid only while the caller's string lives.
-  inline
-  static const std::string & label( property_type s ) { return s ; }
-};
-
-// Property is a char array (e.g. a string literal): the array is the
-// label; initialize.
-// Enabled only for managed views with a non-const value type.
-template< class Traits , unsigned N >
-struct ViewAllocProp< Traits , char[N]
-  , typename Kokkos::Impl::enable_if<(
-      Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value
-    )>::type >
-  : public Kokkos::Impl::true_type
-{
-private:
-  typedef char label_type[N] ;
-public:
-
-  typedef size_t             size_type ;
-  typedef const label_type & property_type ;
-
-  enum { Initialize = true };
-  enum { AllowPadding = false };
-
-  // Returns by value: a std::string is built from the character array.
-  inline
-  static std::string label( property_type s ) { return std::string(s) ; }
-};
-
-// Property is a Kokkos::ViewAllocateWithoutInitializing: do NOT initialize;
-// label taken from the property.
-// Enabled only for managed views with a non-const value type.
-template< class Traits >
-struct ViewAllocProp< Traits , Kokkos::ViewAllocateWithoutInitializing
-  , typename Kokkos::Impl::enable_if<(
-      Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value
-    )>::type >
-  : public Kokkos::Impl::true_type
-{
-  typedef size_t size_type ;
-  typedef const Kokkos::ViewAllocateWithoutInitializing & property_type ;
-
-  enum { Initialize = false };
-  enum { AllowPadding = false };
-
-  // Returns a copy of the label (the ViewAllocate variant returns a reference).
-  inline
-  static std::string label( property_type s ) { return s.label ; }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// ViewRawPointerProp<Traits,T>: true_type when T is the view's value_type or
-// its non_const_value_type, false_type for any other T.
-template< class Traits , class PointerProperties , class Enable = void >
-struct ViewRawPointerProp : public Kokkos::Impl::false_type {};
-
-// Accepted pointee types; size_type is the only member exposed.
-template< class Traits , typename T >
-struct ViewRawPointerProp< Traits , T ,
-  typename Kokkos::Impl::enable_if<(
-    Impl::is_same< T , typename Traits::value_type >::value ||
-    Impl::is_same< T , typename Traits::non_const_value_type >::value
-  )>::type >
-  : public Kokkos::Impl::true_type
-{
-  typedef size_t size_type ; 
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_VIEWSUPPORT_HPP */
-
-
--- a/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp
@ -1,195 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_VIEWTILELEFT_HPP
-#define KOKKOS_VIEWTILELEFT_HPP
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template< class T , unsigned N0 , unsigned N1 , class MemorySpace , class MemoryTraits >
-struct ViewSpecialize< T , void , LayoutTileLeft<N0,N1> , MemorySpace , MemoryTraits >
-{
-  typedef ViewDefault type ;
-};
-
-struct ViewTile {};
-
-template< class ShapeType , unsigned N0 , unsigned N1 >
-struct ViewOffset< ShapeType
-                 , LayoutTileLeft<N0,N1,true> /* Only accept properly shaped tiles */
-                 , typename Impl::enable_if<( 2 == ShapeType::rank
-                                              &&
-                                              2 == ShapeType::rank_dynamic
-                                            )>::type >
-  : public ShapeType
-{
-  enum { SHIFT_0 = Impl::power_of_two<N0>::value };
-  enum { SHIFT_1 = Impl::power_of_two<N1>::value };
-  enum { MASK_0  = N0 - 1 };
-  enum { MASK_1  = N1 - 1 };
-
-  typedef size_t                      size_type ;
-  typedef ShapeType                   shape_type ;
-  typedef LayoutTileLeft<N0,N1,true>  array_layout ;
-
-  enum { has_padding = true };
-
-  size_type tile_N0 ;
-
-  KOKKOS_INLINE_FUNCTION
-  void assign( const ViewOffset & rhs )
-    {
-      shape_type::N0 = rhs.N0 ;
-      shape_type::N1 = rhs.N1 ;
-      tile_N0 = ( rhs.N0 + MASK_0 ) >> SHIFT_0 ; // number of tiles in first dimension
-    }
-
-  KOKKOS_INLINE_FUNCTION
-  void assign( size_t n0 , size_t n1
-             , int = 0 , int = 0
-             , int = 0 , int = 0
-             , int = 0 , int = 0
-             , int = 0
-             )
-    {
-      shape_type::N0 = n0 ;
-      shape_type::N1 = n1 ;
-      tile_N0 = ( n0 + MASK_0 ) >> SHIFT_0 ; // number of tiles in first dimension
-    }
-
-
-  KOKKOS_INLINE_FUNCTION
-  void set_padding() {}
-
-
-  template< typename I0 , typename I1 >
-  KOKKOS_INLINE_FUNCTION
-  size_type operator()( I0 const & i0 , I1 const & i1
-                      , int = 0 , int = 0
-                      , int = 0 , int = 0
-                      , int = 0 , int = 0
-                      ) const
-    {
-      return /* ( ( Tile offset                             ) *  ( Tile size       ) ) */
-                ( ( (i0>>SHIFT_0) + tile_N0 * (i1>>SHIFT_1) ) << (SHIFT_0 + SHIFT_1) ) +
-             /* ( Offset within tile                       ) */
-                ( (i0 & MASK_0) + ((i1 & MASK_1)<<SHIFT_0) ) ;
-    }
-
-  template< typename I0 , typename I1 >
-  KOKKOS_INLINE_FUNCTION
-  size_type tile_begin( I0 const & i_tile0 , I1 const & i_tile1 ) const
-    {
-      return ( i_tile0 + tile_N0 * i_tile1 ) << ( SHIFT_0 + SHIFT_1 );
-    }
-
-
-  KOKKOS_INLINE_FUNCTION
-  size_type capacity() const
-    {
-      // ( TileDim0 * ( TileDim1 ) ) * TileSize
-      return ( tile_N0 * ( ( shape_type::N1 + MASK_1 ) >> SHIFT_1 ) ) << ( SHIFT_0 + SHIFT_1 );
-    }
-};
-
-template<>
-struct ViewAssignment< ViewTile , void , void >
-{
-  // Some compilers have type-matching issues on the integer values when using:
-  //   template< class T , unsigned N0 , unsigned N1 , class A2 , class A3 >
-  template< class T , unsigned dN0 , unsigned dN1
-          , class A2 , class A3
-          , unsigned sN0 , unsigned sN1 >
-  KOKKOS_INLINE_FUNCTION
-  ViewAssignment( View< T[dN0][dN1], LayoutLeft, A2, A3, Impl::ViewDefault > & dst
-                , View< T** , LayoutTileLeft<sN0,sN1,true>, A2, A3, Impl::ViewDefault > const & src
-                , size_t const i_tile0
-                , typename Impl::enable_if< unsigned(dN0) == unsigned(sN0) &&
-                                            unsigned(dN1) == unsigned(sN1)
-                                          , size_t const
-                                          >::type i_tile1
-                )
-   {
-     // Destination is always contiguous but source may be non-contiguous
-     // so don't assign the whole view management object.
-     // Just query and appropriately set the reference-count state.
-
-     if ( ! src.m_management.is_managed() ) dst.m_management.set_unmanaged();
-
-     dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map.tile_begin(i_tile0,i_tile1);
-
-     dst.m_management.increment( dst.m_ptr_on_device );
-   }
-};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-namespace Kokkos {
-
-template< class T , unsigned N0, unsigned N1, class A2, class A3 >
-KOKKOS_INLINE_FUNCTION
-View< T[N0][N1], LayoutLeft, A2, A3, Impl::ViewDefault >
-tile_subview( const View<T**,LayoutTileLeft<N0,N1,true>,A2,A3,Impl::ViewDefault> & src
-            , const size_t i_tile0
-            , const size_t i_tile1
-            )
-{
-  View< T[N0][N1], LayoutLeft, A2, A3, Impl::ViewDefault > dst ;
-
-  (void) Impl::ViewAssignment< Impl::ViewTile , void , void >( dst , src , i_tile0 , i_tile1 );
-
-  return dst ;
-}
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_VIEWTILELEFT_HPP */
-
--- a/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp
@ -1,242 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_VOLATILE_LOAD )
-#define KOKKOS_VOLATILE_LOAD
-
-#if defined( __GNUC__ ) /* GNU C   */ || \
-    defined( __GNUG__ ) /* GNU C++ */ || \
-    defined( __clang__ )
-
-#define KOKKOS_MAY_ALIAS __attribute__((__may_alias__))
-
-#else
-
-#define KOKKOS_MAY_ALIAS
-
-#endif
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-template <typename T>
-KOKKOS_FORCEINLINE_FUNCTION
-T volatile_load(T const volatile * const src_ptr)
-{
-  typedef uint64_t KOKKOS_MAY_ALIAS T64;
-  typedef uint32_t KOKKOS_MAY_ALIAS T32;
-  typedef uint16_t KOKKOS_MAY_ALIAS T16;
-  typedef uint8_t  KOKKOS_MAY_ALIAS T8;
-
-  enum {
-    NUM_8  = sizeof(T),
-    NUM_16 = NUM_8 / 2,
-    NUM_32 = NUM_8 / 4,
-    NUM_64 = NUM_8 / 8
-  };
-
-  union {
-    T   const volatile * const ptr;
-    T64 const volatile * const ptr64;
-    T32 const volatile * const ptr32;
-    T16 const volatile * const ptr16;
-    T8  const volatile * const ptr8;
-  } src = {src_ptr};
-
-  T result;
-
-  union {
-    T   * const ptr;
-    T64 * const ptr64;
-    T32 * const ptr32;
-    T16 * const ptr16;
-    T8  * const ptr8;
-  } dst = {&result};
-
-  for (int i=0; i < NUM_64; ++i) {
-    dst.ptr64[i] = src.ptr64[i];
-  }
-
-  if ( NUM_64*2 < NUM_32 ) {
-    dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2];
-  }
-
-  if ( NUM_32*2 < NUM_16 ) {
-    dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2];
-  }
-
-  if ( NUM_16*2 < NUM_8 ) {
-    dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2];
-  }
-
-  return result;
-}
-
-template <typename T>
-KOKKOS_FORCEINLINE_FUNCTION
-void volatile_store(T volatile * const dst_ptr, T const volatile * const src_ptr)
-{
-  typedef uint64_t KOKKOS_MAY_ALIAS T64;
-  typedef uint32_t KOKKOS_MAY_ALIAS T32;
-  typedef uint16_t KOKKOS_MAY_ALIAS T16;
-  typedef uint8_t  KOKKOS_MAY_ALIAS T8;
-
-  enum {
-    NUM_8  = sizeof(T),
-    NUM_16 = NUM_8 / 2,
-    NUM_32 = NUM_8 / 4,
-    NUM_64 = NUM_8 / 8
-  };
-
-  union {
-    T   const volatile * const ptr;
-    T64 const volatile * const ptr64;
-    T32 const volatile * const ptr32;
-    T16 const volatile * const ptr16;
-    T8  const volatile * const ptr8;
-  } src = {src_ptr};
-
-  union {
-    T   volatile * const ptr;
-    T64 volatile * const ptr64;
-    T32 volatile * const ptr32;
-    T16 volatile * const ptr16;
-    T8  volatile * const ptr8;
-  } dst = {dst_ptr};
-
-  for (int i=0; i < NUM_64; ++i) {
-    dst.ptr64[i] = src.ptr64[i];
-  }
-
-  if ( NUM_64*2 < NUM_32 ) {
-    dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2];
-  }
-
-  if ( NUM_32*2 < NUM_16 ) {
-    dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2];
-  }
-
-  if ( NUM_16*2 < NUM_8 ) {
-    dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2];
-  }
-}
-
-template <typename T>
-KOKKOS_FORCEINLINE_FUNCTION
-void volatile_store(T volatile * const dst_ptr, T const * const src_ptr)
-{
-  typedef uint64_t KOKKOS_MAY_ALIAS T64;
-  typedef uint32_t KOKKOS_MAY_ALIAS T32;
-  typedef uint16_t KOKKOS_MAY_ALIAS T16;
-  typedef uint8_t  KOKKOS_MAY_ALIAS T8;
-
-  enum {
-    NUM_8  = sizeof(T),
-    NUM_16 = NUM_8 / 2,
-    NUM_32 = NUM_8 / 4,
-    NUM_64 = NUM_8 / 8
-  };
-
-  union {
-    T   const * const ptr;
-    T64 const * const ptr64;
-    T32 const * const ptr32;
-    T16 const * const ptr16;
-    T8  const * const ptr8;
-  } src = {src_ptr};
-
-  union {
-    T   volatile * const ptr;
-    T64 volatile * const ptr64;
-    T32 volatile * const ptr32;
-    T16 volatile * const ptr16;
-    T8  volatile * const ptr8;
-  } dst = {dst_ptr};
-
-  for (int i=0; i < NUM_64; ++i) {
-    dst.ptr64[i] = src.ptr64[i];
-  }
-
-  if ( NUM_64*2 < NUM_32 ) {
-    dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2];
-  }
-
-  if ( NUM_32*2 < NUM_16 ) {
-    dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2];
-  }
-
-  if ( NUM_16*2 < NUM_8 ) {
-    dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2];
-  }
-}
-
-template <typename T>
-KOKKOS_FORCEINLINE_FUNCTION
-void volatile_store(T volatile * dst_ptr, T const volatile & src)
-{ volatile_store(dst_ptr, &src); }
-
-template <typename T>
-KOKKOS_FORCEINLINE_FUNCTION
-void volatile_store(T volatile * dst_ptr, T const & src)
-{ volatile_store(dst_ptr, &src); }
-
-template <typename T>
-KOKKOS_FORCEINLINE_FUNCTION
-T safe_load(T const * const ptr)
-{
-#if !defined( __MIC__ )
-  return *ptr;
-#else
-  return volatile_load(ptr);
-#endif
-}
-
-} // namespace kokkos
-
-#undef KOKKOS_MAY_ALIAS
-
-#endif
-
-
-
--- a/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp
@ -1,704 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#define DEBUG_PRINT 0
-
-#include <iostream>
-#include <sstream>
-
-#include <Kokkos_Macros.hpp>
-#include <Kokkos_hwloc.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace hwloc {
-
-/* Return 0 if asynchronous, 1 if synchronous and include process. */
-unsigned thread_mapping( const char * const label ,
-                         const bool allow_async ,
-                         unsigned & thread_count ,
-                         unsigned & use_numa_count ,
-                         unsigned & use_cores_per_numa ,
-                         std::pair<unsigned,unsigned> threads_coord[] )
-{
-  const bool     hwloc_avail            = Kokkos::hwloc::available();
-  const unsigned avail_numa_count       = hwloc_avail ? hwloc::get_available_numa_count() : 1 ;
-  const unsigned avail_cores_per_numa   = hwloc_avail ? hwloc::get_available_cores_per_numa() : thread_count ;
-  const unsigned avail_threads_per_core = hwloc_avail ? hwloc::get_available_threads_per_core() : 1 ;
-
-  // (numa,core) coordinate of the process:
-  const std::pair<unsigned,unsigned> proc_coord = Kokkos::hwloc::get_this_thread_coordinate();
-
-  //------------------------------------------------------------------------
-  // Defaults for unspecified inputs:
-
-  if ( ! use_numa_count ) {
-    // Default to use all NUMA regions
-    use_numa_count = ! thread_count ? avail_numa_count : (
-                       thread_count < avail_numa_count ? thread_count : avail_numa_count );
-  }
-
-  if ( ! use_cores_per_numa ) {
-    // Default to use all but one core if asynchronous, all cores if synchronous.
-    const unsigned threads_per_numa = thread_count / use_numa_count ;
-
-    use_cores_per_numa = ! threads_per_numa ? avail_cores_per_numa - ( allow_async ? 1 : 0 ) : (
-                           threads_per_numa < avail_cores_per_numa ? threads_per_numa : avail_cores_per_numa );
-  }
-
-  if ( ! thread_count ) {
-    thread_count = use_numa_count * use_cores_per_numa * avail_threads_per_core ;
-  }
-
-  //------------------------------------------------------------------------
-  // Input verification:
-
-  const bool valid_numa      = use_numa_count <= avail_numa_count ;
-  const bool valid_cores     = use_cores_per_numa &&
-                               use_cores_per_numa <= avail_cores_per_numa ;
-  const bool valid_threads   = thread_count &&
-                               thread_count <= use_numa_count * use_cores_per_numa * avail_threads_per_core ;
-  const bool balanced_numa   = ! ( thread_count % use_numa_count );
-  const bool balanced_cores  = ! ( thread_count % ( use_numa_count * use_cores_per_numa ) );
-
-  const bool valid_input = valid_numa && valid_cores && valid_threads && balanced_numa && balanced_cores ;
-
-  if ( ! valid_input ) {
-
-    std::ostringstream msg ;
-
-    msg << label << " HWLOC ERROR(s)" ;
-
-    if ( ! valid_threads ) {
-      msg << " : thread_count(" << thread_count
-          << ") exceeds capacity("
-          << use_numa_count * use_cores_per_numa * avail_threads_per_core
-          << ")" ;
-    }
-    if ( ! valid_numa ) {
-      msg << " : use_numa_count(" << use_numa_count
-          << ") exceeds capacity(" << avail_numa_count << ")" ;
-    }
-    if ( ! valid_cores ) {
-      msg << " : use_cores_per_numa(" << use_cores_per_numa
-          << ") exceeds capacity(" << avail_cores_per_numa << ")" ;
-    }
-    if ( ! balanced_numa ) {
-      msg << " : thread_count(" << thread_count
-          << ") imbalanced among numa(" << use_numa_count << ")" ;
-    }
-    if ( ! balanced_cores ) {
-      msg << " : thread_count(" << thread_count
-          << ") imbalanced among cores(" << use_numa_count * use_cores_per_numa << ")" ;
-    }
-
-    Kokkos::Impl::throw_runtime_exception( msg.str() );
-  }
-
-  const unsigned thread_spawn_synchronous =
-    ( allow_async &&
-      1 < thread_count &&
-      ( use_numa_count     < avail_numa_count ||
-        use_cores_per_numa < avail_cores_per_numa ) )
-     ? 0 /* asyncronous */
-     : 1 /* synchronous, threads_coord[0] is process core */ ;
-
-  // Determine binding coordinates for to-be-spawned threads so that
-  // threads may be bound to cores as they are spawned.
-
-  const unsigned threads_per_core = thread_count / ( use_numa_count * use_cores_per_numa );
-
-  if ( thread_spawn_synchronous ) {
-    // Working synchronously and include process core as threads_coord[0].
-    // Swap the NUMA coordinate of the process core with 0
-    // Swap the CORE coordinate of the process core with 0
-    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
-      const unsigned numa_coord = 0 == inuma ? proc_coord.first : ( proc_coord.first == inuma ? 0 : inuma );
-      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
-        const unsigned core_coord = 0 == icore ? proc_coord.second : ( proc_coord.second == icore ? 0 : icore );
-        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
-          threads_coord[i].first  = numa_coord ;
-          threads_coord[i].second = core_coord ;
-        }
-      }
-    }
-  }
-  else if ( use_numa_count < avail_numa_count ) {
-    // Working asynchronously and omit the process' NUMA region from the pool.
-    // Swap the NUMA coordinate of the process core with ( ( avail_numa_count - use_numa_count ) - 1 )
-    const unsigned numa_coord_swap = ( avail_numa_count - use_numa_count ) - 1 ;
-    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
-      const unsigned numa_coord = proc_coord.first == inuma ? numa_coord_swap : inuma ;
-      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
-        const unsigned core_coord = icore ;
-        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
-          threads_coord[i].first  = numa_coord ;
-          threads_coord[i].second = core_coord ;
-        }
-      }
-    }
-  }
-  else if ( use_cores_per_numa < avail_cores_per_numa ) {
-    // Working asynchronously and omit the process' core from the pool.
-    // Swap the CORE coordinate of the process core with ( ( avail_cores_per_numa - use_cores_per_numa ) - 1 )
-    const unsigned core_coord_swap = ( avail_cores_per_numa - use_cores_per_numa ) - 1 ;
-    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
-      const unsigned numa_coord = inuma ;
-      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
-        const unsigned core_coord = proc_coord.second == icore ? core_coord_swap : icore ;
-        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
-          threads_coord[i].first  = numa_coord ;
-          threads_coord[i].second = core_coord ;
-        }
-      }
-    }
-  }
-
-  return thread_spawn_synchronous ;
-}
-
-} /* namespace hwloc */
-} /* namespace Kokkos */
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-#if defined( KOKKOS_HAVE_HWLOC )
-
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-
-/*--------------------------------------------------------------------------*/
-/* Third Party Libraries */
-
-/* Hardware locality library: http://www.open-mpi.org/projects/hwloc/ */
-#include <hwloc.h>
-
-#define  REQUIRED_HWLOC_API_VERSION  0x000010300
-
-#if HWLOC_API_VERSION < REQUIRED_HWLOC_API_VERSION
-#error "Requires  http://www.open-mpi.org/projects/hwloc/  Version 1.3 or greater"
-#endif
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace hwloc {
-namespace {
-
-#if DEBUG_PRINT
-
-inline
-void print_bitmap( std::ostream & s , const hwloc_const_bitmap_t bitmap )
-{
-  s << "{" ;
-  for ( int i = hwloc_bitmap_first( bitmap ) ;
-        -1 != i ; i = hwloc_bitmap_next( bitmap , i ) ) {
-    s << " " << i ;
-  }
-  s << " }" ;
-}
-
-#endif
-
-enum { MAX_CORE = 1024 };
-
-std::pair<unsigned,unsigned> s_core_topology(0,0);
-unsigned                     s_core_capacity(0);
-hwloc_topology_t             s_hwloc_topology(0);
-hwloc_bitmap_t               s_hwloc_location(0);
-hwloc_bitmap_t               s_process_binding(0);
-hwloc_bitmap_t               s_core[ MAX_CORE ];
-
-struct Sentinel {
-  ~Sentinel();
-  Sentinel();
-};
-
-bool sentinel()
-{
-  static Sentinel self ;
-
-  if ( 0 == s_hwloc_topology ) {
-    std::cerr << "Kokkos::hwloc ERROR : Called after return from main()" << std::endl ;
-    std::cerr.flush();
-  }
-
-  return 0 != s_hwloc_topology ;
-}
-
-Sentinel::~Sentinel()
-{
-  hwloc_topology_destroy( s_hwloc_topology );
-  hwloc_bitmap_free( s_process_binding );
-  hwloc_bitmap_free( s_hwloc_location );
-
-  s_core_topology.first  = 0 ;
-  s_core_topology.second = 0 ;
-  s_core_capacity   = 0 ;
-  s_hwloc_topology  = 0 ;
-  s_hwloc_location  = 0 ;
-  s_process_binding = 0 ;
-}
-
-Sentinel::Sentinel()
-{
-#if defined(__MIC__)
-  static const bool remove_core_0 = true ;
-#else
-  static const bool remove_core_0 = false ;
-#endif
-
-  s_core_topology   = std::pair<unsigned,unsigned>(0,0);
-  s_core_capacity   = 0 ;
-  s_hwloc_topology  = 0 ;
-  s_hwloc_location  = 0 ;
-  s_process_binding = 0 ;
-
-  for ( unsigned i = 0 ; i < MAX_CORE ; ++i ) s_core[i] = 0 ;
-
-  hwloc_topology_init( & s_hwloc_topology );
-  hwloc_topology_load( s_hwloc_topology );
-
-  s_hwloc_location  = hwloc_bitmap_alloc();
-  s_process_binding = hwloc_bitmap_alloc();
-
-  hwloc_get_cpubind( s_hwloc_topology , s_process_binding ,  HWLOC_CPUBIND_PROCESS );
-
-  if ( remove_core_0 ) {
-
-    const hwloc_obj_t core = hwloc_get_obj_by_type( s_hwloc_topology , HWLOC_OBJ_CORE , 0 );
-
-    if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
-
-      hwloc_bitmap_t s_process_no_core_zero = hwloc_bitmap_alloc();
-
-      hwloc_bitmap_andnot( s_process_no_core_zero , s_process_binding , core->allowed_cpuset );
-
-      bool ok = 0 == hwloc_set_cpubind( s_hwloc_topology ,
-                                        s_process_no_core_zero ,
-                                        HWLOC_CPUBIND_PROCESS | HWLOC_CPUBIND_STRICT );
-
-      if ( ok ) {
-        hwloc_get_cpubind( s_hwloc_topology , s_process_binding ,  HWLOC_CPUBIND_PROCESS );
-
-        ok = 0 != hwloc_bitmap_isequal( s_process_binding , s_process_no_core_zero );
-      }
-
-      hwloc_bitmap_free( s_process_no_core_zero );
-
-      if ( ! ok ) {
-        std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ;
-      }
-    }
-  }
-
-  // Choose a hwloc object type for the NUMA level, which may not exist.
-
-  hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX ;
-
-  {
-    // Object types to search, in order.
-    static const hwloc_obj_type_t candidate_root_type[] =
-      { HWLOC_OBJ_NODE     /* NUMA region     */
-      , HWLOC_OBJ_SOCKET   /* hardware socket */
-      , HWLOC_OBJ_MACHINE  /* local machine   */
-      };
-
-    enum { CANDIDATE_ROOT_TYPE_COUNT =
-             sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t) };
-
-    for ( int k = 0 ; k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type ; ++k ) {
-      if ( 0 < hwloc_get_nbobjs_by_type( s_hwloc_topology , candidate_root_type[k] ) ) {
-        root_type = candidate_root_type[k] ;
-      }
-    }
-  }
-
-  // Determine which of these 'root' types are available to this process.
-  // The process may have been bound (e.g., by MPI) to a subset of these root types.
-  // Determine current location of the master (calling) process>
-
-  hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc();
-
-  hwloc_get_last_cpu_location( s_hwloc_topology , proc_cpuset_location , HWLOC_CPUBIND_THREAD );
-
-  const unsigned max_root = hwloc_get_nbobjs_by_type( s_hwloc_topology , root_type );
-
-  unsigned root_base     = max_root ;
-  unsigned root_count    = 0 ;
-  unsigned core_per_root = 0 ;
-  unsigned pu_per_core   = 0 ;
-  bool     symmetric     = true ;
-
-  for ( unsigned i = 0 ; i < max_root ; ++i ) {
-
-    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , i );
-
-    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {
-
-      ++root_count ;
-
-      // Remember which root (NUMA) object the master thread is running on.
-      // This will be logical NUMA rank #0 for this process.
-
-      if ( hwloc_bitmap_intersects( proc_cpuset_location, root->allowed_cpuset ) ) {
-        root_base = i ;
-      }
-
-      // Count available cores:
-
-      const unsigned max_core =
-        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
-                                                root->allowed_cpuset ,
-                                                HWLOC_OBJ_CORE );
-
-      unsigned core_count = 0 ;
-
-      for ( unsigned j = 0 ; j < max_core ; ++j ) {
-
-        const hwloc_obj_t core =
-          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
-                                               root->allowed_cpuset ,
-                                               HWLOC_OBJ_CORE , j );
-
-        // If process' cpuset intersects core's cpuset then process can access this core.
-        // Must use intersection instead of inclusion because the Intel-Phi
-        // MPI may bind the process to only one of the core's hyperthreads.
-        //
-        // Assumption: if the process can access any hyperthread of the core
-        // then it has ownership of the entire core.
-        // This assumes that it would be performance-detrimental
-        // to spawn more than one MPI process per core and use nested threading.
-
-        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
-
-          ++core_count ;
-
-          const unsigned pu_count =
-            hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
-                                                    core->allowed_cpuset ,
-                                                    HWLOC_OBJ_PU );
-
-          if ( pu_per_core == 0 ) pu_per_core = pu_count ;
-
-          // Enforce symmetry by taking the minimum:
-
-          pu_per_core = std::min( pu_per_core , pu_count );
-
-          if ( pu_count != pu_per_core ) symmetric = false ;
-        }
-      }
-
-      if ( 0 == core_per_root ) core_per_root = core_count ;
-
-      // Enforce symmetry by taking the minimum:
-
-      core_per_root = std::min( core_per_root , core_count );
-
-      if ( core_count != core_per_root ) symmetric = false ;
-    }
-  }
-
-  s_core_topology.first  = root_count ;
-  s_core_topology.second = core_per_root ;
-  s_core_capacity        = pu_per_core ;
-
-  // Fill the 's_core' array for fast mapping from a core coordinate to the
-  // hwloc cpuset object required for thread location querying and binding.
-
-  for ( unsigned i = 0 ; i < max_root ; ++i ) {
-
-    const unsigned root_rank = ( i + root_base ) % max_root ;
-
-    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , root_rank );
-
-    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {
-
-      const unsigned max_core =
-        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
-                                                root->allowed_cpuset ,
-                                                HWLOC_OBJ_CORE );
-
-      unsigned core_count = 0 ;
-
-      for ( unsigned j = 0 ; j < max_core && core_count < core_per_root ; ++j ) {
-
-        const hwloc_obj_t core =
-          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
-                                               root->allowed_cpuset ,
-                                               HWLOC_OBJ_CORE , j );
-
-        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
-
-          s_core[ core_count + core_per_root * i ] = core->allowed_cpuset ;
-
-          ++core_count ;
-        }
-      }
-    }
-  }
-
-  hwloc_bitmap_free( proc_cpuset_location );
-
-  if ( ! symmetric ) {
-    std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
-              << std::endl ;
-  }
-}
-
-
-} // namespace
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-bool available()
-{ return true ; }
-
-unsigned get_available_numa_count()
-{ sentinel(); return s_core_topology.first ; }
-
-unsigned get_available_cores_per_numa()
-{ sentinel(); return s_core_topology.second ; }
-
-unsigned get_available_threads_per_core()
-{ sentinel(); return s_core_capacity ; }
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-unsigned bind_this_thread(
-  const unsigned               coordinate_count ,
-  std::pair<unsigned,unsigned> coordinate[] )
-{
-  unsigned i = 0 ;
-
-  try {
-    const std::pair<unsigned,unsigned> current = get_this_thread_coordinate();
-
-    // Match one of the requests:
-    for ( i = 0 ; i < coordinate_count && current != coordinate[i] ; ++i );
-
-    if ( coordinate_count == i ) {
-      // Match the first request (typically NUMA):
-      for ( i = 0 ; i < coordinate_count && current.first != coordinate[i].first ; ++i );
-    }
-
-    if ( coordinate_count == i ) {
-      // Match any unclaimed request:
-      for ( i = 0 ; i < coordinate_count && ~0u == coordinate[i].first  ; ++i );
-    }
-
-    if ( coordinate_count == i || ! bind_this_thread( coordinate[i] ) ) {
-       // Failed to bind:
-       i = ~0u ;
-    }
-
-    if ( i < coordinate_count ) {
-
-#if DEBUG_PRINT
-      if ( current != coordinate[i] ) {
-        std::cout << "  bind_this_thread: rebinding from ("
-                  << current.first << ","
-                  << current.second
-                  << ") to ("
-                  << coordinate[i].first << ","
-                  << coordinate[i].second
-                  << ")" << std::endl ;
-      }
-#endif
-
-      coordinate[i].first  = ~0u ;
-      coordinate[i].second = ~0u ;
-    }
-  }
-  catch( ... ) {
-    i = ~0u ;
-  }
-
-  return i ;
-}
-
-
-bool bind_this_thread( const std::pair<unsigned,unsigned> coord )
-{
-  if ( ! sentinel() ) return false ;
-
-#if DEBUG_PRINT
-
-  std::cout << "Kokkos::bind_this_thread() at " ;
-
-  hwloc_get_last_cpu_location( s_hwloc_topology ,
-                               s_hwloc_location , HWLOC_CPUBIND_THREAD );
-
-  print_bitmap( std::cout , s_hwloc_location );
-
-  std::cout << " to " ;
-
-  print_bitmap( std::cout , s_core[ coord.second + coord.first * s_core_topology.second ] );
-
-  std::cout << std::endl ;
-
-#endif
-
-  // As safe and fast as possible.
-  // Fast-lookup by caching the coordinate -> hwloc cpuset mapping in 's_core'.
-  return coord.first  < s_core_topology.first &&
-         coord.second < s_core_topology.second &&
-         0 == hwloc_set_cpubind( s_hwloc_topology ,
-                                 s_core[ coord.second + coord.first * s_core_topology.second ] ,
-                                 HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT );
-}
-
-bool unbind_this_thread()
-{
-  if ( ! sentinel() ) return false ;
-
-#define HWLOC_DEBUG_PRINT 0
-
-#if HWLOC_DEBUG_PRINT
-
-  std::cout << "Kokkos::unbind_this_thread() from " ;
-
-  hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD );
-
-  print_bitmap( std::cout , s_hwloc_location );
-
-#endif
-
-  const bool result =
-    s_hwloc_topology &&
-    0 == hwloc_set_cpubind( s_hwloc_topology ,
-                            s_process_binding ,
-                            HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT );
-
-#if HWLOC_DEBUG_PRINT
-
-  std::cout << " to " ;
-
-  hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD );
-
-  print_bitmap( std::cout , s_hwloc_location );
-
-  std::cout << std::endl ;
-
-#endif
-
-  return result ;
-
-#undef HWLOC_DEBUG_PRINT
-
-}
-
-//----------------------------------------------------------------------------
-
-std::pair<unsigned,unsigned> get_this_thread_coordinate()
-{
-  std::pair<unsigned,unsigned> coord(0u,0u);
-
-  if ( ! sentinel() ) return coord ;
-
-  const unsigned n = s_core_topology.first * s_core_topology.second ;
-
-  // Using the pre-allocated 's_hwloc_location' to avoid memory
-  // allocation by this thread.  This call is NOT thread-safe.
-  hwloc_get_last_cpu_location( s_hwloc_topology ,
-                               s_hwloc_location , HWLOC_CPUBIND_THREAD );
-
-  unsigned i = 0 ;
-
-  while ( i < n && ! hwloc_bitmap_intersects( s_hwloc_location , s_core[ i ] ) ) ++i ;
-
-  if ( i < n ) {
-    coord.first  = i / s_core_topology.second ;
-    coord.second = i % s_core_topology.second ;
-  }
-
-  return coord ;
-}
-
-//----------------------------------------------------------------------------
-
-} /* namespace hwloc */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#else /* ! defined( KOKKOS_HAVE_HWLOC ) */
-
-namespace Kokkos {
-namespace hwloc {
-
-bool available() { return false ; }
-
-unsigned get_available_numa_count() { return 1 ; }
-unsigned get_available_cores_per_numa() { return 1 ; }
-unsigned get_available_threads_per_core() { return 1 ; }
-
-unsigned bind_this_thread( const unsigned , std::pair<unsigned,unsigned>[] )
-{ return ~0 ; }
-
-bool bind_this_thread( const std::pair<unsigned,unsigned> )
-{ return false ; }
-
-bool unbind_this_thread()
-{ return true ; }
-
-std::pair<unsigned,unsigned> get_this_thread_coordinate()
-{ return std::pair<unsigned,unsigned>(0,0); }
-
-} // namespace hwloc
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif
-
-
--- a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
@ -1,80 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Macros.hpp>
-#include <impl/Kokkos_spinwait.hpp>
-
-/*--------------------------------------------------------------------------*/
-
-#if ( KOKKOS_ENABLE_ASM )
-  #if defined( __arm__ )
-    /* No-operation instruction to idle the thread. */
-    #define YIELD   asm volatile("nop")
-  #else
-    /* Pause instruction to prevent excess processor bus usage */
-    #define YIELD   asm volatile("pause\n":::"memory")
-  #endif
-#elif defined( KOKKOS_HAVE_WINTHREAD )
-  #include <process.h>
-  #define YIELD  Sleep(0)
-#else
-  #include <sched.h>
-  #define YIELD  sched_yield()
-#endif
-
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-void spinwait( volatile int & flag , const int value )
-{
-  while ( value == flag ) {
-    YIELD ;
-  }
-}
-#endif
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
--- a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
@ -1,64 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
-//              Copyright (2012) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions?  Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-
-#ifndef KOKKOS_SPINWAIT_HPP
-#define KOKKOS_SPINWAIT_HPP
-
-#include <Kokkos_Macros.hpp>
-
-namespace Kokkos {
-namespace Impl {
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-void spinwait( volatile int & flag , const int value );
-#else
-KOKKOS_INLINE_FUNCTION
-void spinwait( volatile int & , const int ) {}
-#endif
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-#endif /* #ifndef KOKKOS_SPINWAIT_HPP */
-