Kokkos lib update

Steve Plimpton
2016-09-08 13:56:18 -06:00
parent 0252347d43
commit 236ebf7fab
212 changed files with 18902 additions and 13466 deletions

View File

@@ -1,4 +1,15 @@
+IF(COMMAND TRIBITS_PACKAGE_DECL)
+  SET(KOKKOS_HAS_TRILINOS ON CACHE BOOL "")
+ELSE()
+  SET(KOKKOS_HAS_TRILINOS OFF CACHE BOOL "")
+ENDIF()
+IF(NOT KOKKOS_HAS_TRILINOS)
+  CMAKE_MINIMUM_REQUIRED(VERSION 2.8.11 FATAL_ERROR)
+  INCLUDE(cmake/tribits.cmake)
+ENDIF()
 #
 # A) Forward delcare the package so that certain options are also defined for
 # subpackages
@@ -12,7 +23,22 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
 # subpackages as well.
 #
-TRIBITS_ADD_DEBUG_OPTION()
+# mfh 01 Aug 2016: See Issue #61:
+#
+# https://github.com/kokkos/kokkos/issues/61
+#
+# Don't use TRIBITS_ADD_DEBUG_OPTION() here, because that defines
+# HAVE_KOKKOS_DEBUG. We define KOKKOS_HAVE_DEBUG here instead,
+# for compatibility with Kokkos' Makefile build system.
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  ${PACKAGE_NAME}_ENABLE_DEBUG
+  ${PACKAGE_NAME_UC}_HAVE_DEBUG
+  "Enable run-time debug checks. These checks may be expensive, so they are disabled by default in a release build."
+  ${${PROJECT_NAME}_ENABLE_DEBUG}
+)
 TRIBITS_ADD_OPTION_AND_DEFINE(
   Kokkos_ENABLE_SIERRA_BUILD
@@ -82,11 +108,33 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
   "${TPL_ENABLE_MPI}"
 )
+# Set default value of Kokkos_ENABLE_Debug_Bounds_Check option
+#
+# CMake is case sensitive. The Kokkos_ENABLE_Debug_Bounds_Check
+# option (defined below) is annoyingly not all caps, but we need to
+# keep it that way for backwards compatibility. If users forget and
+# try using an all-caps variable, then make it count by using the
+# all-caps version as the default value of the original, not-all-caps
+# option. Otherwise, the default value of this option comes from
+# Kokkos_ENABLE_DEBUG (see Issue #367).
+ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_DEBUG)
+IF(DEFINED Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
+  IF(Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
+    SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT ON)
+  ELSE()
+    SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
+  ENDIF()
+ELSE()
+  SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
+ENDIF()
+ASSERT_DEFINED(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT)
 TRIBITS_ADD_OPTION_AND_DEFINE(
   Kokkos_ENABLE_Debug_Bounds_Check
   KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
-  "Enable bounds checking support in Kokkos."
-  OFF
+  "Enable Kokkos::View run-time bounds checking."
+  "${Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT}"
 )
 TRIBITS_ADD_OPTION_AND_DEFINE(
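
The net effect of the Debug_Bounds_Check option above is a KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK preprocessor define. A minimal sketch of what enabling it means for user code, assuming a build configured with the option ON (the view size and indices here are made up for illustration):

    #include <Kokkos_Core.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        Kokkos::View<double*> a("A", 10);
        a(5) = 1.0;   // in range: identical behavior with or without the check
        // With KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK defined, an out-of-range
        // access such as a(42) aborts with a bounds-error message instead
        // of silently reading or writing past the allocation; without the
        // define the access is unchecked.
      }
      Kokkos::finalize();
      return 0;
    }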

View File

@@ -7,7 +7,7 @@ CXXFLAGS=$(CCFLAGS)
 #Options: OpenMP,Serial,Pthreads,Cuda
 KOKKOS_DEVICES ?= "OpenMP"
 #KOKKOS_DEVICES ?= "Pthreads"
-#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8,KNL
+#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv8,BGQ,Power7,Power8,KNL,BDW
 KOKKOS_ARCH ?= ""
 #Options: yes,no
 KOKKOS_DEBUG ?= "no"
@@ -97,6 +97,7 @@ KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda |
 KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
 #NVIDIA based
@@ -108,10 +109,12 @@ KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(strip $(shell echo $(KOKKOS_ARCH) | grep
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell50 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal61 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
   + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
   + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
   + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
+  + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
   + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
   + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
   + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
@@ -123,6 +126,7 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
   + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
   + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
   + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
+  + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
   + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
   + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
   + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
@@ -142,11 +146,11 @@ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AM
 #Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
-KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc ))
+KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
 # Decide what ISA level we are able to support
-KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
+KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
 KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
 KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc ))
@@ -304,8 +308,8 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
   tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
-  KOKKOS_CXXFLAGS += -mcpu=power8
-  KOKKOS_LDFLAGS += -mcpu=power8
+  KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+  KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
@@ -321,8 +325,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
   else
     # Assume that this is a really a GNU compiler
-    KOKKOS_CXXFLAGS += -march=core-avx2
-    KOKKOS_LDFLAGS += -march=core-avx2
+    KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
+    KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2
   endif
 endif
 endif
@@ -390,6 +394,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
   tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
   KOKKOS_CXXFLAGS += -arch=sm_53
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
+  tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += -arch=sm_61
+endif
 endif
 KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
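
Each selected KOKKOS_ARCH value is also recorded as a KOKKOS_ARCH_* macro in the generated KokkosCore_config.h (see the echo lines above), so code can branch on the build target at compile time. A minimal sketch, assuming a tree configured with one of the architectures above (the probe program itself is hypothetical):

    // arch_probe.cpp -- compile with the include path that holds the
    // generated KokkosCore_config.h.
    #include <cstdio>
    #include <KokkosCore_config.h>

    int main() {
    #if defined(KOKKOS_ARCH_PASCAL61)
      std::puts("built for NVIDIA Pascal (sm_61)");
    #elif defined(KOKKOS_ARCH_POWER8)
      std::puts("built for IBM POWER8");
    #else
      std::puts("no arch-specific macro set");
    #endif
      return 0;
    }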

View File

@@ -1,9 +1,5 @@
 Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
-Kokkos_AllocationTracker.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
-Kokkos_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
 Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
 Kokkos_CPUDiscovery.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp
@@ -20,6 +16,10 @@ Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Seria
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
 Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
+Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
+Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
 Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
 Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
@@ -32,12 +32,12 @@ Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_M
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-Kokkos_Cuda_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
 Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
 Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
+Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
 Kokkos_Cuda_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
 endif
@@ -61,6 +61,8 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 Kokkos_OpenMPexec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
+Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
 endif
 Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp

View File

@@ -37,7 +37,7 @@ hcedwar(at)sandia.gov and crtrott(at)sandia.gov
 ====Requirements============================================================
 ============================================================================
-Primary tested compilers are:
+Primary tested compilers on X86 are:
     GCC 4.7.2
     GCC 4.8.4
     GCC 4.9.2
@@ -48,26 +48,43 @@ Primary tested compilers are:
     Clang 3.5.2
     Clang 3.6.1
+Primary tested compilers on Power 8 are:
+    IBM XL 13.1.3 (OpenMP,Serial)
+    GCC 4.9.2 (OpenMP,Serial)
+    GCC 5.3.0 (OpenMP,Serial)
 Secondary tested compilers are:
     CUDA 6.5 (with gcc 4.7.2)
     CUDA 7.0 (with gcc 4.7.2)
     CUDA 7.5 (with gcc 4.8.4)
 Other compilers working:
+    X86:
+        Intel 17.0.042 (the FENL example causes internal compiler error)
     PGI 15.4
+        IBM XL 13.1.2
     Cygwin 2.1.0 64bit with gcc 4.9.3
+    KNL:
+        Intel 16.2.181 (the FENL example causes internal compiler error)
+        Intel 17.0.042 (the FENL example causes internal compiler error)
+Known non-working combinations:
+    Power8:
+        GCC 6.1.0
+        Pthreads backend
 Primary tested compiler are passing in release mode
-with warnings as errors. We are using the following set
-of flags:
+with warnings as errors. They also are tested with a comprehensive set of
+backend combinations (i.e. OpenMP, Pthreads, Serial, OpenMP+Serial, ...).
+We are using the following set of flags:
 GCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits
      -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
 Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
 Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
 Secondary compilers are passing without -Werror.
-Other compilers are tested occasionally.
+Other compilers are tested occasionally, in particular when pushing from develop to
+master branch, without -Werror and only for a select set of backends.
 ============================================================================
 ====Getting started=========================================================

View File

@@ -771,6 +771,7 @@ namespace Kokkos {
     friend class Random_XorShift1024_Pool<DeviceType>;
   public:
+    typedef Random_XorShift1024_Pool<DeviceType> pool_type;
     typedef DeviceType device_type;
     enum {MAX_URAND = 0xffffffffU};
@@ -779,10 +780,10 @@
     enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
     KOKKOS_INLINE_FUNCTION
-    Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
       p_(p),state_idx_(state_idx){
       for(int i=0 ; i<16; i++)
-        state_[i] = state[i];
+        state_[i] = state(state_idx,i);
     }
     KOKKOS_INLINE_FUNCTION
@@ -933,6 +934,7 @@
     state_data_type state_;
     int_view_type p_;
     int num_states_;
+    friend class Random_XorShift1024<DeviceType>;
   public:
     typedef Random_XorShift1024<DeviceType> generator_type;
@@ -1001,7 +1003,7 @@
     KOKKOS_INLINE_FUNCTION
     Random_XorShift1024<DeviceType> get_state() const {
       const int i = DeviceType::hardware_thread_id();
-      return Random_XorShift1024<DeviceType>(&state_(i,0),p_(i),i);
+      return Random_XorShift1024<DeviceType>(state_,p_(i),i);
     };
     KOKKOS_INLINE_FUNCTION
@@ -1020,10 +1022,12 @@
     int p_;
     const int state_idx_;
     uint64_t* state_;
+    const int stride_;
     friend class Random_XorShift1024_Pool<Kokkos::Cuda>;
   public:
     typedef Kokkos::Cuda device_type;
+    typedef Random_XorShift1024_Pool<device_type> pool_type;
     enum {MAX_URAND = 0xffffffffU};
     enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
@@ -1031,30 +1035,30 @@
     enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
     KOKKOS_INLINE_FUNCTION
-    Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
-      p_(p),state_idx_(state_idx),state_(state){
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
     }
     KOKKOS_INLINE_FUNCTION
     uint32_t urand() {
-      uint64_t state_0 = state_[ p_ ];
-      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
       state_1 ^= state_1 << 31;
      state_1 ^= state_1 >> 11;
       state_0 ^= state_0 >> 30;
-      uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
       tmp = tmp>>16;
       return static_cast<uint32_t>(tmp&MAX_URAND);
     }
     KOKKOS_INLINE_FUNCTION
     uint64_t urand64() {
-      uint64_t state_0 = state_[ p_ ];
-      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
       state_1 ^= state_1 << 31;
       state_1 ^= state_1 >> 11;
       state_0 ^= state_0 >> 30;
-      return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
+      return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
     }
     KOKKOS_INLINE_FUNCTION
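
For reference, the update in urand()/urand64() above is the xorshift1024* step; the stride only changes how the 16-word state is laid out in memory, not the arithmetic. A self-contained sketch of the same generator with a contiguous state array, assuming any nonzero seed (the seeding below is a toy choice):

    #include <cstdint>
    #include <cstdio>

    // xorshift1024* with contiguous state; same shifts (31, 11, 30) and
    // multiplier 1181783497276652981 as the Kokkos generator above.
    struct XorShift1024 {
      uint64_t s[16];  // must not be all zero
      int p = 0;

      uint64_t next() {
        uint64_t s0 = s[p];
        uint64_t s1 = s[p = (p + 1) & 15];
        s1 ^= s1 << 31;
        s1 ^= s1 >> 11;
        s0 ^= s0 >> 30;
        return (s[p] = s0 ^ s1) * 1181783497276652981ULL;
      }
    };

    int main() {
      XorShift1024 g;
      for (int i = 0; i < 16; ++i) g.s[i] = 0x9E3779B97F4A7C15ULL * (i + 1);
      std::printf("%llu\n", (unsigned long long)g.next());
      return 0;
    }
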
@@ -1227,9 +1231,9 @@ Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_st
     if(i>=num_states_) {i = i_offset;}
   }
-  return Random_XorShift1024<Kokkos::Cuda>(&state_(i,0), p_(i), i);
+  return Random_XorShift1024<Kokkos::Cuda>(state_, p_(i), i);
 #else
-  return Random_XorShift1024<Kokkos::Cuda>(&state_(0,0), p_(0), 0);
+  return Random_XorShift1024<Kokkos::Cuda>(state_, p_(0), 0);
 #endif
 }
@@ -1248,14 +1252,15 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
 #endif
+namespace Impl {
-template<class ViewType, class RandomPool, int loops, int rank>
+template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
 struct fill_random_functor_range;
-template<class ViewType, class RandomPool, int loops, int rank>
+template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
 struct fill_random_functor_begin_end;
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,1,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1268,19 +1273,19 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (const IndexType& i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0())
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0()))
         a(idx) = Rand::draw(gen,range);
     }
     rand_pool.free_state(gen);
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,2,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1293,12 +1298,12 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
          a(idx,k) = Rand::draw(gen,range);
       }
     }
@@ -1307,8 +1312,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,3,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1321,13 +1326,13 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
            a(idx,k,l) = Rand::draw(gen,range);
       }
     }
@@ -1335,8 +1340,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,4, IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1349,14 +1354,14 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
              a(idx,k,l,m) = Rand::draw(gen,range);
       }
     }
@@ -1364,8 +1369,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,5,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1378,15 +1383,15 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
                a(idx,k,l,m,n) = Rand::draw(gen,range);
       }
     }
@@ -1394,8 +1399,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,6,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1408,16 +1413,16 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
                  a(idx,k,l,m,n,o) = Rand::draw(gen,range);
       }
     }
@@ -1425,8 +1430,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,7,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1439,17 +1444,17 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
-                  for(unsigned int p=0;p<a.dimension_6();p++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
                    a(idx,k,l,m,n,o,p) = Rand::draw(gen,range);
       }
     }
@@ -1457,8 +1462,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,8,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1471,26 +1476,26 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
-                  for(unsigned int p=0;p<a.dimension_6();p++)
-                    for(unsigned int q=0;q<a.dimension_7();q++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
+                    for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++)
                      a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,range);
       }
     }
     rand_pool.free_state(gen);
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1503,19 +1508,19 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0())
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0()))
        a(idx) = Rand::draw(gen,begin,end);
     }
     rand_pool.free_state(gen);
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1528,12 +1533,12 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
          a(idx,k) = Rand::draw(gen,begin,end);
       }
     }
@@ -1542,8 +1547,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1556,13 +1561,13 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
            a(idx,k,l) = Rand::draw(gen,begin,end);
       }
     }
@@ -1570,8 +1575,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1584,14 +1589,14 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
              a(idx,k,l,m) = Rand::draw(gen,begin,end);
       }
     }
@@ -1599,8 +1604,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1613,15 +1618,15 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()){
-        for(unsigned int l=0;l<a.dimension_1();l++)
-          for(unsigned int m=0;m<a.dimension_2();m++)
-            for(unsigned int n=0;n<a.dimension_3();n++)
-              for(unsigned int o=0;o<a.dimension_4();o++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())){
+        for(IndexType l=0;l<static_cast<IndexType>(a.dimension_1());l++)
+          for(IndexType m=0;m<static_cast<IndexType>(a.dimension_2());m++)
+            for(IndexType n=0;n<static_cast<IndexType>(a.dimension_3());n++)
+              for(IndexType o=0;o<static_cast<IndexType>(a.dimension_4());o++)
               a(idx,l,m,n,o) = Rand::draw(gen,begin,end);
       }
     }
@@ -1629,8 +1634,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1643,16 +1648,16 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
                  a(idx,k,l,m,n,o) = Rand::draw(gen,begin,end);
       }
     }
@@ -1661,8 +1666,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1675,17 +1680,17 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
-                  for(unsigned int p=0;p<a.dimension_6();p++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
                    a(idx,k,l,m,n,o,p) = Rand::draw(gen,begin,end);
       }
     }
@@ -1693,8 +1698,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1707,18 +1712,18 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
-                  for(unsigned int p=0;p<a.dimension_6();p++)
-                    for(unsigned int q=0;q<a.dimension_7();q++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
+                    for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++)
                      a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,begin,end);
       }
     }
@@ -1726,18 +1731,20 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
   }
 };
-template<class ViewType, class RandomPool>
+}
+template<class ViewType, class RandomPool, class IndexType = int64_t>
 void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) {
   int64_t LDA = a.dimension_0();
   if(LDA>0)
-    parallel_for((LDA+127)/128,fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank>(a,g,range));
+    parallel_for((LDA+127)/128,Impl::fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,range));
 }
-template<class ViewType, class RandomPool>
+template<class ViewType, class RandomPool, class IndexType = int64_t>
 void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin,typename ViewType::const_value_type end ) {
   int64_t LDA = a.dimension_0();
   if(LDA>0)
-    parallel_for((LDA+127)/128,fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank>(a,g,begin,end));
+    parallel_for((LDA+127)/128,Impl::fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,begin,end));
 }
 }
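
The Impl:: functors above are what the two public fill_random overloads dispatch to, one work item filling 128 consecutive entries per pool generator. A minimal usage sketch (view extent and seed are arbitrary):

    #include <Kokkos_Core.hpp>
    #include <Kokkos_Random.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        Kokkos::View<double*> a("A", 1 << 20);
        // Pool of per-thread generators, seeded once.
        Kokkos::Random_XorShift64_Pool<> pool(12345);
        // Uniform draws between 0 and 1; dispatches to the rank-1
        // Impl::fill_random_functor_begin_end with the default
        // IndexType = int64_t.
        Kokkos::fill_random(a, pool, 0.0, 1.0);
      }
      Kokkos::finalize();
      return 0;
    }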

View File

@@ -50,6 +50,7 @@
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 #include <cmath>
+#include <chrono>
 namespace Test {
@@ -207,7 +208,6 @@ struct test_histogram1d_functor {
     density_1d (d1d),
     mean (1.0*num_draws/HIST_DIM1D*3)
   {
-    printf ("Mean: %e\n", mean);
   }
   KOKKOS_INLINE_FUNCTION void
@@ -295,7 +295,7 @@ struct test_random_scalar {
     parallel_reduce (num_draws/1024, functor_type (pool, density_1d, density_3d), result);
     //printf("Result: %lf %lf %lf\n",result.mean/num_draws/3,result.variance/num_draws/3,result.covariance/num_draws/2);
-    double tolerance = 2.0*sqrt(1.0/num_draws);
+    double tolerance = 1.6*sqrt(1.0/num_draws);
     double mean_expect = 0.5*Kokkos::rand<rnd_type,Scalar>::max();
     double variance_expect = 1.0/3.0*mean_expect*mean_expect;
     double mean_eps = mean_expect/(result.mean/num_draws/3)-1.0;
@@ -303,10 +303,10 @@ struct test_random_scalar {
     double covariance_eps = result.covariance/num_draws/2/variance_expect;
     pass_mean = ((-tolerance < mean_eps) &&
                  ( tolerance > mean_eps)) ? 1:0;
-    pass_var = ((-tolerance < variance_eps) &&
-                ( tolerance > variance_eps)) ? 1:0;
-    pass_covar = ((-1.4*tolerance < covariance_eps) &&
-                  ( 1.4*tolerance > covariance_eps)) ? 1:0;
+    pass_var = ((-1.5*tolerance < variance_eps) &&
+                ( 1.5*tolerance > variance_eps)) ? 1:0;
+    pass_covar = ((-2.0*tolerance < covariance_eps) &&
+                  ( 2.0*tolerance > covariance_eps)) ? 1:0;
     cerr << "Pass: " << pass_mean
          << " " << pass_var
         << " " << mean_eps
@@ -328,12 +328,12 @@ struct test_random_scalar {
     double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0;
     double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0;
     double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
-    pass_hist1d_mean = ((-tolerance < mean_eps) &&
-                        ( tolerance > mean_eps)) ? 1:0;
-    pass_hist1d_var = ((-tolerance < variance_eps) &&
-                       ( tolerance > variance_eps)) ? 1:0;
-    pass_hist1d_covar = ((-tolerance < covariance_eps) &&
-                         ( tolerance > covariance_eps)) ? 1:0;
+    pass_hist1d_mean = ((-0.0001 < mean_eps) &&
+                        ( 0.0001 > mean_eps)) ? 1:0;
+    pass_hist1d_var = ((-0.07 < variance_eps) &&
+                       ( 0.07 > variance_eps)) ? 1:0;
+    pass_hist1d_covar = ((-0.06 < covariance_eps) &&
+                         ( 0.06 > covariance_eps)) ? 1:0;
     cerr << "Density 1D: " << mean_eps
          << " " << variance_eps
@@ -363,8 +363,8 @@ struct test_random_scalar {
     double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
     pass_hist3d_mean = ((-tolerance < mean_eps) &&
                         ( tolerance > mean_eps)) ? 1:0;
-    pass_hist3d_var = ((-tolerance < variance_eps) &&
-                       ( tolerance > variance_eps)) ? 1:0;
+    pass_hist3d_var = ((-1.2*tolerance < variance_eps) &&
+                       ( 1.2*tolerance > variance_eps)) ? 1:0;
     pass_hist3d_covar = ((-tolerance < covariance_eps) &&
                          ( tolerance > covariance_eps)) ? 1:0;
@@ -386,8 +386,13 @@ void test_random(unsigned int num_draws)
   typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
   typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
+  uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
+  cerr << "Test Seed:" << ticks << endl;
+  RandomGenerator pool(ticks);
   cerr << "Test Scalar=int" << endl;
-  RandomGenerator pool(31891);
   test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
   ASSERT_EQ( test_int.pass_mean,1);
   ASSERT_EQ( test_int.pass_var,1);

View File

@ -0,0 +1,79 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
# Check for CUDA support
SET(_CUDA_FAILURE OFF)
# Have CMake find CUDA
IF(NOT _CUDA_FAILURE)
FIND_PACKAGE(CUDA 3.2)
IF (NOT CUDA_FOUND)
SET(_CUDA_FAILURE ON)
ENDIF()
ENDIF()
IF(NOT _CUDA_FAILURE)
# If CUDA was found, set up the CUDA TPL
macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target)
TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY)
endmacro()
GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS)
GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY})
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUDA)
ELSE()
SET(TPL_ENABLE_CUDA OFF)
ENDIF()

View File

@ -0,0 +1,64 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
include(${TRIBITS_DEPS_DIR}/CUDA.cmake)
IF (TPL_ENABLE_CUDA)
GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY})
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
ENDIF()

View File

@ -0,0 +1,70 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
#-----------------------------------------------------------------------------
# Hardware locality detection and control library.
#
# Acquisition information:
# Date checked: November 2011
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
# Source: http://www.open-mpi.org/projects/hwloc/
# Version: 1.3
#
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC
REQUIRED_HEADERS hwloc.h
REQUIRED_LIBS_NAMES "hwloc"
)

View File

@ -0,0 +1,83 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
SET(USE_THREADS FALSE)
IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES)
# Use CMake's Thread finder since it is a bit smarter in determining
# whether pthreads is already built into the compiler and doesn't need
# a library to link.
FIND_PACKAGE(Threads)
#If Threads found a copy of pthreads make sure it is one of the cases the tribits
#tpl system cannot handle.
IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread")
SET(USE_THREADS TRUE)
ENDIF()
ENDIF()
ENDIF()
IF(USE_THREADS)
SET(TPL_Pthread_INCLUDE_DIRS "")
SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
SET(TPL_Pthread_LIBRARY_DIRS "")
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(Pthread)
ELSE()
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread
REQUIRED_HEADERS pthread.h
REQUIRED_LIBS_NAMES pthread
)
ENDIF()

View File

@ -0,0 +1,70 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
#-----------------------------------------------------------------------------
# Qthreads user-level lightweight threading library.
#
# Acquisition information:
# Date checked: July 2014
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
# Source: https://code.google.com/p/qthreads
#
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
REQUIRED_HEADERS qthread.h
REQUIRED_LIBS_NAMES "qthread"
)

View File

@ -0,0 +1,485 @@
INCLUDE(CMakeParseArguments)
INCLUDE(CTest)
FUNCTION(ASSERT_DEFINED VARS)
FOREACH(VAR ${VARS})
IF(NOT DEFINED ${VAR})
MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!")
ENDIF()
ENDFOREACH()
ENDFUNCTION()
MACRO(GLOBAL_SET VARNAME)
SET(${VARNAME} ${ARGN} CACHE INTERNAL "")
ENDMACRO()
MACRO(PREPEND_GLOBAL_SET VARNAME)
ASSERT_DEFINED(${VARNAME})
GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}})
ENDMACRO()
FUNCTION(REMOVE_GLOBAL_DUPLICATES VARNAME)
ASSERT_DEFINED(${VARNAME})
IF (${VARNAME})
SET(TMP ${${VARNAME}})
LIST(REMOVE_DUPLICATES TMP)
GLOBAL_SET(${VARNAME} ${TMP})
ENDIF()
ENDFUNCTION()
MACRO(TRIBITS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE)
MESSAGE(STATUS "TRIBITS_ADD_OPTION_AND_DEFINE: '${USER_OPTION_NAME}' '${MACRO_DEFINE_NAME}' '${DEFAULT_VALUE}'")
SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" )
IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "")
IF(${USER_OPTION_NAME})
GLOBAL_SET(${MACRO_DEFINE_NAME} ON)
ELSE()
GLOBAL_SET(${MACRO_DEFINE_NAME} OFF)
ENDIF()
ENDIF()
ENDMACRO()
FUNCTION(TRIBITS_CONFIGURE_FILE PACKAGE_NAME_CONFIG_FILE)
# Configure the file
CONFIGURE_FILE(
${PACKAGE_SOURCE_DIR}/cmake/${PACKAGE_NAME_CONFIG_FILE}.in
${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_CONFIG_FILE}
)
ENDFUNCTION()
MACRO(TRIBITS_ADD_DEBUG_OPTION)
TRIBITS_ADD_OPTION_AND_DEFINE(
${PROJECT_NAME}_ENABLE_DEBUG
HAVE_${PROJECT_NAME_UC}_DEBUG
"Enable a host of runtime debug checking."
OFF
)
ENDMACRO()
MACRO(TRIBITS_ADD_TEST_DIRECTORIES)
FOREACH(TEST_DIR ${ARGN})
ADD_SUBDIRECTORY(${TEST_DIR})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_ADD_EXAMPLE_DIRECTORIES)
IF(${PACKAGE_NAME}_ENABLE_EXAMPLES OR ${PARENT_PACKAGE_NAME}_ENABLE_EXAMPLES)
FOREACH(EXAMPLE_DIR ${ARGN})
ADD_SUBDIRECTORY(${EXAMPLE_DIR})
ENDFOREACH()
ENDIF()
ENDMACRO()
MACRO(TARGET_TRANSFER_PROPERTY TARGET_NAME PROP_IN PROP_OUT)
SET(PROP_VALUES)
FOREACH(TARGET_X ${ARGN})
LIST(APPEND PROP_VALUES "$<TARGET_PROPERTY:${TARGET_X},${PROP_IN}>")
ENDFOREACH()
SET_TARGET_PROPERTIES(${TARGET_NAME} PROPERTIES ${PROP_OUT} "${PROP_VALUES}")
ENDMACRO()
MACRO(ADD_INTERFACE_LIBRARY LIB_NAME)
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "")
ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE)
ENDMACRO()
# Older versions of CMake do not make include directories transitive
MACRO(TARGET_LINK_AND_INCLUDE_LIBRARIES TARGET_NAME)
TARGET_LINK_LIBRARIES(${TARGET_NAME} LINK_PUBLIC ${ARGN})
FOREACH(DEP_LIB ${ARGN})
TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INTERFACE_INCLUDE_DIRECTORIES>)
TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INCLUDE_DIRECTORIES>)
ENDFOREACH()
ENDMACRO()
FUNCTION(TRIBITS_ADD_LIBRARY LIBRARY_NAME)
SET(options STATIC SHARED TESTONLY NO_INSTALL_LIB_OR_HEADERS CUDALIBRARY)
SET(oneValueArgs)
SET(multiValueArgs HEADERS HEADERS_INSTALL_SUBDIR NOINSTALLHEADERS SOURCES DEPLIBS IMPORTEDLIBS DEFINES ADDED_LIB_TARGET_NAME_OUT)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
IF(PARSE_HEADERS)
LIST(REMOVE_DUPLICATES PARSE_HEADERS)
ENDIF()
IF(PARSE_SOURCES)
LIST(REMOVE_DUPLICATES PARSE_SOURCES)
ENDIF()
# Local variable to hold all of the libraries that will be directly linked
# to this library.
SET(LINK_LIBS ${${PACKAGE_NAME}_DEPS})
# Add dependent libraries passed directly in
IF (PARSE_IMPORTEDLIBS)
LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
ENDIF()
IF (PARSE_DEPLIBS)
LIST(APPEND LINK_LIBS ${PARSE_DEPLIBS})
ENDIF()
# Add the library and all the dependencies
IF (PARSE_DEFINES)
ADD_DEFINITIONS(${PARSE_DEFINES})
ENDIF()
IF (PARSE_STATIC)
SET(STATIC_KEYWORD "STATIC")
ELSE()
SET(STATIC_KEYWORD)
ENDIF()
IF (PARSE_SHARED)
SET(SHARED_KEYWORD "SHARED")
ELSE()
SET(SHARED_KEYWORD)
ENDIF()
IF (PARSE_TESTONLY)
SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
ELSE()
SET(EXCLUDE_FROM_ALL_KEYWORD)
ENDIF()
IF (NOT PARSE_CUDALIBRARY)
ADD_LIBRARY(
${LIBRARY_NAME}
${STATIC_KEYWORD}
${SHARED_KEYWORD}
${EXCLUDE_FROM_ALL_KEYWORD}
${PARSE_HEADERS}
${PARSE_NOINSTALLHEADERS}
${PARSE_SOURCES}
)
ELSE()
CUDA_ADD_LIBRARY(
${LIBRARY_NAME}
${PARSE_HEADERS}
${PARSE_NOINSTALLHEADERS}
${PARSE_SOURCES}
)
ENDIF()
TARGET_LINK_AND_INCLUDE_LIBRARIES(${LIBRARY_NAME} ${LINK_LIBS})
IF (NOT PARSE_TESTONLY OR PARSE_NO_INSTALL_LIB_OR_HEADERS)
INSTALL(
TARGETS ${LIBRARY_NAME}
EXPORT ${PROJECT_NAME}
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
COMPONENT ${PACKAGE_NAME}
)
INSTALL(
FILES ${PARSE_HEADERS}
EXPORT ${PROJECT_NAME}
DESTINATION include
COMPONENT ${PACKAGE_NAME}
)
INSTALL(
DIRECTORY ${PARSE_HEADERS_INSTALL_SUBDIR}
EXPORT ${PROJECT_NAME}
DESTINATION include
COMPONENT ${PACKAGE_NAME}
)
ENDIF()
IF (NOT PARSE_TESTONLY)
PREPEND_GLOBAL_SET(${PACKAGE_NAME}_LIBS ${LIBRARY_NAME})
REMOVE_GLOBAL_DUPLICATES(${PACKAGE_NAME}_LIBS)
ENDIF()
ENDFUNCTION()
FUNCTION(TRIBITS_ADD_EXECUTABLE EXE_NAME)
SET(options NOEXEPREFIX NOEXESUFFIX ADD_DIR_TO_NAME INSTALLABLE TESTONLY)
SET(oneValueArgs ADDED_EXE_TARGET_NAME_OUT)
SET(multiValueArgs SOURCES CATEGORIES HOST XHOST HOSTTYPE XHOSTTYPE DIRECTORY TESTONLYLIBS IMPORTEDLIBS DEPLIBS COMM LINKER_LANGUAGE TARGET_DEFINES DEFINES)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
SET(LINK_LIBS PACKAGE_${PACKAGE_NAME})
IF (PARSE_TESTONLYLIBS)
LIST(APPEND LINK_LIBS ${PARSE_TESTONLYLIBS})
ENDIF()
IF (PARSE_IMPORTEDLIBS)
LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
ENDIF()
SET (EXE_SOURCES)
IF(PARSE_DIRECTORY)
FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
IF(IS_ABSOLUTE ${SOURCE_FILE})
SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
ELSE()
SET (EXE_SOURCES ${EXE_SOURCES} ${PARSE_DIRECTORY}/${SOURCE_FILE})
ENDIF()
ENDFOREACH( )
ELSE()
FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
ENDFOREACH( )
ENDIF()
SET(EXE_BINARY_NAME ${EXE_NAME})
IF(DEFINED PACKAGE_NAME AND NOT PARSE_NOEXEPREFIX)
SET(EXE_BINARY_NAME ${PACKAGE_NAME}_${EXE_BINARY_NAME})
ENDIF()
IF (PARSE_TESTONLY)
SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
ELSE()
SET(EXCLUDE_FROM_ALL_KEYWORD)
ENDIF()
ADD_EXECUTABLE(${EXE_BINARY_NAME} ${EXCLUDE_FROM_ALL_KEYWORD} ${EXE_SOURCES})
IF (PARSE_TARGET_DEFINES)
TARGET_COMPILE_DEFINITIONS(${EXE_BINARY_NAME} PUBLIC ${PARSE_TARGET_DEFINES})
ENDIF()
TARGET_LINK_AND_INCLUDE_LIBRARIES(${EXE_BINARY_NAME} ${LINK_LIBS})
IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${EXE_BINARY_NAME} PARENT_SCOPE)
ENDIF()
IF(PARSE_INSTALLABLE)
INSTALL(
TARGETS ${EXE_BINARY_NAME}
EXPORT ${PROJECT_NAME}
DESTINATION bin
)
ENDIF()
ENDFUNCTION()
ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR})
FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)
SET(options STANDARD_PASS_OUTPUT WILL_FAIL)
SET(oneValueArgs PASS_REGULAR_EXPRESSION FAIL_REGULAR_EXPRESSION ENVIRONMENT TIMEOUT CATEGORIES ADDED_TESTS_NAMES_OUT ADDED_EXE_TARGET_NAME_OUT)
SET(multiValueArgs)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
TRIBITS_ADD_EXECUTABLE(${EXE_NAME} TESTONLY ADDED_EXE_TARGET_NAME_OUT TEST_NAME ${PARSE_UNPARSED_ARGUMENTS})
IF(WIN32)
ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${TEST_NAME}${CMAKE_EXECUTABLE_SUFFIX})
ELSE()
ADD_TEST(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
ENDIF()
ADD_DEPENDENCIES(check ${TEST_NAME})
IF(PARSE_FAIL_REGULAR_EXPRESSION)
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${PARSE_FAIL_REGULAR_EXPRESSION})
ENDIF()
IF(PARSE_PASS_REGULAR_EXPRESSION)
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${PARSE_PASS_REGULAR_EXPRESSION})
ENDIF()
IF(PARSE_WILL_FAIL)
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${PARSE_WILL_FAIL})
ENDIF()
IF(PARSE_ADDED_TESTS_NAMES_OUT)
SET(${PARSE_ADDED_TESTS_NAMES_OUT} ${TEST_NAME} PARENT_SCOPE)
ENDIF()
IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${TEST_NAME} PARENT_SCOPE)
ENDIF()
ENDFUNCTION()
MACRO(TIBITS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME)
ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME})
TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES})
TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS})
ENDMACRO()
FUNCTION(TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME)
SET(options MUST_FIND_ALL_LIBS MUST_FIND_ALL_HEADERS NO_PRINT_ENABLE_SUCCESS_FAIL)
SET(oneValueArgs)
SET(multiValueArgs REQUIRED_HEADERS REQUIRED_LIBS_NAMES)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE)
IF (PARSE_REQUIRED_LIBS_NAMES)
FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES})
IF(NOT TPL_${TPL_NAME}_LIBRARIES)
SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
ENDIF()
ENDIF()
IF (PARSE_REQUIRED_HEADERS)
FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS})
IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS)
SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
ENDIF()
ENDIF()
IF (_${TPL_NAME}_ENABLE_SUCCESS)
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME})
ENDIF()
ENDFUNCTION()
MACRO(TRIBITS_PROCESS_TPL_DEP_FILE TPL_FILE)
GET_FILENAME_COMPONENT(TPL_NAME ${TPL_FILE} NAME_WE)
INCLUDE("${TPL_FILE}")
IF(TARGET TPL_LIB_${TPL_NAME})
MESSAGE(STATUS "Found tpl library: ${TPL_NAME}")
SET(TPL_ENABLE_${TPL_NAME} TRUE)
ELSE()
MESSAGE(STATUS "Tpl library not found: ${TPL_NAME}")
SET(TPL_ENABLE_${TPL_NAME} FALSE)
ENDIF()
ENDMACRO()
MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE)
IF(TYPE STREQUAL "REQUIRED")
SET(REQUIRED TRUE)
ELSE()
SET(REQUIRED FALSE)
ENDIF()
IF(TARGET ${TARGET_NAME})
PREPEND_GLOBAL_SET(${VARNAME} ${TARGET_NAME})
ELSE()
IF(REQUIRED)
MESSAGE(FATAL_ERROR "Missing dependency ${TARGET_NAME}")
ENDIF()
ENDIF()
ENDMACRO()
MACRO(TRIBITS_APPEND_PACKAGE_DEPS DEP_LIST TYPE)
FOREACH(DEP ${ARGN})
PREPEND_GLOBAL_SET(${DEP_LIST} PACKAGE_${DEP})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_APPEND_TPLS_DEPS DEP_LIST TYPE)
FOREACH(DEP ${ARGN})
PREPEND_TARGET_SET(${DEP_LIST} TPL_LIB_${DEP} ${TYPE})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_ENABLE_TPLS)
FOREACH(TPL ${ARGN})
IF(TARGET ${TPL})
GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} TRUE)
ELSE()
GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} FALSE)
ENDIF()
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_PACKAGE_DEFINE_DEPENDENCIES)
SET(options)
SET(oneValueArgs)
SET(multiValueArgs
LIB_REQUIRED_PACKAGES
LIB_OPTIONAL_PACKAGES
TEST_REQUIRED_PACKAGES
TEST_OPTIONAL_PACKAGES
LIB_REQUIRED_TPLS
LIB_OPTIONAL_TPLS
TEST_REQUIRED_TPLS
TEST_OPTIONAL_TPLS
REGRESSION_EMAIL_LIST
SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
GLOBAL_SET(${PACKAGE_NAME}_DEPS "")
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_PACKAGES})
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_PACKAGES})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_TPLS})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_TPLS})
GLOBAL_SET(${PACKAGE_NAME}_TEST_DEPS "")
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_PACKAGES})
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_PACKAGES})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_TPLS})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_TPLS})
TRIBITS_ENABLE_TPLS(${PARSE_LIB_REQUIRED_TPLS} ${PARSE_LIB_OPTIONAL_TPLS} ${PARSE_TEST_REQUIRED_TPLS} ${PARSE_TEST_OPTIONAL_TPLS})
ENDMACRO()
MACRO(TRIBITS_SUBPACKAGE NAME)
SET(PACKAGE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME})
SET(PACKAGE_NAME ${PACKAGE_NAME}${NAME})
STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME})
GLOBAL_SET(${PACKAGE_NAME}_LIBS "")
INCLUDE(${PACKAGE_SOURCE_DIR}/cmake/Dependencies.cmake)
ENDMACRO(TRIBITS_SUBPACKAGE)
MACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
TARGET_LINK_AND_INCLUDE_LIBRARIES(PACKAGE_${PACKAGE_NAME} ${${PACKAGE_NAME}_LIBS})
ENDMACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
MACRO(TRIBITS_PACKAGE_DECL NAME)
PROJECT(${NAME})
STRING(TOUPPER ${PROJECT_NAME} PROJECT_NAME_UC)
SET(PACKAGE_NAME ${PROJECT_NAME})
STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps")
FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake")
FOREACH(TPL_FILE ${TPLS_FILES})
TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_PROCESS_SUBPACKAGES)
FILE(GLOB SUBPACKAGES RELATIVE ${CMAKE_SOURCE_DIR} */cmake/Dependencies.cmake)
FOREACH(SUBPACKAGE ${SUBPACKAGES})
GET_FILENAME_COMPONENT(SUBPACKAGE_CMAKE ${SUBPACKAGE} DIRECTORY)
GET_FILENAME_COMPONENT(SUBPACKAGE_DIR ${SUBPACKAGE_CMAKE} DIRECTORY)
ADD_SUBDIRECTORY(${SUBPACKAGE_DIR})
ENDFOREACH()
ENDMACRO(TRIBITS_PROCESS_SUBPACKAGES)
MACRO(TRIBITS_PACKAGE_DEF)
ENDMACRO(TRIBITS_PACKAGE_DEF)
MACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
ENDMACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
MACRO(TRIBITS_EXCLUDE_FILES)
ENDMACRO(TRIBITS_EXCLUDE_FILES)
MACRO(TRIBITS_PACKAGE_POSTPROCESS)
ENDMACRO(TRIBITS_PACKAGE_POSTPROCESS)

View File

@ -0,0 +1,153 @@
// -------------------------------------------------------------------------------- //
The following steps are for workstations/servers with the SEMS environment installed.
// -------------------------------------------------------------------------------- //
Summary:
- Step 1: Rigorous testing of Kokkos' develop branch for each backend (Serial, OpenMP, Threads, Cuda) with all supported compilers.
- Step 2: Snapshot Kokkos' develop branch into current Trilinos develop branch.
- Step 3: Build and test Trilinos with combinations of compilers, types, backends.
- Step 4: Promote Kokkos develop branch to master if the snapshot does not cause any new tests to fail; else track/fix causes of new failures.
- Step 5: Snapshot Kokkos tagged master branch into Trilinos and push Trilinos.
// -------------------------------------------------------------------------------- //
// -------------------------------------------------------------------------------- //
Step 1:
1.1. Update kokkos develop branch (NOT a fork)
(From kokkos directory):
git fetch --all
git checkout develop
git reset --hard origin/develop
1.2. Create a testing directory - here the directory is created within the kokkos directory
mkdir testing
cd testing
1.3. Run the test_all_sandia script; various compiler and build-list options can be specified
../config/test_all_sandia
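For example, to test just two of the supported compilers, build without running, and use eight parallel jobs (a hypothetical invocation; the available flags are listed in the script's usage message):
../config/test_all_sandia gcc/4.9.2 intel/16.0.1 --build-only --num=8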
1.4. Clean the repository of untracked files
cd ../
git clean -df
// -------------------------------------------------------------------------------- //
Step 2:
2.1 Update Trilinos develop branch
(From Trilinos directory):
git checkout develop
git fetch --all
git reset --hard origin/develop
git clean -df
2.2 Snapshot Kokkos into Trilinos - this requires python/2.7.9 and that both Trilinos and Kokkos be clean - no untracked or modified files
module load python/2.7.9
python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
// -------------------------------------------------------------------------------- //
Step 3:
3.1. Build and test Trilinos with three different configurations; a configure-all script is provided in Trilinos and should be modified to test each of the following configurations with the appropriate environment variable(s):
- GCC/4.7.2-OpenMP/Complex
Run tests with the following environment variable:
export OMP_NUM_THREADS=2
- Intel/15.0.2-Serial/NoComplex
- GCC/4.8.4/CUDA/7.5.18-Cuda/Serial/NoComplex
Run tests with the following environment variables:
export CUDA_LAUNCH_BLOCKING=1
export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
mkdir Build
cd Build
cp TRILINOS_PATH/sampleScripts/Sandia-SEMS/configure-all ./
** Set the path to Trilinos appropriately within the configure-all script **
source $SEMS_MODULE_ROOT/utils/sems-modules-init.sh kokkos
source configure-all
make -k (-k means "keep going" to get past build errors; -j12 can also be specified to build with 12 threads, for example)
ctest
3.2. Compare the failed test output to the test output on the dashboard (testing.sandia.gov/cdash, select Trilinos); investigate and fix problems if new tests fail after the Kokkos snapshot
// -------------------------------------------------------------------------------- //
Step 4:
4.1. Once all Trilinos tests pass, promote the Kokkos develop branch to master on GitHub
- DO NOT fast-forward the merge!!!!
(From kokkos directory):
git checkout master
git fetch --all
# Ensure we are on the current origin/master
git reset --hard origin/master
git merge --no-ff origin/develop
4.2. Update the tag in kokkos/config/master_history.txt
Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate
Tag format: #.#.##
# Prepend master_history.txt with
# tag: #.#.##
# date: mm/dd/yyyy
# master: sha1
# develop: sha1
# -----------------------
git commit --amend -a
git tag -a #.#.##
tag: #.#.##
date: mm/dd/yyyy
master: sha1
develop: sha1
git push --follow-tags origin master
// -------------------------------------------------------------------------------- //
Step 5:
5.1. Make sure Trilinos is up-to-date - chances are other changes have been committed since the integration testing process began. If a substantial change has occurred that may be affected by the snapshot, the testing procedure may need to be repeated
(From Trilinos directory):
git checkout develop
git fetch --all
git reset --hard origin/develop
git clean -df
5.2. Snapshot Kokkos master branch into Trilinos
(From kokkos directory):
git fetch --all
git checkout tags/#.#.##
git clean -df
python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
5.3. Push the updated develop branch of Trilinos to GitHub - congratulations!!!
(From Trilinos directory):
git push
// -------------------------------------------------------------------------------- //

View File

@ -0,0 +1,3 @@
tag: 2.01.00 date: 07:21:2016 master: xxxxxxxx develop: fa6dfcc4
tag: 2.01.06 date: 09:02:2016 master: 9afaa87f develop: 555f1a3a

View File

@ -1,17 +1,12 @@
#!/bin/bash #!/bin/bash
# #
# This shell script (nvcc_wrapper) wraps both the host compiler and # This shell script (nvcc_wrapper) wraps both the host compiler and
# NVCC, if you are building Trilinos with CUDA enabled. The script # NVCC, if you are building legacy C or C++ code with CUDA enabled.
# remedies some differences between the interface of NVCC and that of # The script remedies some differences between the interface of NVCC
# the host compiler, in particular for linking. It also means that # and that of the host compiler, in particular for linking.
# Trilinos doesn't need separate .cu files; it can just use .cpp # It also means that a legacy code doesn't need separate .cu files;
# files. # it can just use .cpp files.
# #
# Hopefully, at some point, NVIDIA may fix NVCC so as to make this
# script obsolete. For now, this script exists and if you want to
# build Trilinos with CUDA enabled, you must use this script as your
# compiler.
# Default settings: change those according to your machine. For # Default settings: change those according to your machine. For
# example, you may have two different wrappers with either icpc # example, you may have two different wrappers with either icpc
# or g++ as their back-end compiler. The defaults can be overwritten # or g++ as their back-end compiler. The defaults can be overwritten
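# A minimal usage sketch (hypothetical file names; assumes nvcc and a host
# compiler are on the PATH; -O and -arch flags are forwarded appropriately
# to nvcc and the host compiler):
#   nvcc_wrapper -O3 -arch=sm_35 -c example.cpp -o example.o
#   nvcc_wrapper -O3 example.o -o example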
@ -53,6 +48,10 @@ object_files=""
# Link objects for the host linker only # Link objects for the host linker only
object_files_xlinker="" object_files_xlinker=""
# Shared libraries with version numbers are not handled correctly by NVCC
shared_versioned_libraries_host=""
shared_versioned_libraries=""
# Does the User set the architecture # Does the User set the architecture
arch_set=0 arch_set=0
@ -76,6 +75,9 @@ first_xcompiler_arg=1
temp_dir=${TMPDIR:-/tmp} temp_dir=${TMPDIR:-/tmp}
# Check if we have an optimization argument already
optimization_applied=0
#echo "Arguments: $# $@" #echo "Arguments: $# $@"
while [ $# -gt 0 ] while [ $# -gt 0 ]
@ -97,8 +99,17 @@ do
*.cpp|*.cxx|*.cc|*.C|*.c++|*.cu) *.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
cpp_files="$cpp_files $1" cpp_files="$cpp_files $1"
;; ;;
# Ensure we only have one optimization flag because NVCC doesn't allow multiple
-O*)
if [ $optimization_applied -eq 1 ]; then
echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the first is used because nvcc can only accept a single optimization setting."
else
shared_args="$shared_args $1"
optimization_applied=1
fi
;;
#Handle shared args (valid for both nvcc and the host compiler) #Handle shared args (valid for both nvcc and the host compiler)
-O*|-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared) -D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
shared_args="$shared_args $1" shared_args="$shared_args $1"
;; ;;
#Handle shared args that have an argument #Handle shared args that have an argument
@ -107,7 +118,7 @@ do
shift shift
;; ;;
#Handle known nvcc args #Handle known nvcc args
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage) -gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
cuda_args="$cuda_args $1" cuda_args="$cuda_args $1"
;; ;;
#Handle known nvcc args that have an argument #Handle known nvcc args that have an argument
@ -175,10 +186,15 @@ do
object_files_xlinker="$object_files_xlinker -Xlinker $1" object_files_xlinker="$object_files_xlinker -Xlinker $1"
;; ;;
#Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking #Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
*.so.*|*.dylib) *.dylib)
object_files="$object_files -Xlinker $1" object_files="$object_files -Xlinker $1"
object_files_xlinker="$object_files_xlinker -Xlinker $1" object_files_xlinker="$object_files_xlinker -Xlinker $1"
;; ;;
#Handle shared libraries with *.so.* names which nvcc can't do.
*.so.*)
shared_versioned_libraries_host="$shared_versioned_libraries_host $1"
shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1"
;;
#All other args are sent to the host compiler #All other args are sent to the host compiler
*) *)
if [ $first_xcompiler_arg -eq 1 ]; then if [ $first_xcompiler_arg -eq 1 ]; then
@ -204,13 +220,13 @@ if [ $arch_set -ne 1 ]; then
fi fi
#Compose compilation command #Compose compilation command
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args" nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
if [ $first_xcompiler_arg -eq 0 ]; then if [ $first_xcompiler_arg -eq 0 ]; then
nvcc_command="$nvcc_command -Xcompiler $xcompiler_args" nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
fi fi
#Compose host only command #Compose host only command
host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args" host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host"
#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING' #nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING'
if [ $replace_pragma_ident -eq 1 ]; then if [ $replace_pragma_ident -eq 1 ]; then

View File

@ -6,34 +6,36 @@
set -o pipefail set -o pipefail
# Determine current machine
MACHINE=""
HOSTNAME=$(hostname)
if [[ "$HOSTNAME" =~ (white|ride).* ]]; then
MACHINE=white
elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then
MACHINE=bowman
elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
MACHINE=shepard
elif [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
MACHINE=sems
else
echo "Unrecognized machine" >&2
exit 1
fi
GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial"
GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized"
IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
CUDA_WARNING_FLAGS="" CUDA_WARNING_FLAGS=""
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base" # Defaults; machine-specific config below can override
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
export OMP_NUM_THREADS=4
declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
source /projects/modulefiles/utils/sems-modules-init.sh
source /projects/modulefiles/utils/kokkos-modules-init.sh
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
#
# Handle arguments
#
DEBUG=False DEBUG=False
ARGS="" ARGS=""
CUSTOM_BUILD_LIST="" CUSTOM_BUILD_LIST=""
@ -41,6 +43,107 @@ DRYRUN=False
BUILD_ONLY=False BUILD_ONLY=False
declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3 declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
TEST_SCRIPT=False TEST_SCRIPT=False
SKIP_HWLOC=False
ARCH_FLAG=""
#
# Machine specific config
#
if [ "$MACHINE" = "sems" ]; then
source /projects/modulefiles/utils/sems-modules-init.sh
source /projects/modulefiles/utils/kokkos-modules-init.sh
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
elif [ "$MACHINE" = "white" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.9.2"
# Don't do pthread on white
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.9.2 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.3.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
)
ARCH_FLAG="--arch=Power8"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
elif [ "$MACHINE" = "bowman" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
)
ARCH_FLAG="--arch=KNL"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
elif [ "$MACHINE" = "shepard" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
)
ARCH_FLAG="--arch=HSW"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
else
echo "Unhandled machine $MACHINE" >&2
exit 1
fi
export OMP_NUM_THREADS=4
declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
#
# Handle arguments
#
while [[ $# > 0 ]] while [[ $# > 0 ]]
do do
@ -61,6 +164,9 @@ BUILD_ONLY=True
--test-script*) --test-script*)
TEST_SCRIPT=True TEST_SCRIPT=True
;; ;;
--skip-hwloc*)
SKIP_HWLOC=True
;;
--num*) --num*)
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
;; ;;
@ -73,6 +179,7 @@ echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
echo " Defaults to root repo containing this script" echo " Defaults to root repo containing this script"
echo "--debug: Run tests in debug. Defaults to False" echo "--debug: Run tests in debug. Defaults to False"
echo "--test-script: Test this script, not Kokkos" echo "--test-script: Test this script, not Kokkos"
echo "--skip-hwloc: Do not do hwloc tests"
echo "--num=N: Number of jobs to run in parallel " echo "--num=N: Number of jobs to run in parallel "
echo "--dry-run: Just print what would be executed" echo "--dry-run: Just print what would be executed"
echo "--build-only: Just do builds, don't run anything" echo "--build-only: Just do builds, don't run anything"
@ -82,21 +189,16 @@ echo " Valid items:"
echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
echo "" echo ""
echo "ARGS: list of expressions matching compilers to test" echo "ARGS: list of expressions matching compilers to test"
echo " supported compilers" echo " supported compilers sems"
echo " gcc/4.7.2" for COMPILER_DATA in "${COMPILERS[@]}"; do
echo " gcc/4.8.4" ARR=($COMPILER_DATA)
echo " gcc/4.9.2" COMPILER=${ARR[0]}
echo " gcc/5.1.0" echo " $COMPILER"
echo " intel/14.0.4" done
echo " intel/15.0.2"
echo " intel/16.0.1"
echo " clang/3.5.2"
echo " clang/3.6.1"
echo " cuda/6.5.14"
echo " cuda/7.0.28"
echo " cuda/7.5.18"
echo "" echo ""
echo "Examples:" echo "Examples:"
echo " Run all tests" echo " Run all tests"
echo " % test_all_sandia" echo " % test_all_sandia"
@ -147,21 +249,6 @@ if [ -z "$ARGS" ]; then
ARGS='?' ARGS='?'
fi fi
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
# Process args to figure out which compilers to test # Process args to figure out which compilers to test
COMPILERS_TO_TEST="" COMPILERS_TO_TEST=""
for ARG in $ARGS; do for ARG in $ARGS; do
@ -240,18 +327,19 @@ run_cmd() {
fi fi
} }
# report_and_log_test_result <SUCCESS> <DESC> <PHASE> # report_and_log_test_result <SUCCESS> <DESC> <COMMENT>
report_and_log_test_result() { report_and_log_test_result() {
# Use sane var names # Use sane var names
local success=$1; local desc=$2; local phase=$3; local success=$1; local desc=$2; local comment=$3;
if [ "$success" = "0" ]; then if [ "$success" = "0" ]; then
echo " PASSED $desc" echo " PASSED $desc"
touch $PASSED_DIR/$desc echo $comment > $PASSED_DIR/$desc
else else
# For failures, comment should be the name of the phase that failed
echo " FAILED $desc" >&2 echo " FAILED $desc" >&2
echo $phase > $FAILED_DIR/$desc echo $comment > $FAILED_DIR/$desc
cat ${desc}.${phase}.log cat ${desc}.${comment}.log
fi fi
} }
@ -309,6 +397,8 @@ single_build_and_test() {
echo " Starting job $desc" echo " Starting job $desc"
local comment="no_comment"
if [ "$TEST_SCRIPT" = "True" ]; then if [ "$TEST_SCRIPT" = "True" ]; then
local rand=$[ 1 + $[ RANDOM % 10 ]] local rand=$[ 1 + $[ RANDOM % 10 ]]
sleep $rand sleep $rand
@ -316,14 +406,19 @@ single_build_and_test() {
run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
fi fi
else else
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
local -i build_start_time=$(date +%s)
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
local -i build_end_time=$(date +%s)
comment="build_time=$(($build_end_time-$build_start_time))"
if [[ "$BUILD_ONLY" == False ]]; then if [[ "$BUILD_ONLY" == False ]]; then
run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
local -i run_end_time=$(date +%s)
comment="$comment run_time=$(($run_end_time-$build_end_time))"
fi fi
fi fi
report_and_log_test_result 0 $desc report_and_log_test_result 0 $desc "$comment"
return 0 return 0
} }
@ -374,7 +469,7 @@ build_and_test_all() {
run_in_background $compiler $build $BUILD_TYPE run_in_background $compiler $build $BUILD_TYPE
# If not cuda, do a hwloc test too # If not cuda, do a hwloc test too
if [[ "$compiler" != cuda* ]]; then if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
run_in_background $compiler $build "hwloc-$BUILD_TYPE" run_in_background $compiler $build "hwloc-$BUILD_TYPE"
fi fi
done done
@ -401,7 +496,11 @@ wait_summarize_and_exit() {
echo "PASSED TESTS" echo "PASSED TESTS"
echo "#######################################################" echo "#######################################################"
\ls -1 $PASSED_DIR | sort local passed_test
for passed_test in $(\ls -1 $PASSED_DIR | sort)
do
echo $passed_test $(cat $PASSED_DIR/$passed_test)
done
echo "#######################################################" echo "#######################################################"
echo "FAILED TESTS" echo "FAILED TESTS"
@ -409,7 +508,7 @@ wait_summarize_and_exit() {
local failed_test local failed_test
local -i rv=0 local -i rv=0
for failed_test in $(\ls -1 $FAILED_DIR) for failed_test in $(\ls -1 $FAILED_DIR | sort)
do do
echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)" echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
rv=$rv+1 rv=$rv+1

View File

@ -16,11 +16,22 @@ IF(Kokkos_ENABLE_OpenMP)
LIST( APPEND SOURCES TestOpenMP.cpp) LIST( APPEND SOURCES TestOpenMP.cpp)
ENDIF() ENDIF()
TRIBITS_ADD_EXECUTABLE_AND_TEST( # Per #374, we always want to build this test, but we only want to run
PerformanceTest # it as a PERFORMANCE test. That's why we separate building the test
# from running the test.
TRIBITS_ADD_EXECUTABLE(
PerfTestExec
SOURCES ${SOURCES} SOURCES ${SOURCES}
COMM serial mpi COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest TESTONLYLIBS kokkos_gtest
) )
TRIBITS_ADD_TEST(
PerformanceTest
NAME PerfTestExec
COMM serial mpi
NUM_MPI_PROCS 1
CATEGORIES PERFORMANCE
FAIL_REGULAR_EXPRESSION " FAILED "
)
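# To actually run the test, enable the PERFORMANCE category at configure
# time; a sketch, assuming a Trilinos build (the variable name follows the
# standard TriBITS <Project>_TEST_CATEGORIES convention):
#   cmake -D Trilinos_TEST_CATEGORIES=PERFORMANCE ${TRILINOS_PATH}
#   ctest -R PerfTest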

View File

@ -54,6 +54,8 @@
#if defined( KOKKOS_HAVE_CUDA ) #if defined( KOKKOS_HAVE_CUDA )
#include <TestDynRankView.hpp>
#include <Kokkos_UnorderedMap.hpp> #include <Kokkos_UnorderedMap.hpp>
#include <TestGlobal2LocalIds.hpp> #include <TestGlobal2LocalIds.hpp>
@ -77,6 +79,13 @@ protected:
} }
}; };
TEST_F( cuda, dynrankview_perf )
{
std::cout << "Cuda" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Cuda>( 4096 );
}
TEST_F( cuda, global_2_local) TEST_F( cuda, global_2_local)
{ {
std::cout << "Cuda" << std::endl; std::cout << "Cuda" << std::endl;

View File

@ -0,0 +1,265 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
#ifndef KOKKOS_TEST_DYNRANKVIEW_HPP
#define KOKKOS_TEST_DYNRANKVIEW_HPP
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>
#include <vector>
#include <impl/Kokkos_Timer.hpp>
// Compare performance of DynRankView to View, specific focus on the parenthesis operators
namespace Performance {
//View functor
template <typename DeviceType>
struct InitViewFunctor {
typedef Kokkos::View<double***, DeviceType> inviewtype;
inviewtype _inview;
InitViewFunctor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k) = i/2 -j*j + k/3;
}
}
}
struct SumComputationTest
{
typedef Kokkos::View<double***, DeviceType> inviewtype;
inviewtype _inview;
typedef Kokkos::View<double*, DeviceType> outviewtype;
outviewtype _outview;
KOKKOS_INLINE_FUNCTION
SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_outview(i) += _inview(i,j,k) ;
}
}
}
};
};
template <typename DeviceType>
struct InitStrideViewFunctor {
typedef Kokkos::View<double***, Kokkos::LayoutStride, DeviceType> inviewtype;
inviewtype _inview;
InitStrideViewFunctor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k) = i/2 -j*j + k/3;
}
}
}
};
template <typename DeviceType>
struct InitViewRank7Functor {
typedef Kokkos::View<double*******, DeviceType> inviewtype;
inviewtype _inview;
InitViewRank7Functor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k,0,0,0,0) = i/2 -j*j + k/3;
}
}
}
};
//DynRankView functor
template <typename DeviceType>
struct InitDynRankViewFunctor {
typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
inviewtype _inview;
InitDynRankViewFunctor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k) = i/2 -j*j + k/3;
}
}
}
struct SumComputationTest
{
typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
inviewtype _inview;
typedef Kokkos::DynRankView<double, DeviceType> outviewtype;
outviewtype _outview;
KOKKOS_INLINE_FUNCTION
SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_outview(i) += _inview(i,j,k) ;
}
}
}
};
};
template <typename DeviceType>
void test_dynrankview_op_perf( const int par_size )
{
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type;
const size_type dim2 = 900;
const size_type dim3 = 300;
double elapsed_time_view = 0;
double elapsed_time_compview = 0;
double elapsed_time_strideview = 0;
double elapsed_time_view_rank7 = 0;
double elapsed_time_drview = 0;
double elapsed_time_compdrview = 0;
Kokkos::Timer timer;
{
Kokkos::View<double***,DeviceType> testview("testview",par_size,dim2,dim3);
typedef InitViewFunctor<DeviceType> FunctorType;
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testview) );
DeviceType::fence();
elapsed_time_view = timer.seconds();
std::cout << " View time (init only): " << elapsed_time_view << std::endl;
timer.reset();
Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
DeviceType::fence();
elapsed_time_compview = timer.seconds();
std::cout << " View sum computation time: " << elapsed_time_view << std::endl;
Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL);
typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;
timer.reset();
Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
DeviceType::fence();
elapsed_time_strideview = timer.seconds();
std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
}
{
Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim2,dim3,1,1,1,1);
typedef InitViewRank7Functor<DeviceType> FunctorType;
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testview) );
DeviceType::fence();
elapsed_time_view_rank7 = timer.seconds();
std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
}
{
Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim2,dim3);
typedef InitDynRankViewFunctor<DeviceType> FunctorType;
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testdrview) );
DeviceType::fence();
elapsed_time_drview = timer.seconds();
std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;
timer.reset();
Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
DeviceType::fence();
elapsed_time_compdrview = timer.seconds();
std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;
}
std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1
std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1
std::cout << " Ratio of View to View Rank7 time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1
std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1
std::cout << " Ratio of DynRankView to View Rank7 time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ?
timer.reset();
} //end test_dynrankview
} //end Performance
#endif
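For orientation, a minimal standalone driver for this benchmark could look like the sketch below. It is not part of this commit; it assumes the header above is on the include path and that a host execution space is enabled.

// Hypothetical driver, not part of this commit.
#include <Kokkos_Core.hpp>
#include <TestDynRankView.hpp>

int main( int argc , char* argv[] )
{
  Kokkos::initialize( argc , argv );
  // Run the same comparison the unit tests run, on the default host space.
  Performance::test_dynrankview_op_perf<Kokkos::DefaultHostExecutionSpace>( 4096 );
  Kokkos::finalize();
  return 0;
}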


@@ -178,7 +178,7 @@ void test_global_to_local_ids(unsigned num_ids)
  std::cout << num_ids << ", ";

  double elasped_time = 0;
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;

  local_id_view local_2_global("local_ids", num_ids);
  global_id_view global_2_local((3u*num_ids)/2u);


@@ -50,6 +50,8 @@
#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>

+#include <TestDynRankView.hpp>
#include <iomanip>
#include <sstream>
#include <string>
@@ -91,6 +93,13 @@ protected:
  }
};
TEST_F( openmp, dynrankview_perf )
{
std::cout << "OpenMP" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::OpenMP>( 8192 );
}
TEST_F( openmp, global_2_local)
{
  std::cout << "OpenMP" << std::endl;


@@ -52,6 +52,8 @@
#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>

+#include <TestDynRankView.hpp>
#include <iomanip>
#include <sstream>
#include <string>
@@ -85,6 +87,13 @@ protected:
  }
};
TEST_F( threads, dynrankview_perf )
{
std::cout << "Threads" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Threads>( 8192 );
}
TEST_F( threads, global_2_local)
{
  std::cout << "Threads" << std::endl;


@@ -80,7 +80,7 @@ struct UnorderedMapTest
  , map(capacity)
  , histogram(map.get_histogram())
{
-  Kokkos::Impl::Timer wall_clock ;
+  Kokkos::Timer wall_clock ;
  wall_clock.reset();

  value_type v = {};
@@ -228,7 +228,7 @@ void run_performance_tests(std::string const & base_file_name)
  distance_out << "\b\b\b " << std::endl;
  block_distance_out << "\b\b\b " << std::endl;

-  Kokkos::Impl::Timer wall_clock ;
+  Kokkos::Timer wall_clock ;

  for (int i=0; i < num_collisions ; ++i) {
    wall_clock.reset();
    std::cout << "Collisions: " << collisions[i] << std::endl;

File diff suppressed because it is too large.


@@ -77,10 +77,7 @@ private:
public:

-  typedef Kokkos::Experimental::MemoryPool
-    < typename traits::memory_space
-    , typename traits::execution_space
-    > memory_pool ;
+  typedef Kokkos::Experimental::MemoryPool< typename traits::device_type > memory_pool ;

private:

@@ -338,7 +335,7 @@
  void operator()( unsigned i ) const
    {
      if ( m_destroy && i < m_chunk_max && 0 != m_chunks[i] ) {
-        m_pool.deallocate( m_chunks[i] , m_pool.get_min_chunk_size() );
+        m_pool.deallocate( m_chunks[i] , m_pool.get_min_block_size() );
      }
      m_chunks[i] = 0 ;
    }
@@ -397,7 +394,7 @@
  // The memory pool chunk is guaranteed to be a power of two
  , m_chunk_shift(
      Kokkos::Impl::integral_power_of_two(
-        m_pool.get_min_chunk_size()/sizeof(typename traits::value_type)) )
+        m_pool.get_min_block_size()/sizeof(typename traits::value_type)) )
  , m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
  , m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
  {


@@ -45,6 +45,7 @@
#define KOKKOS_BITSET_IMPL_HPP

#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_BitOps.hpp>

#include <stdint.h>
#include <cstdio>
@@ -52,82 +53,17 @@
#include <iostream>
#include <iomanip>

-namespace Kokkos { namespace Impl {
+namespace Kokkos {
+namespace Impl {

KOKKOS_FORCEINLINE_FUNCTION
-unsigned rotate_right(unsigned i, int r)
+unsigned rotate_right( unsigned i, int r )
{
-  enum { size = static_cast<int>(sizeof(unsigned)*CHAR_BIT) };
-  return r ? ((i >> r) | (i << (size-r))) : i ;
+  enum { size = static_cast<int>( sizeof(unsigned) * CHAR_BIT ) };
+  return r ? ( ( i >> r ) | ( i << ( size - r ) ) ) : i ;
}

-KOKKOS_FORCEINLINE_FUNCTION
+template < typename Bitset >
int bit_scan_forward(unsigned i)
{
#if defined( __CUDA_ARCH__ )
return __ffs(i) - 1;
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#elif defined( __INTEL_COMPILER )
return _bit_scan_forward(i);
#else
unsigned t = 1u;
int r = 0;
while (i && (i & t == 0))
{
t = t << 1;
++r;
}
return r;
#endif
}
KOKKOS_FORCEINLINE_FUNCTION
int bit_scan_reverse(unsigned i)
{
enum { shift = static_cast<int>(sizeof(unsigned)*CHAR_BIT - 1) };
#if defined( __CUDA_ARCH__ )
return shift - __clz(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return shift - __builtin_clz(i);
#elif defined( __INTEL_COMPILER )
return _bit_scan_reverse(i);
#else
unsigned t = 1u << shift;
int r = 0;
while (i && (i & t == 0))
{
t = t >> 1;
++r;
}
return r;
#endif
}
// count the bits set
KOKKOS_FORCEINLINE_FUNCTION
int popcount(unsigned i)
{
#if defined( __CUDA_ARCH__ )
return __popc(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#elif defined ( __INTEL_COMPILER )
return _popcnt32(i);
#else
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
i = i - ((i >> 1) & ~0u/3u); // temp
i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u); // temp
i = (i + (i >> 4)) & ~0u/255u*15u; // temp
return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT); // count
#endif
}
-template <typename Bitset>
struct BitsetCount
{
  typedef Bitset bitset_type;
@@ -137,37 +73,37 @@ struct BitsetCount

  bitset_type m_bitset;

-  BitsetCount( bitset_type const& bitset)
+  BitsetCount( bitset_type const& bitset )
    : m_bitset(bitset)
  {}

  size_type apply() const
  {
    size_type count = 0u;
-    parallel_reduce(m_bitset.m_blocks.dimension_0(), *this, count);
+    parallel_reduce( m_bitset.m_blocks.dimension_0(), *this, count );
    return count;
  }

  KOKKOS_INLINE_FUNCTION
-  static void init( value_type & count)
+  void init( value_type & count ) const
  {
    count = 0u;
  }

  KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & count, const volatile size_type & incr )
+  void join( volatile value_type & count, const volatile size_type & incr ) const
  {
    count += incr;
  }

  KOKKOS_INLINE_FUNCTION
-  void operator()( size_type i, value_type & count) const
+  void operator()( size_type i, value_type & count ) const
  {
-    count += popcount(m_bitset.m_blocks[i]);
+    count += bit_count( m_bitset.m_blocks[i] );
  }
};

-}} //Kokkos::Impl
+} // namespace Impl
+} // namespace Kokkos

#endif // KOKKOS_BITSET_IMPL_HPP
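Since the bit-counting helpers above moved into impl/Kokkos_BitOps.hpp (the call site now uses bit_count instead of popcount), here is a self-contained sanity check of the portable SWAR counting arithmetic from the removed fallback branch; it is an illustration, not code from this commit.

#include <cassert>
#include <climits>

// Same arithmetic as the removed portable popcount branch above.
int popcount_fallback(unsigned i)
{
  i = i - ((i >> 1) & ~0u/3u);
  i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u);
  i = (i + (i >> 4)) & ~0u/255u*15u;
  return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT);
}

int main()
{
  assert( popcount_fallback(0u) == 0 );
  assert( popcount_fallback(0xFu) == 4 );        // 0b1111 has four set bits
  assert( popcount_fallback(0x80000001u) == 2 ); // highest and lowest bit set
  return 0;
}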


@@ -713,13 +713,20 @@ public:
  typedef Kokkos::Experimental::DynRankView< const T , device > const_dView0 ;
  typedef Kokkos::Experimental::DynRankView< T, device, Kokkos::MemoryUnmanaged > dView0_unmanaged ;
-  typedef typename dView0::host_mirror_space host ;
+  typedef typename dView0::host_mirror_space host_drv_space ;
+
+  typedef Kokkos::Experimental::View< T , device > View0 ;
+  typedef Kokkos::Experimental::View< T* , device > View1 ;
+  typedef Kokkos::Experimental::View< T******* , device > View7 ;
+
+  typedef typename View0::host_mirror_space host_view_space ;

  TestDynViewAPI()
  {
+    run_test_resize_realloc();
    run_test_mirror();
-    run_test();
    run_test_scalar();
+    run_test();
    run_test_const();
    run_test_subview();
    run_test_subview_strided();
@@ -735,19 +742,147 @@ public:
    TestViewOperator_LeftAndRight< int , device , 1 >::testit(2);
  }
static void run_test_resize_realloc()
{
dView0 drv0("drv0", 10, 20, 30);
ASSERT_EQ( drv0.rank(), 3);
Kokkos::Experimental::resize(drv0, 5, 10);
ASSERT_EQ( drv0.rank(), 2);
ASSERT_EQ( drv0.dimension_0(), 5);
ASSERT_EQ( drv0.dimension_1(), 10);
ASSERT_EQ( drv0.dimension_2(), 1);
Kokkos::Experimental::realloc(drv0, 10, 20);
ASSERT_EQ( drv0.rank(), 2);
ASSERT_EQ( drv0.dimension_0(), 10);
ASSERT_EQ( drv0.dimension_1(), 20);
ASSERT_EQ( drv0.dimension_2(), 1);
}
  static void run_test_mirror()
  {
-    typedef Kokkos::Experimental::DynRankView< int , host > view_type ;
+    typedef Kokkos::Experimental::DynRankView< int , host_drv_space > view_type ;
    typedef typename view_type::HostMirror mirror_type ;
    view_type a("a");
    mirror_type am = Kokkos::Experimental::create_mirror_view(a);
    mirror_type ax = Kokkos::Experimental::create_mirror(a);
    ASSERT_EQ( & a() , & am() );
ASSERT_EQ( a.rank() , am.rank() );
ASSERT_EQ( ax.rank() , am.rank() );
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = (a_h.data() ==a_h2.data())?1:0;
int equal_ptr_h_d = (a_h.data() ==a_d. data())?1:0;
int equal_ptr_h2_d = (a_h2.data()==a_d. data())?1:0;
ASSERT_EQ(equal_ptr_h_h2,0);
ASSERT_EQ(equal_ptr_h_d ,0);
ASSERT_EQ(equal_ptr_h2_d,0);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = (a_h.data() ==a_h2.data())?1:0;
int equal_ptr_h_d = (a_h.data() ==a_d. data())?1:0;
int equal_ptr_h2_d = (a_h2.data()==a_d. data())?1:0;
ASSERT_EQ(equal_ptr_h_h2,0);
ASSERT_EQ(equal_ptr_h_d ,0);
ASSERT_EQ(equal_ptr_h2_d,0);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
ASSERT_EQ(equal_ptr_h_h2,1);
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
ASSERT_EQ(equal_ptr_h_h2,1);
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
typedef Kokkos::DynRankView< int , Kokkos::LayoutStride , Kokkos::HostSpace > view_stride_type ;
unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
view_stride_type a_h( "a" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
ASSERT_EQ(equal_ptr_h_h2,1);
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
  }

  static void run_test_scalar()
  {
-    typedef typename dView0::HostMirror hView0 ;
+    typedef typename dView0::HostMirror hView0 ; //HostMirror of DynRankView is a DynRankView

    dView0 dx , dy ;
    hView0 hx , hy ;
@@ -765,6 +900,79 @@ public:
    Kokkos::Experimental::deep_copy( hy , dy );
    ASSERT_EQ( hx(), hy() );
ASSERT_EQ( dx.rank() , hx.rank() );
ASSERT_EQ( dy.rank() , hy.rank() );
//View - DynRankView Interoperability tests
// deep_copy DynRankView to View
View0 vx("vx");
Kokkos::deep_copy( vx , dx );
ASSERT_EQ( rank(dx) , rank(vx) );
View0 vy("vy");
Kokkos::deep_copy( vy , dy );
ASSERT_EQ( rank(dy) , rank(vy) );
// deep_copy View to DynRankView
dView0 dxx("dxx");
Kokkos::deep_copy( dxx , vx );
ASSERT_EQ( rank(dxx) , rank(vx) );
View7 vcast = dx.ConstDownCast();
ASSERT_EQ( dx.dimension_0() , vcast.dimension_0() );
ASSERT_EQ( dx.dimension_1() , vcast.dimension_1() );
ASSERT_EQ( dx.dimension_2() , vcast.dimension_2() );
ASSERT_EQ( dx.dimension_3() , vcast.dimension_3() );
ASSERT_EQ( dx.dimension_4() , vcast.dimension_4() );
View7 vcast1( dy.ConstDownCast() );
ASSERT_EQ( dy.dimension_0() , vcast1.dimension_0() );
ASSERT_EQ( dy.dimension_1() , vcast1.dimension_1() );
ASSERT_EQ( dy.dimension_2() , vcast1.dimension_2() );
ASSERT_EQ( dy.dimension_3() , vcast1.dimension_3() );
ASSERT_EQ( dy.dimension_4() , vcast1.dimension_4() );
//View - DynRankView Interoperability tests
// copy View to DynRankView
dView0 dfromvx( vx );
auto hmx = Kokkos::create_mirror_view(dfromvx) ;
Kokkos::deep_copy(hmx , dfromvx);
auto hvx = Kokkos::create_mirror_view(vx) ;
Kokkos::deep_copy(hvx , vx);
ASSERT_EQ( rank(hvx) , rank(hmx) );
ASSERT_EQ( hvx.dimension_0() , hmx.dimension_0() );
ASSERT_EQ( hvx.dimension_1() , hmx.dimension_1() );
// copy-assign View to DynRankView
dView0 dfromvy = vy ;
auto hmy = Kokkos::create_mirror_view(dfromvy) ;
Kokkos::deep_copy(hmy , dfromvy);
auto hvy = Kokkos::create_mirror_view(vy) ;
Kokkos::deep_copy(hvy , vy);
ASSERT_EQ( rank(hvy) , rank(hmy) );
ASSERT_EQ( hvy.dimension_0() , hmy.dimension_0() );
ASSERT_EQ( hvy.dimension_1() , hmy.dimension_1() );
View7 vtest1("vtest1",2,2,2,2,2,2,2);
dView0 dfromv1( vtest1 );
ASSERT_EQ( dfromv1.rank() , vtest1.Rank );
ASSERT_EQ( dfromv1.dimension_0() , vtest1.dimension_0() );
ASSERT_EQ( dfromv1.dimension_1() , vtest1.dimension_1() );
ASSERT_EQ( dfromv1.use_count() , vtest1.use_count() );
dView0 dfromv2( vcast );
ASSERT_EQ( dfromv2.rank() , vcast.Rank );
ASSERT_EQ( dfromv2.dimension_0() , vcast.dimension_0() );
ASSERT_EQ( dfromv2.dimension_1() , vcast.dimension_1() );
ASSERT_EQ( dfromv2.use_count() , vcast.use_count() );
dView0 dfromv3 = vcast1;
ASSERT_EQ( dfromv3.rank() , vcast1.Rank );
ASSERT_EQ( dfromv3.dimension_0() , vcast1.dimension_0() );
ASSERT_EQ( dfromv3.dimension_1() , vcast1.dimension_1() );
ASSERT_EQ( dfromv3.use_count() , vcast1.use_count() );
  }

  static void run_test()
@@ -782,22 +990,32 @@ public:
      (void) thing;
    }
dView0 d_uninitialized(Kokkos::ViewAllocateWithoutInitializing("uninit"),10,20);
ASSERT_TRUE( d_uninitialized.data() != nullptr );
ASSERT_EQ( d_uninitialized.rank() , 2 );
ASSERT_EQ( d_uninitialized.dimension_0() , 10 );
ASSERT_EQ( d_uninitialized.dimension_1() , 20 );
ASSERT_EQ( d_uninitialized.dimension_2() , 1 );
    dView0 dx , dy , dz ;
    hView0 hx , hy , hz ;

-    ASSERT_TRUE( dx.ptr_on_device() == 0 );
-    ASSERT_TRUE( dy.ptr_on_device() == 0 );
-    ASSERT_TRUE( dz.ptr_on_device() == 0 );
+    ASSERT_TRUE( Kokkos::Experimental::is_dyn_rank_view<dView0>::value );
+    ASSERT_FALSE( Kokkos::Experimental::is_dyn_rank_view< Kokkos::View<double> >::value );
+
+    ASSERT_TRUE( dx.ptr_on_device() == 0 ); //Okay with UVM
+    ASSERT_TRUE( dy.ptr_on_device() == 0 ); //Okay with UVM
+    ASSERT_TRUE( dz.ptr_on_device() == 0 ); //Okay with UVM
    ASSERT_TRUE( hx.ptr_on_device() == 0 );
    ASSERT_TRUE( hy.ptr_on_device() == 0 );
    ASSERT_TRUE( hz.ptr_on_device() == 0 );
-    ASSERT_EQ( dx.dimension_0() , 0u );
-    ASSERT_EQ( dy.dimension_0() , 0u );
-    ASSERT_EQ( dz.dimension_0() , 0u );
+    ASSERT_EQ( dx.dimension_0() , 0u ); //Okay with UVM
+    ASSERT_EQ( dy.dimension_0() , 0u ); //Okay with UVM
+    ASSERT_EQ( dz.dimension_0() , 0u ); //Okay with UVM
    ASSERT_EQ( hx.dimension_0() , 0u );
    ASSERT_EQ( hy.dimension_0() , 0u );
    ASSERT_EQ( hz.dimension_0() , 0u );
-    ASSERT_EQ( dx.rank() , 0u );
+    ASSERT_EQ( dx.rank() , 0u ); //Okay with UVM
    ASSERT_EQ( hx.rank() , 0u );

    dx = dView0( "dx" , N1 , N2 , N3 );
@@ -806,11 +1024,11 @@ public:
    hx = hView0( "hx" , N1 , N2 , N3 );
    hy = hView0( "hy" , N1 , N2 , N3 );

-    ASSERT_EQ( dx.dimension_0() , unsigned(N1) );
-    ASSERT_EQ( dy.dimension_0() , unsigned(N1) );
+    ASSERT_EQ( dx.dimension_0() , unsigned(N1) ); //Okay with UVM
+    ASSERT_EQ( dy.dimension_0() , unsigned(N1) ); //Okay with UVM
    ASSERT_EQ( hx.dimension_0() , unsigned(N1) );
    ASSERT_EQ( hy.dimension_0() , unsigned(N1) );
-    ASSERT_EQ( dx.rank() , 3 );
+    ASSERT_EQ( dx.rank() , 3 ); //Okay with UVM
    ASSERT_EQ( hx.rank() , 3 );

    dx = dView0( "dx" , N0 , N1 , N2 , N3 );
@@ -823,19 +1041,23 @@ public:
    ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
    ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
    ASSERT_EQ( dx.rank() , 4 );
+    ASSERT_EQ( dy.rank() , 4 );
    ASSERT_EQ( hx.rank() , 4 );
+    ASSERT_EQ( hy.rank() , 4 );

    ASSERT_EQ( dx.use_count() , size_t(1) );

    dView0_unmanaged unmanaged_dx = dx;
    ASSERT_EQ( dx.use_count() , size_t(1) );

    dView0_unmanaged unmanaged_from_ptr_dx = dView0_unmanaged(dx.ptr_on_device(),
                                                              dx.dimension_0(),
                                                              dx.dimension_1(),
                                                              dx.dimension_2(),
                                                              dx.dimension_3());

    {
      // Destruction of this view should be harmless
      const_dView0 unmanaged_from_ptr_const_dx( dx.ptr_on_device() ,
@@ -888,6 +1110,19 @@ public:
    hx = Kokkos::Experimental::create_mirror( dx );
    hy = Kokkos::Experimental::create_mirror( dy );
ASSERT_EQ( hx.rank() , dx.rank() );
ASSERT_EQ( hy.rank() , dy.rank() );
ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
ASSERT_EQ( hx.dimension_1() , unsigned(N1) );
ASSERT_EQ( hx.dimension_2() , unsigned(N2) );
ASSERT_EQ( hx.dimension_3() , unsigned(N3) );
ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
ASSERT_EQ( hy.dimension_1() , unsigned(N1) );
ASSERT_EQ( hy.dimension_2() , unsigned(N2) );
ASSERT_EQ( hy.dimension_3() , unsigned(N3) );
    // T v1 = hx() ;    // Generates compile error as intended
    // T v2 = hx(0,0) ; // Generates compile error as intended
    // hx(0,0) = v2 ;   // Generates compile error as intended
@@ -990,7 +1225,9 @@ public:
    for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
      { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
    }}}}
//  ASSERT_EQ( hx(0,0,0,0,0,0,0,0) , T(0) ); //Test rank8 op behaves properly - if implemented
    }

    dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz);
    dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz);
@@ -1006,6 +1243,35 @@ public:
    ASSERT_TRUE( dx.ptr_on_device() == 0 );
    ASSERT_TRUE( dy.ptr_on_device() == 0 );
    ASSERT_TRUE( dz.ptr_on_device() == 0 );
//View - DynRankView Interoperability tests
// deep_copy from view to dynrankview
const int testdim = 4;
dView0 dxx("dxx",testdim);
View1 vxx("vxx",testdim);
auto hvxx = Kokkos::create_mirror_view(vxx);
for (int i = 0; i < testdim; ++i)
{ hvxx(i) = i; }
Kokkos::deep_copy(vxx,hvxx);
Kokkos::deep_copy(dxx,vxx);
auto hdxx = Kokkos::create_mirror_view(dxx);
Kokkos::deep_copy(hdxx,dxx);
for (int i = 0; i < testdim; ++i)
{ ASSERT_EQ( hvxx(i) , hdxx(i) ); }
ASSERT_EQ( rank(hdxx) , rank(hvxx) );
ASSERT_EQ( hdxx.dimension_0() , testdim );
ASSERT_EQ( hdxx.dimension_0() , hvxx.dimension_0() );
// deep_copy from dynrankview to view
View1 vdxx("vdxx",testdim);
auto hvdxx = Kokkos::create_mirror_view(vdxx);
Kokkos::deep_copy(hvdxx , hdxx);
ASSERT_EQ( rank(hdxx) , rank(hvdxx) );
ASSERT_EQ( hvdxx.dimension_0() , testdim );
ASSERT_EQ( hdxx.dimension_0() , hvdxx.dimension_0() );
for (int i = 0; i < testdim; ++i)
{ ASSERT_EQ( hvxx(i) , hvdxx(i) ); }
  }

  typedef T DataType ;
@@ -1059,35 +1325,66 @@ public:
      // N0 = 1000,N1 = 3,N2 = 5,N3 = 7
      unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
      sdView d7( "d7" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
+      ASSERT_EQ( d7.rank() , 7 );

-      sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ); //Should be rank0 subview
+      sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 );
+      ASSERT_EQ( ds0.rank() , 0 );

      //Basic test - ALL
-      sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() ); //compiles and runs
+      sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() );
+      ASSERT_EQ( dsALL.rank() , 7 );

-      // Send a single value for one rank
+      // Send a value to final rank returning rank 6 subview
      sdView dsm1 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 1 );
+      ASSERT_EQ( dsm1.rank() , 6 );

-      // Send a std::pair as a rank
+      // Send a std::pair as argument to a rank
      sdView dssp = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , std::pair<unsigned,unsigned>(1,2) );
+      ASSERT_EQ( dssp.rank() , 7 );

-      // Send a kokkos::pair as a rank; take default layout as input
+      // Send a kokkos::pair as argument to a rank; take default layout as input
      dView0 dd0("dd0" , N0 , N1 , N2 , 2 , 2 , 2 , 2 ); //default layout
+      ASSERT_EQ( dd0.rank() , 7 );
      sdView dtkp = Kokkos::Experimental::subdynrankview( dd0 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+      ASSERT_EQ( dtkp.rank() , 7 );

      // Return rank 7 subview, taking a pair as one argument, layout stride input
      sdView ds7 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+      ASSERT_EQ( ds7.rank() , 7 );

      // Default Layout DynRankView
      dView dv6("dv6" , N0 , N1 , N2 , N3 , 2 , 2 );
+      ASSERT_EQ( dv6.rank() , 6 );

      // DynRankView with LayoutRight
      typedef Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , device > drView ;
      drView dr5( "dr5" , N0 , N1 , N2 , 2 , 2 );
+      ASSERT_EQ( dr5.rank() , 5 );
-      // LayoutStride but arranged as LayoutRight
-      unsigned order3[] = { 4,3,2,1,0 }, dimen3[] = { N0, N1, N2, 2, 2 };
-      sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order3, dimen3) );
+      // NOTE: unused arg_layout dimensions must be set to ~size_t(0) so that
+      // rank deduction can properly take place
unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
Kokkos::LayoutStride ls = Kokkos::LayoutStride::order_dimensions(5, order5, dimen5);
ls.dimension[5] = ~size_t(0);
ls.dimension[6] = ~size_t(0);
ls.dimension[7] = ~size_t(0);
sdView d5("d5", ls);
ASSERT_EQ( d5.rank() , 5 );
// LayoutStride arranged as LayoutRight - commented out as example that fails unit test
// unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
// sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order5, dimen5) );
//
// Fails the following unit test:
// ASSERT_EQ( d5.rank() , dr5.rank() );
//
// Explanation: In construction of the Kokkos::LayoutStride below, since the
// remaining dimensions are not specified, they will default to values of 0
// rather than ~size_t(0).
// When passed to the DynRankView constructor the default dimensions (of 0)
// will be counted toward the dynamic rank and returning an incorrect value
// (i.e. rank 7 rather than 5).
      // Check LayoutRight dr5 and LayoutStride d5 dimensions agree (as they should)
      ASSERT_EQ( d5.dimension_0() , dr5.dimension_0() );
@@ -1100,21 +1397,21 @@ public:
      // Rank 5 subview of rank 5 dynamic rank view, layout stride input
      sdView ds5 = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+      ASSERT_EQ( ds5.rank() , 5 );

      // Pass in extra ALL arguments beyond the rank of the DynRank View.
      // This behavior is allowed - ignore the extra ALL arguments when
      // the src.rank() < number of arguments, but be careful!
      sdView ds5plus = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) , Kokkos::ALL() );
+      ASSERT_EQ( ds5.rank() , ds5plus.rank() );
      ASSERT_EQ( ds5.dimension_0() , ds5plus.dimension_0() );
      ASSERT_EQ( ds5.dimension_4() , ds5plus.dimension_4() );
      ASSERT_EQ( ds5.dimension_5() , ds5plus.dimension_5() );
-      ASSERT_EQ( ds5.rank() , ds5plus.rank() );
-      ASSERT_EQ( ds5.rank() , 5 );

#if ! defined( KOKKOS_HAVE_CUDA ) || defined ( KOKKOS_USE_CUDA_UVM )
-      ASSERT_EQ( & ds5(1,1,1,1) - & ds5plus(1,1,1,1) , 0 );
      ASSERT_EQ( & ds5(1,1,1,1,0) - & ds5plus(1,1,1,1,0) , 0 );
-      ASSERT_EQ( & ds5(1,1,1,1,0,0) - & ds5plus(1,1,1,1,0,0) , 0 );
+      // passing argument to rank beyond the view's rank is allowed iff it is a 0.
#endif
      // Similar test to rank 5 above, but create rank 4 subview
@@ -1131,9 +1428,9 @@ public:
  static void run_test_subview_strided()
  {
-    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host > drview_left ;
-    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host > drview_right ;
-    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host > drview_stride ;
+    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host_drv_space > drview_left ;
+    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host_drv_space > drview_right ;
+    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host_drv_space > drview_stride ;

    drview_left  xl2( "xl2", 100 , 200 );
    drview_right xr2( "xr2", 100 , 200 );
@@ -1159,35 +1456,37 @@ public:
    drview_left  xl4( "xl4", 10 , 20 , 30 , 40 );
    drview_right xr4( "xr4", 10 , 20 , 30 , 40 );

-    drview_stride yl4 = Kokkos::Experimental::subdynrankview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
-    drview_stride yr4 = Kokkos::Experimental::subdynrankview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+    //Replace subdynrankview with subview - test
+    drview_stride yl4 = Kokkos::Experimental::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+    drview_stride yr4 = Kokkos::Experimental::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );

    ASSERT_EQ( yl4.dimension_0() , xl4.dimension_1() );
    ASSERT_EQ( yl4.dimension_1() , xl4.dimension_3() );
    ASSERT_EQ( yr4.dimension_0() , xr4.dimension_1() );
    ASSERT_EQ( yr4.dimension_1() , xr4.dimension_3() );
+    ASSERT_EQ( yl4.rank() , 2);
+    ASSERT_EQ( yr4.rank() , 2);

    ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 );
    ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 );
  }
  static void run_test_vector()
  {
    static const unsigned Length = 1000 , Count = 8 ;

-    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host > multivector_type ;
-    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host > multivector_right_type ;
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host_drv_space > multivector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host_drv_space > multivector_right_type ;

    multivector_type mv = multivector_type( "mv" , Length , Count );
    multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count );

-    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > svector_type ;
-    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > smultivector_type ;
-    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_right_type ; //LayoutStride, not right; setup to match original ViewAPI calls... update
-    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_type ;
-    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_smultivector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > svector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > smultivector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_right_type ;
+    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_smultivector_type ;

    svector_type v1 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 0 );
    svector_type v2 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 1 );
@@ -1251,7 +1550,6 @@ public:
    const_smultivector_type cmv( mv );
    typename smultivector_type::const_type cmvX( cmv );
    typename const_smultivector_type::const_type ccmvX( cmv );
  }
};


@@ -61,8 +61,7 @@ struct TestDynamicView
  typedef typename Space::execution_space execution_space ;
  typedef typename Space::memory_space    memory_space ;

-  typedef Kokkos::Experimental::MemoryPool< memory_space , execution_space >
-    memory_pool_type ;
+  typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type;

  typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
@@ -129,11 +128,9 @@ struct TestDynamicView
    typedef Kokkos::TeamPolicy<execution_space,TEST> TestPolicy ;
    typedef Kokkos::TeamPolicy<execution_space,VERIFY> VerifyPolicy ;

-    const unsigned int chunk_size = 1024 ;

// printf("TestDynamicView::run(%d) construct memory pool\n",arg_total_size);
-    memory_pool_type pool( memory_space() , chunk_size , arg_total_size * sizeof(Scalar) );
+    memory_pool_type pool( memory_space() , arg_total_size * sizeof(Scalar) * 1.2 );

// printf("TestDynamicView::run(%d) construct dynamic view\n",arg_total_size);


@@ -34,6 +34,7 @@
#cmakedefine KOKKOS_HAVE_Winthread
#cmakedefine KOKKOS_HAVE_OPENMP
#cmakedefine KOKKOS_HAVE_HWLOC
+#cmakedefine KOKKOS_HAVE_DEBUG
#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
#cmakedefine KOKKOS_HAVE_CXX11
#cmakedefine KOKKOS_HAVE_CUSPARSE


@@ -8,11 +8,22 @@ SET(SOURCES
  PerfTestCuda.cpp
  )

-TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  PerfTest
+# Per #374, we always want to build this test, but we only want to run
+# it as a PERFORMANCE test. That's why we separate building the test
+# from running the test.
+TRIBITS_ADD_EXECUTABLE(
+  PerfTestExec
  SOURCES ${SOURCES}
  COMM serial mpi
-  NUM_MPI_PROCS 1
-  FAIL_REGULAR_EXPRESSION " FAILED "
  TESTONLYLIBS kokkos_gtest
  )

+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  PerfTest
+  NAME PerfTestExec
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  CATEGORIES PERFORMANCE
+  FAIL_REGULAR_EXPRESSION " FAILED "
+  )


@@ -159,7 +159,7 @@ struct TextureFetch
    Kokkos::Cuda::fence();

-    Kokkos::Impl::Timer timer;
+    Kokkos::Timer timer;
    for (int j=0; j<10; ++j) {
      RandomReduce f(array,indexes);
      f.apply(reduce);


@@ -153,7 +153,7 @@ struct ModifiedGramSchmidt
    Kokkos::deep_copy( one , (Scalar) 1 );

-    Kokkos::Impl::Timer timer ;
+    Kokkos::Timer timer ;

    for ( size_type j = 0 ; j < count ; ++j ) {
      // Reduction : tmp = dot( Q(:,j) , Q(:,j) );


@@ -252,7 +252,7 @@ struct HexGrad
  execution_space::fence();

  for ( int i = 0 ; i < iter ; ++i ) {
-    Kokkos::Impl::Timer timer ;
+    Kokkos::Timer timer ;
    Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
    execution_space::fence();
    const double dt = timer.seconds();


@@ -414,24 +414,27 @@ void Loop(int loop, int test, const char* type_name) {
  Kokkos::Impl::Timer timer;
  T res = LoopVariant<T>(loop,test);
-  double time1 = timer.seconds();
+  double time = timer.seconds();

  timer.reset();
  T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
-  double time2 = timer.seconds();
+  double timeNonAtomic = timer.seconds();

  timer.reset();
  T resSerial = LoopVariantSerial<T>(loop,test);
-  double time3 = timer.seconds();
+  double timeSerial = timer.seconds();

-  time1*=1e6/loop;
-  time2*=1e6/loop;
-  time3*=1e6/loop;
+  time         *=1e6/loop;
+  timeNonAtomic*=1e6/loop;
+  timeSerial   *=1e6/loop;

  //textcolor_standard();
  bool passed = true;
  if(resSerial!=res) passed = false;
  //if(!passed) textcolor(RESET,BLACK,YELLOW);
-  printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T));
+  printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",
+         type_name,test,passed?"PASSED":"FAILED",loop,
+         1.0*resSerial,1.0*res,1.0*resNonAtomic,
+         timeSerial,time,timeNonAtomic,(int)sizeof(T));
  //if(!passed) textcolor_standard();
  printf("\n");
}
@@ -452,7 +455,7 @@ void Test(int loop, int test, const char* type_name) {
int main(int argc, char* argv[])
{
  int type = -1;
-  int loop = 1000000;
+  int loop = 100000;
  int test = -1;
  for(int i=0;i<argc;i++)


@@ -124,15 +124,31 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
#endif

+namespace Kokkos {
+namespace Impl {
+  struct CudaLockArraysStruct {
+    int* atomic;
+    int* scratch;
+    int* threadid;
+  };
+}
+}

__device__ __constant__
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
extern
#endif
-int* kokkos_impl_cuda_atomic_lock_array ;
+Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;

#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39

+namespace Kokkos {
+namespace Impl {
+  void* cuda_resize_scratch_space(size_t bytes, bool force_shrink = false);
+}
+}

namespace Kokkos {
namespace Impl {

__device__ inline
@@ -140,8 +156,7 @@ bool lock_address_cuda_space(void* ptr) {
  size_t offset = size_t(ptr);
  offset = offset >> 2;
  offset = offset & CUDA_SPACE_ATOMIC_MASK;
-  //offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
-  return (0 == atomicCAS(&kokkos_impl_cuda_atomic_lock_array[offset],0,1));
+  return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
}

__device__ inline
@@ -149,8 +164,7 @@ void unlock_address_cuda_space(void* ptr) {
  size_t offset = size_t(ptr);
  offset = offset >> 2;
  offset = offset & CUDA_SPACE_ATOMIC_MASK;
-  //offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
-  atomicExch( &kokkos_impl_cuda_atomic_lock_array[ offset ], 0);
+  atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
}

}
@@ -232,8 +246,11 @@ struct CudaParallelLaunch< DriverType , true > {
      cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );

      #ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
-      int* lock_array_ptr = lock_array_cuda_space_ptr();
-      cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
+      Kokkos::Impl::CudaLockArraysStruct locks;
+      locks.atomic = atomic_lock_array_cuda_space_ptr(false);
+      locks.scratch = scratch_lock_array_cuda_space_ptr(false);
+      locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+      cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
      #endif

      // Invoke the driver function on the device
@@ -271,8 +288,11 @@ struct CudaParallelLaunch< DriverType , false > {
      #endif

      #ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
-      int* lock_array_ptr = lock_array_cuda_space_ptr();
-      cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
+      Kokkos::Impl::CudaLockArraysStruct locks;
+      locks.atomic = atomic_lock_array_cuda_space_ptr(false);
+      locks.scratch = scratch_lock_array_cuda_space_ptr(false);
+      locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+      cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
      #endif

      cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );


@@ -51,10 +51,10 @@
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA

+#include <Kokkos_Core.hpp>
#include <Kokkos_Cuda.hpp>
#include <Kokkos_CudaSpace.hpp>
-#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>

#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
@@ -107,68 +107,6 @@ void DeepCopyAsyncCuda( void * dst , const void * src , size_t n) {
namespace Kokkos {
#if ! KOKKOS_USING_EXP_VIEW
namespace {
void texture_object_attach_impl( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
enum { TEXTURE_BOUND_1D = 2u << 27 };
if ( tracker.attribute() == NULL ) {
// check for correct allocator
const bool ok_alloc = tracker.allocator()->support_texture_binding();
const bool ok_count = (tracker.alloc_size() / type_size) < TEXTURE_BOUND_1D;
if (ok_alloc && ok_count) {
Impl::TextureAttribute * attr = new Impl::TextureAttribute( tracker.alloc_ptr(), tracker.alloc_size(), desc );
tracker.set_attribute( attr );
}
else {
std::ostringstream oss;
oss << "Error: Cannot attach texture object";
if (!ok_alloc) {
oss << ", incompatabile allocator " << tracker.allocator()->name();
}
if (!ok_count) {
oss << ", array " << tracker.label() << " too large";
}
oss << ".";
Kokkos::Impl::throw_runtime_exception( oss.str() );
}
}
if ( NULL == dynamic_cast<Impl::TextureAttribute *>(tracker.attribute()) ) {
std::ostringstream oss;
oss << "Error: Allocation " << tracker.label() << " already has an attribute attached.";
Kokkos::Impl::throw_runtime_exception( oss.str() );
}
}
} // unnamed namespace
/*--------------------------------------------------------------------------*/
Impl::AllocationTracker CudaSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
void CudaSpace::texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
texture_object_attach_impl( tracker, type_size, desc );
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
void CudaSpace::access_error()
{
  const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
@@ -183,23 +121,6 @@ void CudaSpace::access_error( const void * const )
/*--------------------------------------------------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
void CudaUVMSpace::texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
texture_object_attach_impl( tracker, type_size, desc );
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
bool CudaUVMSpace::available()
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
@@ -212,15 +133,6 @@ bool CudaUVMSpace::available()
/*--------------------------------------------------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
} // namespace Kokkos

/*--------------------------------------------------------------------------*/
@@ -824,16 +736,26 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
namespace Kokkos {
namespace {

-__global__ void init_lock_array_kernel() {
+__global__ void init_lock_array_kernel_atomic() {
  unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
  if(i<CUDA_SPACE_ATOMIC_MASK+1)
-    kokkos_impl_cuda_atomic_lock_array[i] = 0;
+    kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
}
__global__ void init_lock_array_kernel_scratch_threadid(int N) {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<N) {
kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
}
}
}
namespace Impl {
int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
@ -845,13 +767,60 @@ int* lock_array_cuda_space_ptr(bool deallocate) {
return ptr;
}
void init_lock_array_cuda_space() {
int is_initialized = 0;
if(! is_initialized) {
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
init_lock_array_kernel<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
}
}
int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
void init_lock_arrays_cuda_space() {
static int is_initialized = 0;
if(! is_initialized) {
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
}
}
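For orientation, a minimal host-side sketch of how an arbitrary address is typically folded onto one of the CUDA_SPACE_ATOMIC_MASK+1 atomic lock slots initialized above; the mask value and hash shown here are assumptions for illustration, not the exact mapping used by Kokkos:
#include <cstdint>
#include <cstdio>
// Assumed mask value for illustration only; the real CUDA_SPACE_ATOMIC_MASK
// is defined elsewhere in the Cuda backend.
enum { SPACE_ATOMIC_MASK_SKETCH = 0x1FFFF };
// Fold an address into a lock index in [0, mask]; dropping the low bits
// avoids mapping neighboring words of the same object to distinct hot slots.
inline unsigned lock_index_sketch(const void* ptr) {
  return static_cast<unsigned>(reinterpret_cast<std::uintptr_t>(ptr) >> 2)
         & SPACE_ATOMIC_MASK_SKETCH;
}
int main() {
  int x = 0;
  std::printf("lock slot for &x: %u\n", lock_index_sketch(&x));
  return 0;
}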
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
static void* ptr = NULL;
static size_t current_size = 0;
if(current_size == 0) {
current_size = bytes;
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
}
if(bytes > current_size) {
current_size = bytes;
ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(ptr,current_size);
}
if((bytes < current_size) && (force_shrink)) {
current_size = bytes;
Kokkos::kokkos_free<Kokkos::CudaSpace>(ptr);
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
}
return ptr;
}
}
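The scratch buffer above is deliberately sticky: it grows on demand, is reused for smaller requests, and only shrinks when the caller passes force_shrink. A hedged usage sketch of this internal function (assuming its declaration is visible from the internal Cuda headers and Kokkos has been initialized):
#include <Kokkos_Core.hpp>
// Sketch only: cuda_resize_scratch_space is an internal Impl function;
// this assumes Kokkos::initialize() has already been called.
void scratch_space_example() {
  void* p = Kokkos::Impl::cuda_resize_scratch_space(1 << 20, false); // grow to 1 MiB
  void* q = Kokkos::Impl::cuda_resize_scratch_space(4096, false);    // fits: no reallocation
  // p == q here, since the 4 KiB request reuses the existing 1 MiB buffer.
  Kokkos::Impl::cuda_resize_scratch_space(4096, true);               // explicit shrink to 4 KiB
  (void)p; (void)q;
}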

View File

@ -50,7 +50,6 @@
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
namespace Kokkos {
namespace Impl {

View File

@ -1,198 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if ! KOKKOS_USING_EXP_VIEW
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <sstream>
namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
TextureAttribute::TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
)
: m_tex_obj(0)
{
cuda_device_synchronize();
struct cudaResourceDesc resDesc ;
struct cudaTextureDesc texDesc ;
memset( & resDesc , 0 , sizeof(resDesc) );
memset( & texDesc , 0 , sizeof(texDesc) );
resDesc.resType = cudaResourceTypeLinear ;
resDesc.res.linear.desc = desc ;
resDesc.res.linear.sizeInBytes = alloc_size ;
resDesc.res.linear.devPtr = alloc_ptr ;
CUDA_SAFE_CALL( cudaCreateTextureObject( & m_tex_obj , & resDesc, & texDesc, NULL) );
cuda_device_synchronize();
}
TextureAttribute::~TextureAttribute()
{
if (m_tex_obj) {
cudaDestroyTextureObject( m_tex_obj );
}
}
/*--------------------------------------------------------------------------*/
void * CudaMallocAllocator::allocate( size_t size )
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMalloc( &ptr, size ) );
return ptr;
}
void CudaMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFree( ptr ) );
} catch(...) {}
}
void * CudaMallocAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
void * CudaUVMAllocator::allocate( size_t size )
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION )
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, size, cudaMemAttachGlobal ) );
return ptr;
#else
throw_runtime_exception( "CUDA VERSION does not support UVM" );
return NULL;
#endif
}
void CudaUVMAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFree( ptr ) );
} catch(...) {}
}
void * CudaUVMAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
void * CudaHostAllocator::allocate( size_t size )
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaHostAlloc( &ptr , size , cudaHostAllocDefault ) );
return ptr;
}
void CudaHostAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFreeHost( ptr ) );
} catch(...) {}
}
void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyHostToHost ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif /* #if ! KOKKOS_USING_EXP_VIEW */

View File

@ -1,190 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
#define KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
#include <Kokkos_Macros.hpp>
#if ! KOKKOS_USING_EXP_VIEW
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
namespace Kokkos { namespace Impl {
// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t'
// to be an 'unsigned long long'. This could change with
// future versions of Cuda and this typedef would have to
// change accordingly.
#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
typedef enable_if<
sizeof(::cudaTextureObject_t) == sizeof(const void *) ,
::cudaTextureObject_t >::type cuda_texture_object_type ;
#else
typedef const void * cuda_texture_object_type ;
#endif
struct TextureAttribute : public AllocatorAttributeBase
{
cuda_texture_object_type m_tex_obj ;
TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
);
~TextureAttribute();
};
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedAllocator
{
static const char * name()
{
return "Cuda Unmanaged Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedUVMAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedUVMAllocator
{
static const char * name()
{
return "Cuda Unmanaged UVM Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedHostAllocator
/// does nothing when deallocate(ptr,size) is called
class CudaUnmanagedHostAllocator
{
public:
static const char * name()
{
return "Cuda Unmanaged Host Allocator";
}
// Unmanaged deallocate does nothing
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
};
/// class CudaMallocAllocator
class CudaMallocAllocator
{
public:
static const char * name()
{
return "Cuda Malloc Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaUVMAllocator
class CudaUVMAllocator
{
public:
static const char * name()
{
return "Cuda UVM Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaHostAllocator
class CudaHostAllocator
{
public:
static const char * name()
{
return "Cuda Host Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP

View File

@ -51,8 +51,8 @@
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
/*--------------------------------------------------------------------------*/
/* Standard 'C' libraries */
@ -70,7 +70,7 @@ __device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
__device__ __constant__
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
#endif
@ -190,7 +190,7 @@ namespace {
class CudaInternalDevices {
public:
enum { MAXIMUM_DEVICE_COUNT = 64 };
struct cudaDeviceProp m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ;
int m_cudaDevCount ;
@ -206,6 +206,9 @@ CudaInternalDevices::CudaInternalDevices()
CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
if(m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
}
for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
}
@ -226,14 +229,6 @@ private:
CudaInternal( const CudaInternal & );
CudaInternal & operator = ( const CudaInternal & );
#if ! KOKKOS_USING_EXP_VIEW
AllocationTracker m_scratchFlagsTracker;
AllocationTracker m_scratchSpaceTracker;
AllocationTracker m_scratchUnifiedTracker;
#endif
public:
@ -255,6 +250,8 @@ public:
size_type * m_scratchUnified ;
cudaStream_t * m_stream ;
static int was_initialized;
static int was_finalized;
static CudaInternal & singleton();
@ -293,6 +290,8 @@ public:
size_type * scratch_unified( const size_type size );
};
int CudaInternal::was_initialized = 0;
int CudaInternal::was_finalized = 0;
//----------------------------------------------------------------------------
@ -367,6 +366,10 @@ CudaInternal & CudaInternal::singleton()
void CudaInternal::initialize( int cuda_device_id , int stream_count )
{
if ( was_finalized ) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
was_initialized = 1;
if ( is_initialized() ) return;
enum { WordSize = sizeof(size_type) };
if ( ! HostSpace::execution_space::is_initialized() ) {
@ -526,11 +529,14 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
// Init the array used for arbitrarily sized atomics
Impl::init_lock_arrays_cuda_space();
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
}
@ -548,14 +554,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! KOKKOS_USING_EXP_VIEW
m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount );
m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr());
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
@ -566,9 +564,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
#endif
CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
}
@ -582,14 +577,6 @@ CudaInternal::scratch_space( const Cuda::size_type size )
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! KOKKOS_USING_EXP_VIEW
m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount );
m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr());
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
@ -599,9 +586,6 @@ CudaInternal::scratch_space( const Cuda::size_type size )
Record::increment( r );
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
#endif
}
return m_scratchSpace ;
@ -615,14 +599,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! KOKKOS_USING_EXP_VIEW
m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount );
m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() );
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
@ -632,9 +608,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
Record::increment( r );
m_scratchUnified = reinterpret_cast<size_type *>( r->data() );
#endif
}
return m_scratchUnified ;
@ -644,9 +617,13 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
void CudaInternal::finalize()
{
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
atomic_lock_array_cuda_space_ptr(false);
scratch_lock_array_cuda_space_ptr(false);
threadid_lock_array_cuda_space_ptr(false);
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
cudaStreamDestroy( m_stream[i] );
@ -655,14 +632,6 @@ void CudaInternal::finalize()
::free( m_stream );
}
#if ! KOKKOS_USING_EXP_VIEW
m_scratchSpaceTracker.clear();
m_scratchFlagsTracker.clear();
m_scratchUnifiedTracker.clear();
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
@ -670,8 +639,6 @@ void CudaInternal::finalize()
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
#endif
m_cudaDev = -1 ;
m_multiProcCount = 0 ;
m_maxWarpCount = 0 ;
@ -730,7 +697,13 @@ int Cuda::is_initialized()
{ return Impl::CudaInternal::singleton().is_initialized(); }
void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
{
Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
std::vector<unsigned>
Cuda::detect_device_arch()
@ -763,7 +736,13 @@ Cuda::size_type Cuda::device_arch()
}
void Cuda::finalize()
{
Impl::CudaInternal::singleton().finalize();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
Cuda::Cuda()
: m_device( Impl::CudaInternal::singleton().m_cudaDev )

View File

@ -57,17 +57,20 @@ template<class DriverType, bool Large>
struct CudaGetMaxBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
}
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;
int blockSize=32;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
@ -76,7 +79,8 @@ struct CudaGetMaxBlockSize<DriverType,true> {
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -91,11 +95,13 @@ struct CudaGetMaxBlockSize<DriverType,true> {
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;
int blockSize=32;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
@ -104,7 +110,8 @@ struct CudaGetMaxBlockSize<DriverType,false> {
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -123,13 +130,15 @@ template<class DriverType, bool Large>
struct CudaGetOptBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
}
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
int numBlocks;
int sharedmem;
@ -140,7 +149,8 @@ struct CudaGetOptBlockSize<DriverType,true> {
blockSize*=2;
//calculate the occupancy with that optBlockSize and check whether it's larger than the largest one found so far
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
@ -157,7 +167,8 @@ struct CudaGetOptBlockSize<DriverType,true> {
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
int numBlocks;
int sharedmem;
@ -166,7 +177,8 @@ struct CudaGetOptBlockSize<DriverType,false> {
while(blockSize<1024) {
blockSize*=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,

File diff suppressed because it is too large
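A worked example of the shared-memory formula introduced in the block-size search above: the extra request is now split into a once-per-block part and a per-team-member part. All input values below are assumed for illustration:
#include <cstdio>
int main() {
  const int blockSize          = 128;  // candidate block size (assumed)
  const int vector_length      = 4;    // vector lanes per team member (assumed)
  const int shmem_extra_block  = 512;  // bytes requested once per block (assumed)
  const int shmem_extra_thread = 16;   // bytes requested per team member (assumed)
  const int functor_shmem      = 1024; // FunctorTeamShmemSize value (assumed)
  const int members_per_block = blockSize / vector_length;     // 32
  const int sharedmem = shmem_extra_block
                      + shmem_extra_thread * members_per_block // 512
                      + functor_shmem;
  std::printf("sharedmem = %d bytes\n", sharedmem);            // 2048
  return 0;
}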

View File

@ -130,16 +130,17 @@ inline void cuda_intra_block_reduction( ValueType& value,
cuda_inter_warp_reduction(value,join,max_active_thread);
}
template< class FunctorType , class JoinOp , class ArgTag = void >
__device__
bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgTag >::reference_type value,
typename FunctorValueTraits< FunctorType , ArgTag >::reference_type neutral,
const JoinOp& join,
Cuda::size_type * const m_scratch_space,
typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
Cuda::size_type * const m_scratch_flags,
const int max_active_thread = blockDim.y) {
typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
//Do the intra-block reduction with shfl operations and static shared memory
cuda_intra_block_reduction(value,join,max_active_thread);
@ -170,7 +171,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void
if(id == 0)
*m_scratch_flags = 0;
last_block = true;
value = neutral;
pointer_type const volatile global = (pointer_type) m_scratch_space ;
@ -366,7 +367,12 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
size_type * const global = global_data + word_count.value * block_id ;
#if (__CUDA_ARCH__ < 500)
for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
#else
for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
#endif
}
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
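The new neutral argument exists because seeding the last block's threads with 0 is only correct for sum reductions; the seed must be the identity element of the join operation. A minimal host-side illustration with a min-reduction:
#include <algorithm>
#include <climits>
#include <cstdio>
int main() {
  const int data[4] = { 7, 3, 9, 5 };
  int seeded_zero    = 0;        // wrong seed: a min can never rise above 0
  int seeded_neutral = INT_MAX;  // identity element of min
  for (int v : data) {
    seeded_zero    = std::min(seeded_zero, v);
    seeded_neutral = std::min(seeded_neutral, v);
  }
  std::printf("seed 0 -> %d (wrong), seed INT_MAX -> %d (correct)\n",
              seeded_zero, seeded_neutral);
  return 0;
}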

View File

@ -0,0 +1,179 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
__device__
void TaskQueueSpecialization< Kokkos::Cuda >::driver
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
{
using Member = TaskExec< Kokkos::Cuda > ;
using Queue = TaskQueue< Kokkos::Cuda > ;
using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec( 1 );
Member team_exec( blockDim.y );
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
union {
task_root_type * ptr ;
int raw[2] ;
} task ;
// Loop until all queues are empty and no tasks in flight
do {
// Each team lead attempts to acquire either a thread team task
// or collection of single thread tasks for the team.
if ( 0 == warp_lane ) {
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
task.ptr = Queue::pop_task( & queue->m_ready[i][j] );
}
}
#if 0
printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
, uintptr_t(task.ptr));
#endif
}
// shuffle broadcast
task.raw[0] = __shfl( task.raw[0] , 0 );
task.raw[1] = __shfl( task.raw[1] , 0 );
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
if ( end != task.ptr ) {
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
// Thread Team Task
(*task.ptr->m_apply)( task.ptr , & team_exec );
}
else if ( 0 == threadIdx.y ) {
// Single Thread Task
(*task.ptr->m_apply)( task.ptr , & single_exec );
}
if ( 0 == warp_lane ) {
queue->complete( task.ptr );
}
}
} while(1);
}
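The union plus two 32-bit __shfl calls in the driver above is how a 64-bit task pointer is broadcast from lane 0 to the rest of the warp, since the pre-CUDA-9 __shfl used here only moves 32-bit values. A standalone sketch of the same trick; the kernel name and launch are illustrative:
#include <cstdio>
__global__ void broadcast_ptr_demo(int* p) {
  // Lane 0 holds the pointer; every other lane starts with null.
  union { int* ptr; int raw[2]; } task;
  task.ptr = (threadIdx.x == 0) ? p : 0;
  // Move the two 32-bit halves from lane 0 to the whole warp.
  task.raw[0] = __shfl(task.raw[0], 0);
  task.raw[1] = __shfl(task.raw[1], 0);
  if (threadIdx.x == 5 && task.ptr == p) printf("lane 5 received the pointer\n");
}
int main() {
  int* d = 0;
  cudaMalloc(&d, sizeof(int));
  broadcast_ptr_demo<<<1, 32>>>(d);
  cudaDeviceSynchronize();
  cudaFree(d);
  return 0;
}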
namespace {
__global__
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
}
void TaskQueueSpecialization< Kokkos::Cuda >::execute
( TaskQueue< Kokkos::Cuda > * const queue )
{
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared = 0 ;
const cudaStream_t stream = 0 ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
#if 0
printf("cuda_task_queue_execute before\n");
#endif
// Query the stack size, in bytes:
//
// size_t stack_size = 0 ;
// CUDA_SAFE_CALL( cudaDeviceGetLimit( & stack_size , cudaLimitStackSize ) );
//
// If not large enough then set the stack size, in bytes:
//
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
#if 0
printf("cuda_task_queue_execute after\n");
#endif
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -0,0 +1,519 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
#define KOKKOS_IMPL_CUDA_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
template< typename TaskType >
__global__
void set_cuda_task_base_apply_function_pointer
( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr )
{ *ptr = TaskType::apply ; }
}
template<>
class TaskQueueSpecialization< Kokkos::Cuda >
{
public:
using execution_space = Kokkos::Cuda ;
using memory_space = Kokkos::CudaUVMSpace ;
using queue_type = TaskQueue< execution_space > ;
static
void iff_single_thread_recursive_execute( queue_type * const ) {}
__device__
static void driver( queue_type * const );
static
void execute( queue_type * const );
template< typename FunctorType >
static
void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr )
{
using TaskType = TaskBase< execution_space
, typename FunctorType::value_type
, FunctorType > ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr);
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
}
};
extern template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
/**\brief Impl::TaskExec<Cuda> is the TaskPolicy<Cuda>::member_type
* passed to tasks running in a Cuda space.
*
* Cuda thread blocks for tasking are dimensioned:
* blockDim.x == vector length
* blockDim.y == team size
* blockDim.z == number of teams
* where
* blockDim.x * blockDim.y == WarpSize
*
* Both single thread and thread team tasks are run by a full Cuda warp.
* A single thread task is called by warp lane #0 and the remaining
* lanes of the warp are idle.
*/
template<>
class TaskExec< Kokkos::Cuda >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
const int m_team_size ;
__device__
TaskExec( int arg_team_size = blockDim.y )
: m_team_size( arg_team_size ) {}
public:
#if defined( __CUDA_ARCH__ )
__device__ void team_barrier() { /* __threadfence_block(); */ }
__device__ int team_rank() const { return threadIdx.y ; }
__device__ int team_size() const { return m_team_size ; }
#else
__host__ void team_barrier() {}
__host__ int team_rank() const { return 0 ; }
__host__ int team_size() const { return 0 ; }
#endif
};
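Given the dimensioning described in the comment above, a quick host-side consistency check of the launch geometry used by execute(): blockDim.x times blockDim.y must equal the warp size, and blockDim.z counts warps per block. The vector length chosen below is an assumption:
#include <cassert>
#include <cstdio>
int main() {
  const int WarpSize        = 32;                       // CudaTraits::WarpSize
  const int vector_length   = 8;                        // blockDim.x (assumed)
  const int team_size       = WarpSize / vector_length; // blockDim.y == 4
  const int warps_per_block = 4;                        // blockDim.z, as in execute()
  assert(vector_length * team_size == WarpSize);
  std::printf("block = (%d,%d,%d), %d threads per block\n",
              vector_length, team_size, warps_per_block,
              vector_length * team_size * warps_per_block);
  return 0;
}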
//----------------------------------------------------------------------------
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
{
typedef iType index_type;
const iType start ;
const iType end ;
const iType increment ;
const TaskExec< Kokkos::Cuda > & thread;
#if defined( __CUDA_ARCH__ )
__device__ inline
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
: start( threadIdx.y )
, end(arg_count)
, increment( blockDim.y )
, thread(arg_thread)
{}
__device__ inline
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread
, const iType & arg_start
, const iType & arg_end
)
: start( arg_start + threadIdx.y )
, end( arg_end)
, increment( blockDim.y )
, thread( arg_thread )
{}
#else
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread
, const iType & arg_start
, const iType & arg_end
);
#endif
};
//----------------------------------------------------------------------------
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
{
typedef iType index_type;
const iType start ;
const iType end ;
const iType increment ;
const TaskExec< Kokkos::Cuda > & thread;
#if defined( __CUDA_ARCH__ )
__device__ inline
ThreadVectorRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
: start( threadIdx.x )
, end(arg_count)
, increment( blockDim.x )
, thread(arg_thread)
{}
#else
ThreadVectorRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
#endif
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & start , const iType & end )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >(thread,start,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
, const iType & count )
{
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >& loop_boundaries
, const Lambda& lambda
)
{
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
}
}
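A hedged sketch of how a task body might use the TeamThreadRange overloads above; the functor, its members, and the member-type spelling are illustrative assumptions, not an API defined by this commit:
// Illustrative only: distribute i = 0..n-1 across the team running one task.
struct ScaleTask {
  double * data ;
  int      n ;
  template< typename MemberType >
  KOKKOS_INLINE_FUNCTION
  void operator()( MemberType & member ) const {
    Kokkos::parallel_for( Kokkos::TeamThreadRange( member , n )
                        , [&]( int i ) { data[i] *= 2.0 ; } );
  }
};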
// reduce across corresponding lanes between team members within warp
// assume stride*team_size == warp_size
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void strided_shfl_warp_reduction
(const JoinType& join,
ValueType& val,
int team_size,
int stride)
{
for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
join(val, Kokkos::shfl_down(val, lane_delta, team_size*stride));
}
}
// multiple within-warp non-strided reductions
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void multi_shfl_warp_reduction
(const JoinType& join,
ValueType& val,
int vec_length)
{
for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) {
join(val, Kokkos::shfl_down(val, lane_delta, vec_length));
}
}
// broadcast within warp
template< class ValueType >
KOKKOS_INLINE_FUNCTION
ValueType shfl_warp_broadcast
(ValueType& val,
int src_lane,
int width)
{
return Kokkos::shfl(val, src_lane, width);
}
// all-reduce across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
const JoinType& join,
ValueType& initialized_result) {
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
strided_shfl_warp_reduction<ValueType, JoinType>(
join,
initialized_result,
loop_boundaries.thread.team_size(),
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}
// all-reduce across corresponding vector lanes between team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result) {
//TODO what is the point of creating this temporary?
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
strided_shfl_warp_reduction(
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
initialized_result,
loop_boundaries.thread.team_size(),
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}
// all-reduce within team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
const JoinType& join,
ValueType& initialized_result) {
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}
// all-reduce within team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result) {
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
//initialized_result = multi_shfl_warp_reduction(
multi_shfl_warp_reduction(
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
initialized_result,
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}
// scan across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda) {
ValueType accum = 0 ;
ValueType val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
// intra-blockDim.y exclusive scan on 'val'
// accum = accumulated, sum in total for this iteration
// INCLUSIVE scan
for( int offset = blockDim.x ; offset < Impl::CudaTraits::WarpSize ; offset <<= 1 ) {
y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize);
if(threadIdx.y*blockDim.x >= offset) { val += y; }
}
// pass accum to all threads
local_total = shfl_warp_broadcast<ValueType>(val,
threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
Impl::CudaTraits::WarpSize);
// make EXCLUSIVE scan by shifting values over one
val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize);
if ( threadIdx.y == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
accum += local_total;
}
}
// scan within team member (vector) within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
// intra-blockDim.x exclusive scan on 'val'
// accum = accumulated, sum in total for this iteration
// INCLUSIVE scan
for( int offset = 1 ; offset < blockDim.x ; offset <<= 1 ) {
y = Kokkos::shfl_up(val, offset, blockDim.x);
if(threadIdx.x >= offset) { val += y; }
}
// pass accum to all threads
local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x);
// make EXCLUSIVE scan by shifting values over one
val = Kokkos::shfl_up(val, 1, blockDim.x);
if ( threadIdx.x == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
accum += local_total;
}
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */

View File

@ -46,9 +46,10 @@
#include <stdio.h>
#include <iostream>
#include <sstream>
#include <Kokkos_Core.hpp>
#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
// #define DETAILED_PRINT
@ -93,9 +94,8 @@ CudaTaskPolicyQueue
, const unsigned arg_team_size
)
: m_space( Kokkos::CudaUVMSpace()
, arg_task_max_size * arg_task_max_count * 1.2
, 16 /* log2(superblock size) */
, 1 /* only one level of memory pool */
)
, m_team { 0 , 0 , 0 }
, m_serial { 0 , 0 , 0 }
@ -172,6 +172,8 @@ if ( IS_TEAM_LEAD && 0 != team_task ) {
member( kokkos_impl_cuda_shared_memory<void>()
, 16 /* shared_begin */
, team_task->m_shmem_size /* shared size */
, 0 /* scratch level 1 pointer */
, 0 /* scratch level 1 size */
, 0 /* league rank */
, 1 /* league size */
);
@ -926,5 +928,5 @@ void Task::clear_dependence()
} /* namespace Kokkos */ } /* namespace Kokkos */
#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */ #endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@@ -47,19 +47,11 @@
 #define KOKKOS_CUDA_TASKPOLICY_HPP
 #include <Kokkos_Core_fwd.hpp>
-#if defined( KOKKOS_HAVE_CUDA ) && \
-    defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
-#define KOKKOS_ENABLE_CUDA_TASK_POLICY
-/* The TaskPolicy< Cuda > capability requires nvcc using the option:
- *   --relocatable-device-code=true
- */
 #include <Kokkos_Cuda.hpp>
 #include <Kokkos_TaskPolicy.hpp>
+#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
 //----------------------------------------------------------------------------
 namespace Kokkos {
@@ -81,8 +73,6 @@ public:
 private:
-friend struct CudaTaskPolicyQueue ;
 CudaTaskPolicyQueue * m_policy ;
 TaskMember * volatile * m_queue ;
 function_team_type m_team ; ///< Apply function on CUDA
@@ -819,9 +809,11 @@ public:
 static member_type member_single()
 {
 return
-  member_type( 0 /* shared memory */
-             , 0 /* shared memory begin */
-             , 0 /* shared memory size */
+  member_type( 0 /* shared memory pointer */
+             , 0 /* shared memory begin offset */
+             , 0 /* shared memory end offset */
+             , 0 /* scratch level_1 pointer */
+             , 0 /* scratch level_1 size */
              , 0 /* league rank */
              , 1 /* league size */ );
 }
@@ -832,10 +824,10 @@ public:
 } /* namespace Experimental */
 } /* namespace Kokkos */
-#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE ) */
 //----------------------------------------------------------------------------
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
 #endif /* #ifndef KOKKOS_CUDA_TASKPOLICY_HPP */

View File

@@ -56,8 +56,6 @@
 #include <impl/Kokkos_Shape.hpp>
 #include <Kokkos_View.hpp>
-#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -90,343 +88,6 @@ struct AssertShapeBoundsAbort< CudaSpace >
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
#if ! KOKKOS_USING_EXP_VIEW
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
// Any other scalar type falls back to either normal reads out of global memory,
// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
template< typename ValueType
, class MemorySpace
, class AliasType =
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 4 ) , int ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 8 ) , ::int2 ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 16 ) , ::int4 ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 32 ) , ::float4 ,void
>::type
>::type
>::type
>::type
>
class CudaTextureFetch {
private:
cuda_texture_object_type m_obj ;
const ValueType * m_alloc_ptr ;
int m_offset ;
void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
{
typedef char const * const byte;
m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
const size_t count = tracker.alloc_size() / sizeof(ValueType);
const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
if (ok_aligned && ok_contains) {
if (tracker.attribute() == NULL ) {
MemorySpace::texture_object_attach(
tracker
, sizeof(ValueType)
, cudaCreateChannelDesc< AliasType >()
);
}
m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
m_offset = arg_ptr - m_alloc_ptr;
}
else if( !ok_contains ) {
throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
}
else {
throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
}
}
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_alloc_ptr( rhs.m_alloc_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_alloc_ptr = rhs.m_alloc_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION explicit
CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
: m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
{
#if defined( KOKKOS_USE_LDG_INTRINSIC )
m_alloc_ptr(arg_ptr);
#elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
if ( arg_ptr != NULL ) {
if ( tracker.is_valid() ) {
attach( arg_ptr, tracker );
}
else {
AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
if ( found_tracker.is_valid() ) {
attach( arg_ptr, found_tracker );
} else {
throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
}
}
}
#endif
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
return *(reinterpret_cast<ValueType*> (&v));
#else
return m_alloc_ptr[ i + m_offset ];
#endif
}
};
template< typename ValueType, class MemorySpace >
class CudaTextureFetch< const ValueType, MemorySpace, float4 > {
private:
typedef float4 AliasType;
cuda_texture_object_type m_obj ;
const ValueType * m_alloc_ptr ;
int m_offset ;
void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
{
typedef char const * const byte;
m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
const size_t count = tracker.alloc_size() / sizeof(ValueType);
const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
if (ok_aligned && ok_contains) {
if (tracker.attribute() == NULL ) {
MemorySpace::texture_object_attach(
tracker
, sizeof(ValueType)
, cudaCreateChannelDesc< AliasType >()
);
}
m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
m_offset = arg_ptr - m_alloc_ptr;
}
else if( !ok_contains ) {
throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
}
else {
throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
}
}
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_alloc_ptr( rhs.m_alloc_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_alloc_ptr = rhs.m_alloc_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION explicit
CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
: m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
{
#if defined( KOKKOS_USE_LDG_INTRINSIC )
m_alloc_ptr(arg_ptr);
#elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
if ( arg_ptr != NULL ) {
if ( tracker.is_valid() ) {
attach( arg_ptr, tracker );
}
else {
AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
if ( found_tracker.is_valid() ) {
attach( arg_ptr, found_tracker );
} else {
throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
}
}
}
#endif
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
union Float4ValueType {
float4 f4[2];
ValueType val;
};
Float4ValueType convert;
convert.f4[0] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset) );
convert.f4[1] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset)+1 );
return convert.val;
#else
return m_alloc_ptr[ i + m_offset ];
#endif
}
};
template< typename ValueType, class MemorySpace >
class CudaTextureFetch< const ValueType, MemorySpace, void >
{
private:
const ValueType * m_ptr ;
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_ptr(0) {};
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const ValueType * ptr, const AllocationTracker & ) : m_ptr(ptr) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs ) : m_ptr(rhs.m_ptr) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) {
m_ptr = rhs.m_ptr;
return *this ;
}
explicit KOKKOS_INLINE_FUNCTION
CudaTextureFetch( ValueType * const base_view_ptr, AllocationTracker const & /*tracker*/ ) {
m_ptr = base_view_ptr;
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = (const ValueType* base_view_ptr) {
m_ptr = base_view_ptr;
return *this;
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
return m_ptr[ i ];
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
* if 'const' value type, CudaSpace and random access.
*/
template< class ViewTraits >
class ViewDataHandle< ViewTraits ,
typename enable_if< ( is_same< typename ViewTraits::memory_space,CudaSpace>::value ||
is_same< typename ViewTraits::memory_space,CudaUVMSpace>::value )
&&
is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value
&&
ViewTraits::memory_traits::RandomAccess
>::type >
{
public:
enum { ReturnTypeIsReference = false };
typedef Impl::CudaTextureFetch< typename ViewTraits::value_type
, typename ViewTraits::memory_space> handle_type;
KOKKOS_INLINE_FUNCTION
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & arg_tracker )
{
return handle_type(arg_data_ptr, arg_tracker);
}
typedef typename ViewTraits::value_type return_type;
};
}
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif // KOKKOS_HAVE_CUDA
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */

View File

@ -0,0 +1,611 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>
#include <initializer_list>
#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
#define KOKKOS_MDRANGE_IVDEP
#endif
namespace Kokkos { namespace Experimental {
enum class Iterate
{
Default, // Default for the device
Left, // Left indices stride fastest
Right, // Right indices stride fastest
Flat, // Do not tile, only valid for inner direction
};
template <typename ExecSpace>
struct default_outer_direction
{
using type = Iterate;
static constexpr Iterate value = Iterate::Right;
};
template <typename ExecSpace>
struct default_inner_direction
{
using type = Iterate;
static constexpr Iterate value = Iterate::Right;
};
// Iteration Pattern
template < unsigned N
, Iterate OuterDir = Iterate::Default
, Iterate InnerDir = Iterate::Default
>
struct Rank
{
static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
static_assert( N < 4u, "Kokkos Error: Unsupported rank...");
using iteration_pattern = Rank<N, OuterDir, InnerDir>;
static constexpr int rank = N;
static constexpr Iterate outer_direction = OuterDir;
static constexpr Iterate inner_direction = InnerDir;
};
// multi-dimensional iteration pattern
template <typename... Properties>
struct MDRangePolicy
{
using range_policy = RangePolicy<Properties...>;
static_assert( !std::is_same<range_policy,void>::value
, "Kokkos Error: MD iteration pattern not defined" );
using iteration_pattern = typename range_policy::iteration_pattern;
using work_tag = typename range_policy::work_tag;
static constexpr int rank = iteration_pattern::rank;
static constexpr int outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat)
? iteration_pattern::outer_direction
: default_outer_direction< typename range_policy::execution_space>::value );
static constexpr int inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename range_policy::execution_space>::value ) ;
// Ugly workaround for Intel 14 not handling scoped enums correctly
static constexpr int Flat = static_cast<int>( Iterate::Flat );
static constexpr int Right = static_cast<int>( Iterate::Right );
using size_type = typename range_policy::index_type;
using index_type = typename std::make_signed<size_type>::type;
template <typename I>
MDRangePolicy( std::initializer_list<I> upper_corner )
{
static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( upper_corner.size() == rank, "Kokkos Error: upper_corner has incorrect rank" );
const auto u = upper_corner.begin();
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(0);
m_dim[i] = static_cast<index_type>(u[i]);
if (inner_direction != Flat) {
// default tile size to 4
m_tile[i] = 4;
} else {
m_tile[i] = 1;
}
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
template <typename IA, typename IB>
MDRangePolicy( std::initializer_list<IA> corner_a
, std::initializer_list<IB> corner_b
)
{
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
using A = typename std::make_signed<IA>::type;
using B = typename std::make_signed<IB>::type;
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
if (inner_direction != Flat) {
// default tile size to 4
m_tile[i] = 4;
} else {
m_tile[i] = 1;
}
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
template <typename IA, typename IB, typename T>
MDRangePolicy( std::initializer_list<IA> corner_a
, std::initializer_list<IB> corner_b
, std::initializer_list<T> tile
)
{
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" );
static_assert( inner_direction != Flat, "Kokkos Error: tiling not supported with flat iteration" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
//static_assert( tile.size() == rank, "Kokkos Error: tile has incorrect rank" );
using A = typename std::make_signed<IA>::type;
using B = typename std::make_signed<IB>::type;
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
const auto t = tile.begin();
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
m_tile[i] = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 );
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
index_type m_offset[rank];
index_type m_dim[rank];
int m_tile[rank];
index_type m_tile_dim[rank];
size_type m_num_tiles; // product of tile dims
};
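// Worked example of the tiling arithmetic above (illustrative comment,
// not part of the original header): for a rank-2 policy over {10,15}
// with tiles {4,4},
//   m_dim       = {10,15}
//   m_tile      = { 4, 4}
//   m_tile_dim  = { 3, 4}        // ceil(10/4) = 3 , ceil(15/4) = 4
//   m_num_tiles = 3 * 4 = 12     // flat 1-D dispatch range is [0,12)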
namespace Impl {
// Serial, Threads, OpenMP
// use enable_if to overload for Cuda
template < typename MDRange, typename Functor, typename Enable = void >
struct MDForFunctor
{
using work_tag = typename MDRange::work_tag;
using index_type = typename MDRange::index_type;
using size_type = typename MDRange::size_type;
MDRange m_range;
Functor m_func;
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange const& range, Functor const& f )
: m_range(range)
, m_func( f )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange const& range, Functor && f )
: m_range(range)
, m_func( std::forward<Functor>(f) )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange && range, Functor const& f )
: m_range( std::forward<MDRange>(range) )
, m_func( f )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange && range, Functor && f )
: m_range( std::forward<MDRange>(range) )
, m_func( std::forward<Functor>(f) )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDForFunctor const& ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor& operator=( MDForFunctor const& ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDForFunctor && ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor& operator=( MDForFunctor && ) = default;
// Rank-2, Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
m_func( m_range.m_offset[0] + ( t / m_range.m_dim[1] )
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
} else {
m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] )
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
}
}
// Rank-2, Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] )
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
} else {
m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] )
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
}
}
// Rank-2, Not Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
index_type t0, t1;
if ( MDRange::outer_direction == MDRange::Right ) {
t0 = t / m_range.m_tile_dim[1];
t1 = t % m_range.m_tile_dim[1];
} else {
t0 = t % m_range.m_tile_dim[0];
t1 = t / m_range.m_tile_dim[0];
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i1=b1; i1<e1; ++i1) {
m_func( i0, i1 );
}}
} else {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( i0, i1 );
}}
}
}
// Rank-2, Not Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
work_tag tag;
index_type t0, t1;
if ( MDRange::outer_direction == MDRange::Right ) {
t0 = t / m_range.m_tile_dim[1];
t1 = t % m_range.m_tile_dim[1];
} else {
t0 = t % m_range.m_tile_dim[0];
t1 = t / m_range.m_tile_dim[0];
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i1=b1; i1<e1; ++i1) {
m_func( tag, i0, i1 );
}}
} else {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( tag, i0, i1 );
}}
}
}
//---------------------------------------------------------------------------
// Rank-3, Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
m_func( m_range.m_offset[0] + ( t / tmp_prod )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
);
} else {
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
, m_range.m_offset[2] + ( t / tmp_prod )
);
}
}
// Rank-3, Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
m_func( work_tag{}
, m_range.m_offset[0] + ( t / tmp_prod )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
);
} else {
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
m_func( work_tag{}
, m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
, m_range.m_offset[2] + ( t / tmp_prod )
);
}
}
// Rank-3, Not Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
index_type t0, t1, t2;
if ( MDRange::outer_direction == MDRange::Right ) {
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
t0 = t / tmp_prod;
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
} else {
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
t2 = t / tmp_prod;
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i2=b2; i2<e2; ++i2) {
m_func( i0, i1, i2 );
}}}
} else {
for (int i2=b2; i2<e2; ++i2) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( i0, i1, i2 );
}}}
}
}
// Rank-3, Not Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
work_tag tag;
index_type t0, t1, t2;
if ( MDRange::outer_direction == MDRange::Right ) {
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
t0 = t / tmp_prod;
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
} else {
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
t2 = t / tmp_prod;
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i2=b2; i2<e2; ++i2) {
m_func( tag, i0, i1, i2 );
}}}
} else {
for (int i2=b2; i2<e2; ++i2) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( tag, i0, i1, i2 );
}}}
}
}
};
} // namespace Impl
template <typename MDRange, typename Functor>
void md_parallel_for( MDRange const& range
, Functor const& f
, const std::string& str = ""
)
{
Impl::MDForFunctor<MDRange, Functor> g(range, f);
using range_policy = typename MDRange::range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
template <typename MDRange, typename Functor>
void md_parallel_for( const std::string& str
, MDRange const& range
, Functor const& f
)
{
Impl::MDForFunctor<MDRange, Functor> g(range, f);
using range_policy = typename MDRange::range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
}} // namespace Kokkos::Experimental
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
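A minimal usage sketch of the new interface (not from the commit; it assumes Rank<2> is accepted as a RangePolicy property by the policy-traits machinery included above, and that a host backend is enabled):

using Kokkos::Experimental::MDRangePolicy;
using Kokkos::Experimental::Rank;
using Kokkos::Experimental::md_parallel_for;

// Rank-2 iteration over [0,128) x [0,128) with 8x8 tiles; the functor
// receives one (i,j) index pair per invocation.
MDRangePolicy< Rank<2> > policy( {0,0} , {128,128} , {8,8} );

md_parallel_for( policy , KOKKOS_LAMBDA( const int i , const int j ) {
  /* ... body ... */
} );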

File diff suppressed because it is too large

View File

@@ -121,13 +121,22 @@
 return *this;
 }
-//! Assignment operator.
+/// \brief Assignment operator, for volatile <tt>*this</tt> and
+/// nonvolatile input.
+///
+/// \param src [in] Input; right-hand side of the assignment.
+///
+/// This operator returns \c void instead of <tt>volatile
+/// complex<RealType>& </tt>. See Kokkos Issue #177 for the
+/// explanation. In practice, this means that you should not chain
+/// assignments with volatile lvalues.
 template<class InputRealType>
 KOKKOS_INLINE_FUNCTION
-volatile complex<RealType>& operator= (const complex<InputRealType>& src) volatile {
+void operator= (const complex<InputRealType>& src) volatile {
   re_ = src.re_;
   im_ = src.im_;
-  return *this;
+  // We deliberately do not return anything here. See explanation
+  // in public documentation above.
 }
 //! Assignment operator.
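The practical consequence of the void return, sketched (hypothetical snippet, not from the commit):

volatile Kokkos::complex<double> a , b ;
Kokkos::complex<double> c( 1.0 , 2.0 );

b = c ;          // fine: volatile assignment, result discarded
// a = (b = c);  // no longer compiles: the volatile operator= returns
                 // void, so the chained assignment has nothing to read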

View File

@@ -41,81 +41,38 @@
 //@HEADER
 */
-#ifndef KOKKOS_BASIC_ALLOCATORS_HPP
-#define KOKKOS_BASIC_ALLOCATORS_HPP
-#if ! KOKKOS_USING_EXP_VIEW
-namespace Kokkos { namespace Impl {
-/// class UnmanagedAllocator
-/// does nothing when deallocate(ptr,size) is called
-class UnmanagedAllocator
-{
-public:
-  static const char * name() { return "Unmanaged Allocator"; }
-  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
-};
-/// class MallocAllocator
-class MallocAllocator
-{
-public:
-  static const char * name()
-  {
-    return "Malloc Allocator";
-  }
-  static void* allocate(size_t size);
-  static void deallocate(void * ptr, size_t size);
-  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
-};
-/// class AlignedAllocator
-/// memory aligned to Kokkos::Impl::MEMORY_ALIGNMENT
-class AlignedAllocator
-{
-public:
-  static const char * name()
-  {
-    return "Aligned Allocator";
-  }
-  static void* allocate(size_t size);
-  static void deallocate(void * ptr, size_t size);
-  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
-};
-/// class PageAlignedAllocator
-/// memory aligned to PAGE_SIZE
-class PageAlignedAllocator
-{
-public:
-  static const char * name()
-  {
-    return "Page Aligned Allocator";
-  }
-  static void* allocate(size_t size);
-  static void deallocate(void * ptr, size_t size);
-  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
-};
-}} // namespace Kokkos::Impl
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-#endif //KOKKOS_BASIC_ALLOCATORS_HPP
+#ifndef KOKKOS_CORE_CONCEPTS_HPP
+#define KOKKOS_CORE_CONCEPTS_HPP
+#include <type_traits>
+namespace Kokkos {
+//Schedules for Execution Policies
+struct Static {};
+struct Dynamic {};
+//Schedule Wrapper Type
+template<class T>
+struct Schedule
+{
+  static_assert( std::is_same<T,Static>::value
+              || std::is_same<T,Dynamic>::value
+               , "Kokkos: Invalid Schedule<> type."
+               );
+  using schedule_type = Schedule<T>;
+  using type = T;
+};
+//Specify Iteration Index Type
+template<typename T>
+struct IndexType
+{
+  static_assert(std::is_integral<T>::value,"Kokkos: Invalid IndexType<>.");
+  using index_type = IndexType<T>;
+  using type = T;
+};
+} // namespace Kokkos
+#endif // KOKKOS_CORE_CONCEPTS_HPP
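A usage sketch of the new concept wrappers (illustrative, not from the commit):

// A dynamically scheduled range policy whose iteration index is a
// 64-bit signed integer; both wrappers are consumed as policy
// properties.
typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>
                           , Kokkos::IndexType<int64_t> > policy_type ;

Kokkos::parallel_for( policy_type( 0 , n ) ,
  KOKKOS_LAMBDA( const int64_t i ) { /* ... */ } );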

View File

@@ -159,8 +159,6 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
 } // namespace Kokkos
-#if KOKKOS_USING_EXP_VIEW
 namespace Kokkos {
 using Kokkos::Experimental::kokkos_malloc ;
@@ -169,76 +167,6 @@ using Kokkos::Experimental::kokkos_free ;
 }
#else
namespace Kokkos {
namespace Impl {
// should only by used by kokkos_malloc and kokkos_free
struct MallocHelper
{
static void increment_ref_count( AllocationTracker const & tracker )
{
tracker.increment_ref_count();
}
static void decrement_ref_count( AllocationTracker const & tracker )
{
tracker.decrement_ref_count();
}
};
} // namespace Impl
/* Allocate memory from a memory space.
* The allocation is tracked in Kokkos memory tracking system, so
* leaked memory can be identified.
*/
template< class Arg = DefaultExecutionSpace>
void* kokkos_malloc(const std::string label, size_t count) {
if(count == 0) return NULL;
typedef typename Arg::memory_space MemorySpace;
Impl::AllocationTracker tracker = MemorySpace::allocate_and_track(label,count);;
Impl::MallocHelper::increment_ref_count( tracker );
return tracker.alloc_ptr();
}
template< class Arg = DefaultExecutionSpace>
void* kokkos_malloc(const size_t& count) {
return kokkos_malloc<Arg>("DefaultLabel",count);
}
/* Free memory from a memory space.
*/
template< class Arg = DefaultExecutionSpace>
void kokkos_free(const void* ptr) {
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(ptr);
if (tracker.is_valid()) {
Impl::MallocHelper::decrement_ref_count( tracker );
}
}
template< class Arg = DefaultExecutionSpace>
void* kokkos_realloc(const void* old_ptr, size_t size) {
if(old_ptr == NULL)
return kokkos_malloc<Arg>(size);
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr);
tracker.reallocate(size);
return tracker.alloc_ptr();
}
} // namespace Kokkos
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
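For reference, a short sketch of the surviving interface (illustrative; assumes the default memory-space overloads of the Experimental functions aliased above):

// Allocate 100 doubles through the labeled, tracked allocator and
// release them; the label aids Kokkos' leak reporting.
void * raw = Kokkos::kokkos_malloc( "my_buffer" , 100 * sizeof(double) );
double * p = static_cast<double*>( raw );
/* ... use p inside kernels ... */
Kokkos::kokkos_free( p );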

View File

@@ -69,6 +69,9 @@ namespace {
 /**\brief Token to indicate that a parameter's value is to be automatically selected */
 constexpr AUTO_t AUTO = Kokkos::AUTO_t();
 }
+
+struct InvalidType {};
+
 }
 //----------------------------------------------------------------------------
@@ -225,7 +228,7 @@ template< class FunctorType , class ExecPolicy , class ExecutionSpace =
 ///
 /// This is an implementation detail of parallel_reduce. Users should
 /// skip this and go directly to the nonmember function parallel_reduce.
-template< class FunctorType , class ExecPolicy , class ExecutionSpace =
+template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
   typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
   > class ParallelReduce ;

View File

@@ -56,11 +56,14 @@
 #include <Kokkos_CudaSpace.hpp>
 #include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskPolicy.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Tags.hpp>
+#include <KokkosExp_MDRangePolicy.hpp>
 /*--------------------------------------------------------------------------*/
 namespace Kokkos {
@@ -257,10 +260,10 @@ struct VerifyExecutionCanAccessMemorySpace
 #include <Cuda/Kokkos_CudaExec.hpp>
 #include <Cuda/Kokkos_Cuda_View.hpp>
-#include <KokkosExp_View.hpp>
 #include <Cuda/KokkosExp_Cuda_View.hpp>
 #include <Cuda/Kokkos_Cuda_Parallel.hpp>
+#include <Cuda/Kokkos_Cuda_Task.hpp>
 //----------------------------------------------------------------------------

View File

@@ -54,10 +54,7 @@
 #include <Kokkos_HostSpace.hpp>
-#include <impl/Kokkos_AllocationTracker.hpp>
 #include <Cuda/Kokkos_Cuda_abort.hpp>
-#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
 /*--------------------------------------------------------------------------*/
@@ -77,33 +74,6 @@ public:
 /*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::CudaMallocAllocator allocator;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/*--------------------------------*/
/** \brief Cuda specific function to attached texture object to an allocation.
* Output the texture object, base pointer, and offset from the input pointer.
*/
#if defined( __CUDACC__ )
static void texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
);
#endif
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
 CudaSpace();
 CudaSpace( CudaSpace && rhs ) = default ;
 CudaSpace( const CudaSpace & rhs ) = default ;
@@ -137,7 +107,7 @@ namespace Impl {
 /// where the hash value is derived from the address of the
 /// object for which an atomic operation is performed.
 /// This function initializes the locks to zero (unset).
-void init_lock_array_cuda_space();
+void init_lock_arrays_cuda_space();
 /// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
 ///
@@ -146,7 +116,23 @@ void init_lock_array_cuda_space();
 /// object for which an atomic operation is performed.
 /// This function retrieves the lock array pointer.
 /// If the array is not yet allocated it will do so.
-int* lock_array_cuda_space_ptr(bool deallocate = false);
+int* atomic_lock_array_cuda_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the scratch array for team and thread private global memory.
+///
+/// Team and Thread private scratch allocations in
+/// global memory are acquired via locks.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* scratch_lock_array_cuda_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the scratch array for unique identifiers.
+///
+/// Unique identifiers in the range 0-Cuda::concurrency
+/// are provided via locks.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* threadid_lock_array_cuda_space_ptr(bool deallocate = false);
 }
 } // namespace Kokkos
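The lock-array idea behind these declarations, sketched as device code (illustrative only, not the library's implementation; the function and parameter names here are hypothetical):

// Hash the target address into a fixed pool of device-resident locks,
// acquire, apply the guarded update, then release. The while/if shape
// keeps divergent lanes retrying without deadlocking the warp.
__device__ void locked_update( double * addr , double v ,
                               int * locks , int lock_count )
{
  const int idx = (int)( ( (size_t) addr >> 2 ) % lock_count ); // address hash
  bool done = false ;
  while ( ! done ) {
    if ( 0 == atomicCAS( locks + idx , 0 , 1 ) ) {  // try to acquire
      *addr += v ;                                  // guarded read-modify-write
      __threadfence();                              // publish before release
      atomicExch( locks + idx , 0 );                // release
      done = true ;
    }
  }
}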
@@ -172,33 +158,6 @@ public:
 /*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::CudaUVMAllocator allocator;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/** \brief Cuda specific function to attached texture object to an allocation.
* Output the texture object, base pointer, and offset from the input pointer.
*/
#if defined( __CUDACC__ )
static void texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
);
#endif
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
 CudaUVMSpace();
 CudaUVMSpace( CudaUVMSpace && rhs ) = default ;
 CudaUVMSpace( const CudaUVMSpace & rhs ) = default ;
@@ -242,22 +201,6 @@ public:
 /*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::CudaHostAllocator allocator ;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
 CudaHostPinnedSpace();
 CudaHostPinnedSpace( CudaHostPinnedSpace && rhs ) = default ;
 CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ;

View File

@@ -47,167 +47,15 @@
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_AnalyzePolicy.hpp>
+#include <Kokkos_Concepts.hpp>
 #include <iostream>
 //----------------------------------------------------------------------------
 namespace Kokkos {
//Schedules for Execution Policies
struct Static {
};
struct Dynamic {
};
//Schedule Wrapper Type
template<class ScheduleType>
struct Schedule {
static_assert(std::is_same<ScheduleType,Static>::value ||
std::is_same<ScheduleType,Dynamic>::value,
"Kokkos: Invalid Schedule<> type.");
typedef Schedule<ScheduleType> schedule_type;
typedef ScheduleType type;
};
//Specify Iteration Index Type
template<typename iType>
struct IndexType {
static_assert(std::is_integral<iType>::value,"Kokkos: Invalid IndexType<>.");
typedef IndexType<iType> index_type;
typedef iType type;
};
namespace Impl {
template<class Arg>
struct is_schedule_type {
enum { value = 0};
};
template<class ScheduleType>
struct is_schedule_type<Schedule<ScheduleType> > {
enum {value = 1 };
};
template<class Arg>
struct is_index_type {
enum { value = 0 };
};
template<typename iType>
struct is_index_type<IndexType<iType> > {
enum { value = 1 };
};
template<typename Arg>
struct is_tag_type {
enum { value = !(is_execution_space<Arg>::value ||
is_schedule_type<Arg>::value ||
is_index_type<Arg>::value ||
std::is_integral<Arg>::value)};
};
//Policy Traits
template<class ... Properties>
struct PolicyTraits;
template<>
struct PolicyTraits<void> {
typedef void execution_space;
typedef void schedule_type;
typedef void index_type;
typedef void tag_type;
};
//Strip off ExecutionSpace
template<class ExecutionSpace, class ... Props>
struct PolicyTraits<typename std::enable_if<is_execution_space<ExecutionSpace>::value >::type,ExecutionSpace,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::execution_space, void>::value,
"ExecutionPolicy: Only one execution space template argument may be used.");
typedef ExecutionSpace execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off ScheduleType
template<class ScheduleType, class ... Props>
struct PolicyTraits<typename std::enable_if<is_schedule_type<Schedule<ScheduleType> >::value >::type,Schedule<ScheduleType>,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::schedule_type, void>::value,
"ExecutionPolicy: Only one Schedule<..> template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef ScheduleType schedule_type;
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off IndexType
template<typename iType, class ... Props>
struct PolicyTraits<void, IndexType<iType>,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value,
"ExecutionPolicy: Only one IndexType<..> template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef iType index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off raw IndexType
template<typename iType, class ... Props>
struct PolicyTraits<typename std::enable_if<std::is_integral<iType>::value>::type, iType,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value,
"ExecutionPolicy: Only one IndexType<..> template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef iType index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off TagType
template<class TagType, class ... Props>
struct PolicyTraits<typename std::enable_if<!is_schedule_type<TagType>::value &&
!is_execution_space<TagType>::value &&
!is_index_type<TagType>::value &&
!std::is_integral<TagType>::value
>::type,
TagType,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::tag_type, void>::value,
"ExecutionPolicy: Only one tag type template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
typedef TagType tag_type;
};
template<class ... Props>
struct PolicyTraits {
#ifdef KOKKOS_DIRECT_VARIADIC_EXPANSION
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::execution_space>::value,
Kokkos::DefaultExecutionSpace, typename PolicyTraits<void,Props ...>::execution_space>::type execution_space;
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::schedule_type>::value,
Kokkos::Static, typename PolicyTraits<void,Props ...>::schedule_type>::type schedule_type;
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::index_type>::value,
typename execution_space::size_type, typename PolicyTraits<void,Props ...>::index_type>::type index_type;
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::tag_type>::value,
void, typename PolicyTraits<void,Props ...>::tag_type>::type work_tag;
#else
typedef typename has_condition<Kokkos::DefaultExecutionSpace,is_execution_space,Props ...>::type execution_space;
typedef typename has_condition<Kokkos::Schedule<Kokkos::Static>,is_schedule_type,Props ...>::type schedule_type;
typedef typename has_condition<void,is_tag_type,Props ...>::type work_tag;
typedef typename has_condition<typename execution_space::size_type, std::is_integral, Props ... >::type default_index_type;
typedef typename has_condition<Kokkos::IndexType<default_index_type>,is_index_type,Props ...>::type::type index_type;
#endif
};
}
}
namespace Kokkos {
/** \brief Execution policy for work over a range of an integral type. /** \brief Execution policy for work over a range of an integral type.
 *
 * Valid template argument options:
@@ -230,7 +78,9 @@ namespace Kokkos {
 * Blocking is the granularity of partitioning the range among threads.
 */
 template<class ... Properties>
-class RangePolicy: public Impl::PolicyTraits<Properties ... > {
+class RangePolicy
+  : public Impl::PolicyTraits<Properties ... >
+{
 private:
 typedef Impl::PolicyTraits<Properties ... > traits;
@@ -243,6 +93,7 @@ private:
 public:
 //! Tag this class as an execution policy
+typedef RangePolicy execution_policy;
 typedef typename traits::index_type member_type ;
 KOKKOS_INLINE_FUNCTION const typename traits::execution_space & space() const { return m_space ; }
@@ -370,6 +221,7 @@ public:
 };
 };
 } // namespace Kokkos
 //----------------------------------------------------------------------------
@@ -377,38 +229,6 @@
 namespace Kokkos {
namespace Experimental {
/** \brief Scratch memory request accepting per team and per thread value
*
* An instance of this class can be given as the last argument to a
* TeamPolicy constructor. It sets the amount of user requested shared
* memory for the team.
*/
template< class MemorySpace >
class TeamScratchRequest {
size_t m_per_team;
size_t m_per_thread;
public:
TeamScratchRequest(size_t per_team_, size_t per_thread_ = 0):
m_per_team(per_team_), m_per_thread(per_thread_) {
}
size_t per_team() const {
return m_per_team;
}
size_t per_thread() const {
return m_per_thread;
}
size_t total(const size_t team_size) const {
return m_per_team + m_per_thread * team_size;
}
};
}
 namespace Impl {
@@ -451,11 +271,9 @@ public:
 TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
-template<class MemorySpace>
-TeamPolicyInternal( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
-template<class MemorySpace>
-TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
+/* TeamPolicyInternal( int league_size_request , int team_size_request );
+   TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/
 /** \brief The actual league size (number of teams) of the policy.
 *
@@ -574,9 +392,11 @@ class TeamPolicy: public
 typedef Impl::TeamPolicyInternal<
   typename Impl::PolicyTraits<Properties ... >::execution_space,
   Properties ...> internal_policy;
 typedef Impl::PolicyTraits<Properties ... > traits;
 public:
+typedef TeamPolicy execution_policy;
 TeamPolicy& operator = (const TeamPolicy&) = default;
@@ -594,13 +414,11 @@ public:
 TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 )
   : internal_policy(league_size_request,Kokkos::AUTO(), vector_length_request) {}
-template<class MemorySpace>
-TeamPolicy( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request )
-  : internal_policy(league_size_request,team_size_request, team_scratch_memory_request) {}
-template<class MemorySpace>
-TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request )
-  : internal_policy(league_size_request,Kokkos::AUTO(), team_scratch_memory_request) {}
+/* TeamPolicy( int league_size_request , int team_size_request )
+   : internal_policy(league_size_request,team_size_request) {}
+ TeamPolicy( int league_size_request , const Kokkos::AUTO_t & )
+   : internal_policy(league_size_request,Kokkos::AUTO()) {}*/
 private:
 TeamPolicy(const internal_policy& p):internal_policy(p) {}
@ -744,6 +562,7 @@ Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(
} // namespace Kokkos } // namespace Kokkos
#endif /* #define KOKKOS_EXECPOLICY_HPP */ #endif /* #define KOKKOS_EXECPOLICY_HPP */
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
View File
@ -120,21 +120,6 @@ public:
//! This memory space preferred device_type //! This memory space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type; typedef Kokkos::Device<execution_space,memory_space> device_type;
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::HBWMallocAllocator allocator ;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Kokkos::Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
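With the non-experimental View path gone, labeled, reference-counted allocation is obtained by constructing a View rather than calling the removed allocate_and_track. A sketch (not specific to HBWSpace):

#include <Kokkos_Core.hpp>

void tracked_allocation(const size_t n) {
  // The label is associated with the block, and the allocation is
  // reference counted; it is freed when the last View referencing it dies.
  Kokkos::View<double*> data("my_label", n);
}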
/*--------------------------------*/ /*--------------------------------*/
/* Functions unique to the HBWSpace */ /* Functions unique to the HBWSpace */
static int in_parallel(); static int in_parallel();
View File
@ -55,9 +55,6 @@
#include <impl/Kokkos_Traits.hpp> #include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/KokkosExp_SharedAlloc.hpp> #include <impl/KokkosExp_SharedAlloc.hpp>
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
@ -128,25 +125,6 @@ public:
//! This memory space preferred device_type //! This memory space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type; typedef Kokkos::Device<execution_space,memory_space> device_type;
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY )
typedef Impl::PageAlignedAllocator allocator ;
#else
typedef Impl::AlignedAllocator allocator ;
#endif
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/ /*--------------------------------*/
/* Functions unique to the HostSpace */ /* Functions unique to the HostSpace */
static int in_parallel(); static int in_parallel();
View File
@ -133,11 +133,23 @@
// still identifies as 7.0
#error "Cuda version 7.5 or greater required for host-to-device Lambda support"
#endif

#if ( CUDA_VERSION < 8000 )
#define KOKKOS_LAMBDA [=]__device__
#else
#define KOKKOS_LAMBDA [=]__host__ __device__
#endif

#define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
#endif

#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */
#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
// Cuda version 8.0 still needs the functor wrapper
#if (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ )
#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
#endif
#endif
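With CUDA 8 the macro becomes a __host__ __device__ lambda, so the same KOKKOS_LAMBDA source works for host-dispatched kernels on all backends; on non-CUDA builds it is a plain [=] capture. A minimal sketch:

#include <Kokkos_Core.hpp>

void scale(Kokkos::View<double*> x, const double a) {
  // Expands to [=]__device__ (CUDA < 8.0), [=]__host__ __device__ (CUDA >= 8.0),
  // or [=] on host-only backends.
  Kokkos::parallel_for(x.dimension_0(), KOKKOS_LAMBDA(const int i) {
    x(i) *= a;
  });
}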
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
/* Language info: C++, CUDA, OPENMP */ /* Language info: C++, CUDA, OPENMP */
@ -440,27 +452,16 @@
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
-/* Transitional macro to change between old and new View,
- * default to use new View.
- */
+/* Transitional macros to change between old and new View
+ * are no longer supported.
+ */

-#if ! defined( KOKKOS_USING_EXP_VIEW )
#if defined( KOKKOS_USING_DEPRECATED_VIEW )
-#define KOKKOS_USING_EXP_VIEW 0
-#else
-#define KOKKOS_USING_EXP_VIEW 1
-#endif
-#endif
+#error "Kokkos deprecated View has been removed"
#endif

-#if KOKKOS_USING_EXP_VIEW
-#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
-#define KOKKOS_USING_EXPERIMENTAL_VIEW
-#endif
-#else /* ! KOKKOS_USING_EXP_VIEW */
-#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
-#error "KOKKOS_USING_EXP_VIEW and KOKKOS_USING_EXPERIMENAL_VIEW are both defined and are incompatible"
-#endif
-#endif
+#define KOKKOS_USING_EXP_VIEW 1
+#define KOKKOS_USING_EXPERIMENTAL_VIEW
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
File diff suppressed because it is too large.
View File
@ -58,9 +58,11 @@
#endif #endif
#include <Kokkos_ScratchSpace.hpp> #include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp> #include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_Layout.hpp> #include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp> #include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
namespace Kokkos { namespace Kokkos {
@ -177,6 +179,7 @@ struct VerifyExecutionCanAccessMemorySpace
#include <OpenMP/Kokkos_OpenMPexec.hpp> #include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp> #include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMP_Task.hpp>
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
View File
@ -125,17 +125,26 @@ struct pair
    return *this;
  }

-  /// \brief Assignment operator.
+  /// \brief Assignment operator, for volatile <tt>*this</tt>.
  ///
-  /// This calls the assignment operators of T1 and T2. It won't
-  /// compile if the assignment operators are not defined and public.
+  /// \param p [in] Input; right-hand side of the assignment.
+  ///
+  /// This calls the assignment operators of T1 and T2. It will not
+  /// compile if the assignment operators are not defined and public.
+  ///
+  /// This operator returns \c void instead of <tt>volatile pair<T1,
+  /// T2>& </tt>. See Kokkos Issue #177 for the explanation. In
+  /// practice, this means that you should not chain assignments with
+  /// volatile lvalues.
  template <class U, class V>
  KOKKOS_FORCEINLINE_FUNCTION
-  volatile pair<T1, T2> & operator=(const volatile pair<U,V> &p) volatile
+  void operator=(const volatile pair<U,V> &p) volatile
  {
    first = p.first;
    second = p.second;
-    return *this;
+    // We deliberately do not return anything here. See explanation
+    // in public documentation above.
  }
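The practical consequence of the void return is that assignment through a volatile pair can no longer be chained; a sketch:

#include <Kokkos_Pair.hpp>

void volatile_pair_assignment() {
  Kokkos::pair<int, int> a(1, 2);
  volatile Kokkos::pair<int, int> b;
  b = a;            // fine: the volatile operator= runs, returning void
  // Kokkos::pair<int, int> c;
  // c = (b = a);   // no longer compiles: (b = a) has type void (Issue #177)
}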
// from std::pair<U,V> // from std::pair<U,V>
View File
@ -57,7 +57,6 @@
#include <typeinfo> #include <typeinfo>
#endif #endif
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Tags.hpp> #include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Traits.hpp> #include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp> #include <impl/Kokkos_FunctorAdapter.hpp>
@ -178,8 +177,8 @@ void parallel_for( const ExecPolicy & policy
{ {
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0; uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID); Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
} }
#endif #endif
@ -190,8 +189,8 @@ void parallel_for( const ExecPolicy & policy
closure.execute(); closure.execute();
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID); Kokkos::Profiling::endParallelFor(kpID);
} }
#endif #endif
} }
@ -210,8 +209,8 @@ void parallel_for( const size_t work_count
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0; uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID); Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
} }
#endif #endif
@ -222,8 +221,8 @@ void parallel_for( const size_t work_count
closure.execute(); closure.execute();
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID); Kokkos::Profiling::endParallelFor(kpID);
} }
#endif #endif
} }
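Because the hooks forward the string argument, giving kernels explicit labels is what makes profiler output readable; with an empty label the mangled typeid name of the functor is reported instead. A sketch using the trailing-string overload defined above:

#include <Kokkos_Core.hpp>

void zero_fill(Kokkos::View<double*> x) {
  // "zero_fill" is what Kokkos::Profiling::beginParallelFor receives
  // when a profiling library is loaded.
  Kokkos::parallel_for(x.dimension_0(), KOKKOS_LAMBDA(const int i) {
    x(i) = 0.0;
  }, "zero_fill");
}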
@ -248,405 +247,9 @@ void parallel_for( const std::string & str
(void) str; (void) str;
} }
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/** \brief Parallel reduction
*
* Example of a parallel_reduce functor for a POD (plain old data) value type:
* \code
* class FunctorType { // For POD value type
* public:
* typedef ... execution_space ;
* typedef <podType> value_type ;
* void operator()( <intType> iwork , <podType> & update ) const ;
* void init( <podType> & update ) const ;
* void join( volatile <podType> & update ,
* volatile const <podType> & input ) const ;
*
* typedef true_type has_final ;
* void final( <podType> & update ) const ;
* };
* \endcode
*
* Example of a parallel_reduce functor for an array of POD (plain old data) values:
* \code
* class FunctorType { // For array of POD value
* public:
* typedef ... execution_space ;
* typedef <podType> value_type[] ;
* void operator()( <intType> , <podType> update[] ) const ;
* void init( <podType> update[] ) const ;
* void join( volatile <podType> update[] ,
* volatile const <podType> input[] ) const ;
*
* typedef true_type has_final ;
* void final( <podType> update[] ) const ;
* };
* \endcode
*/
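A concrete functor matching the POD interface documented above (a plain sum, with the optional final() omitted), as a sketch assuming the default execution space:

#include <Kokkos_Core.hpp>

struct SumFunctor {
  typedef Kokkos::DefaultExecutionSpace execution_space;
  typedef double value_type;

  Kokkos::View<const double*> x;

  KOKKOS_INLINE_FUNCTION
  void operator()(const int i, value_type& update) const { update += x(i); }

  KOKKOS_INLINE_FUNCTION
  void init(value_type& update) const { update = 0.0; }

  KOKKOS_INLINE_FUNCTION
  void join(volatile value_type& update, volatile const value_type& input) const {
    update += input;
  }
};
// Usage: SumFunctor f; f.x = x; double r = 0;
//        Kokkos::parallel_reduce(x.dimension_0(), f, r);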
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const std::string& str = ""
, typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
)
{
// typedef typename
// Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
// execution_space ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view ;
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
} }
// integral range policy #include <Kokkos_Parallel_Reduce.hpp>
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const std::string& str = ""
)
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view ;
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// general policy and view output
template< class ExecPolicy , class FunctorType , class ViewType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<
( Kokkos::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
#endif
)>::type * = 0 )
{
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// general policy and pod or array of pod output
template< class ExecPolicy , class FunctorType >
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
#ifdef KOKKOS_HAVE_CUDA
, typename Impl::enable_if<
( ! Impl::is_integral< ExecPolicy >::value &&
! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type>::type result_ref
, const std::string& str = ""
, typename Impl::enable_if<! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value >::type* = 0
)
#else
, typename Impl::enable_if<
( ! Impl::is_integral< ExecPolicy >::value)
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type
>::type result_ref
, const std::string& str = ""
)
#endif
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result_ref )
, ValueTraits::value_count( functor )
);
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
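From the caller's side, the unmanaged-View wrapping above is invisible: a host scalar passed by reference receives the reduced value after execute(). Sketch (host backends; the Cuda specialization is excluded by the enable_if above):

#include <Kokkos_Core.hpp>

double reduce_sum(Kokkos::View<const double*> x) {
  double sum = 0.0;
  // 'sum' is wrapped internally in View<double, HostSpace, MemoryUnmanaged>.
  Kokkos::parallel_reduce(x.dimension_0(), KOKKOS_LAMBDA(const int i, double& update) {
    update += x(i);
  }, sum);
  return sum;
}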
// integral range policy and view output
template< class FunctorType , class ViewType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<( Kokkos::is_view<ViewType>::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value
#endif
)>::type * = 0 )
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > ExecPolicy ;
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy and pod or array of pod output
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, typename Kokkos::Impl::FunctorValueTraits<
typename Impl::if_c<Impl::is_execution_policy<FunctorType>::value ||
Impl::is_integral<FunctorType>::value,
void,FunctorType>::type
, void >::reference_type result
, const std::string& str = ""
, typename Impl::enable_if< true
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value
#endif
>::type * = 0 )
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ;
typedef typename
Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef Kokkos::RangePolicy< execution_space > policy ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result )
, ValueTraits::value_count( functor )
);
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
#ifndef KOKKOS_HAVE_CUDA
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType * result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType & result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
#endif
} // namespace Kokkos
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
@ -816,8 +419,8 @@ void parallel_scan( const ExecutionPolicy & policy
{ {
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0; uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
} }
#endif #endif
@ -828,8 +431,8 @@ void parallel_scan( const ExecutionPolicy & policy
closure.execute(); closure.execute();
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelScan(kpID); Kokkos::Profiling::endParallelScan(kpID);
} }
#endif #endif
@ -849,8 +452,8 @@ void parallel_scan( const size_t work_count
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0; uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
} }
#endif #endif
@ -861,8 +464,8 @@ void parallel_scan( const size_t work_count
closure.execute(); closure.execute();
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelScan(kpID); Kokkos::Profiling::endParallelScan(kpID);
} }
#endif #endif
File diff suppressed because it is too large.
View File
@ -66,11 +66,15 @@ public:
private: private:
-  mutable char * m_iter ;
-  char * m_end ;
+  mutable char * m_iter_L0 ;
+  char * m_end_L0 ;
+  mutable char * m_iter_L1 ;
+  char * m_end_L1 ;
  mutable int m_multiplier;
  mutable int m_offset;
+  mutable int m_default_level;
ScratchMemorySpace(); ScratchMemorySpace();
ScratchMemorySpace & operator = ( const ScratchMemorySpace & ); ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
@ -95,34 +99,58 @@ public:
  template< typename IntType >
  KOKKOS_INLINE_FUNCTION
-  void* get_shmem (const IntType& size) const {
-    void* tmp = m_iter + m_offset * align (size);
-    if (m_end < (m_iter += align (size) * m_multiplier)) {
-      m_iter -= align (size) * m_multiplier; // put it back like it was
+  void* get_shmem (const IntType& size, int level = -1) const {
+    if(level == -1)
+      level = m_default_level;
+    if(level == 0) {
+      void* tmp = m_iter_L0 + m_offset * align (size);
+      if (m_end_L0 < (m_iter_L0 += align (size) * m_multiplier)) {
+        m_iter_L0 -= align (size) * m_multiplier; // put it back like it was
#ifdef KOKKOS_HAVE_DEBUG
      // mfh 23 Jun 2015: printf call consumes 25 registers
      // in a CUDA build, so only print in debug mode. The
      // function still returns NULL if not enough memory.
      printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
              "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
-             long(m_end-m_iter));
+             long(m_end_L0-m_iter_L0));
#endif // KOKKOS_HAVE_DEBUG
      tmp = 0;
    }
    return tmp;
+    } else {
+      void* tmp = m_iter_L1 + m_offset * align (size);
+      if (m_end_L1 < (m_iter_L1 += align (size) * m_multiplier)) {
+        m_iter_L1 -= align (size) * m_multiplier; // put it back like it was
+#ifdef KOKKOS_HAVE_DEBUG
+        // mfh 23 Jun 2015: printf call consumes 25 registers
+        // in a CUDA build, so only print in debug mode. The
+        // function still returns NULL if not enough memory.
+        printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
+                "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
+                long(m_end_L1-m_iter_L1));
+#endif // KOKKOS_HAVE_DEBUG
+        tmp = 0;
+      }
+      return tmp;
+    }
  }
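Callers pick the scratch level explicitly, or inherit the default level; since get_shmem still returns NULL on exhaustion, checking the result remains the caller's job. A sketch against this interface, where 'member' is a team handle whose team_shmem() returns this ScratchMemorySpace:

template <class Member>
KOKKOS_INLINE_FUNCTION
void two_level_scratch(const Member& member) {
  double* fast = (double*) member.team_shmem().get_shmem(64 * sizeof(double), 0);   // level 0
  double* big  = (double*) member.team_shmem().get_shmem(4096 * sizeof(double), 1); // level 1
  if (fast == 0 || big == 0) return; // request exceeded a per-level capacity
}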
  template< typename IntType >
  KOKKOS_INLINE_FUNCTION
-  ScratchMemorySpace( void * ptr , const IntType & size )
-    : m_iter( (char *) ptr )
-    , m_end( m_iter + size )
+  ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0 )
+    : m_iter_L0( (char *) ptr_L0 )
+    , m_end_L0( m_iter_L0 + size_L0 )
+    , m_iter_L1( (char *) ptr_L1 )
+    , m_end_L1( m_iter_L1 + size_L1 )
    , m_multiplier( 1 )
    , m_offset( 0 )
+    , m_default_level( 0 )
    {}

  KOKKOS_INLINE_FUNCTION
-  const ScratchMemorySpace& set_team_thread_mode(const int& multiplier, const int& offset) const {
+  const ScratchMemorySpace& set_team_thread_mode(const int& level, const int& multiplier, const int& offset) const {
+    m_default_level = level;
    m_multiplier = multiplier;
    m_offset = offset;
    return *this;
View File
@ -50,12 +50,17 @@
#include <cstddef> #include <cstddef>
#include <iosfwd> #include <iosfwd>
#include <Kokkos_Parallel.hpp> #include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_Layout.hpp> #include <Kokkos_Layout.hpp>
#include <Kokkos_HostSpace.hpp> #include <Kokkos_HostSpace.hpp>
#include <Kokkos_ScratchSpace.hpp> #include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp> #include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp> #include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp> #include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
#if defined( KOKKOS_HAVE_SERIAL ) #if defined( KOKKOS_HAVE_SERIAL )
@ -142,7 +147,9 @@ public:
// Init the array of locks used for arbitrarily sized atomics // Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space(); Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
} }
static int is_initialized() { return 1 ; } static int is_initialized() { return 1 ; }
@ -151,7 +158,11 @@ public:
static int concurrency() {return 1;}; static int concurrency() {return 1;};
//! Free any resources being consumed by the device. //! Free any resources being consumed by the device.
static void finalize() {} static void finalize() {
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
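The profiling interface is now brought up and torn down with the execution space, so the usual initialize/finalize pairing is what arms and disarms the hooks (when KOKKOS_ENABLE_PROFILING is set). Sketch:

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);   // backends call Kokkos::Profiling::initialize()
  {
    // kernels dispatched here are visible to a loaded profiling tool
  }
  Kokkos::finalize();               // backends call Kokkos::Profiling::finalize()
  return 0;
}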
//! Print configuration information to the given output stream. //! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool /* detail */ = false ) {} static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
@ -307,8 +318,8 @@ class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<
{ {
private: private:
-  size_t m_team_scratch_size ;
-  size_t m_thread_scratch_size ;
+  size_t m_team_scratch_size[2] ;
+  size_t m_thread_scratch_size[2] ;
int m_league_size ; int m_league_size ;
int m_chunk_size; int m_chunk_size;
@ -324,8 +335,10 @@ public:
  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
    m_league_size = p.m_league_size;
-    m_team_scratch_size = p.m_team_scratch_size;
-    m_thread_scratch_size = p.m_thread_scratch_size;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
    m_chunk_size = p.m_chunk_size;
    return *this;
  }
@ -348,15 +361,15 @@ public:
inline int team_size() const { return 1 ; } inline int team_size() const { return 1 ; }
inline int league_size() const { return m_league_size ; } inline int league_size() const { return m_league_size ; }
-  inline size_t scratch_size() const { return m_team_scratch_size + m_thread_scratch_size; }
+  inline size_t scratch_size(const int& level, int = 0) const { return m_team_scratch_size[level] + m_thread_scratch_size[level]; }
/** \brief Specify league size, request team size */ /** \brief Specify league size, request team size */
TeamPolicyInternal( execution_space & TeamPolicyInternal( execution_space &
, int league_size_request , int league_size_request
, int /* team_size_request */ , int /* team_size_request */
, int /* vector_length_request */ = 1 ) , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request ) , m_league_size( league_size_request )
, m_chunk_size ( 32 ) , m_chunk_size ( 32 )
{} {}
@ -365,8 +378,8 @@ public:
, int league_size_request , int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */ , const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 ) , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request ) , m_league_size( league_size_request )
, m_chunk_size ( 32 ) , m_chunk_size ( 32 )
{} {}
@ -374,8 +387,8 @@ public:
TeamPolicyInternal( int league_size_request TeamPolicyInternal( int league_size_request
, int /* team_size_request */ , int /* team_size_request */
, int /* vector_length_request */ = 1 ) , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request ) , m_league_size( league_size_request )
, m_chunk_size ( 32 ) , m_chunk_size ( 32 )
{} {}
@ -383,8 +396,8 @@ public:
TeamPolicyInternal( int league_size_request TeamPolicyInternal( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */ , const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 ) , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request ) , m_league_size( league_size_request )
, m_chunk_size ( 32 ) , m_chunk_size ( 32 )
{} {}
@ -401,26 +414,23 @@ public:
  /** \brief set per team scratch size for a specific level of the scratch hierarchy */
  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
-    (void) level;
    TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
+    p.m_team_scratch_size[level] = per_team.value;
    return p;
  };

  /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
-    (void) level;
    TeamPolicyInternal p = *this;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
    return p;
  };

  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
-    (void) level;
    TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
    return p;
  };
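With the sizes stored per level, requests for level 0 and level 1 compose; each call returns a modified copy of the policy, and scratch_size(level) reports team plus thread bytes for that level. Sketch on the Serial backend:

#include <Kokkos_Core.hpp>

struct TeamHello {
  typedef Kokkos::TeamPolicy<Kokkos::Serial>::member_type member_type;
  KOKKOS_INLINE_FUNCTION void operator()(const member_type&) const {}
};

void run(const int league_size) {
  Kokkos::TeamPolicy<Kokkos::Serial> policy(league_size, 1);
  Kokkos::parallel_for(
    policy.set_scratch_size(0, Kokkos::PerTeam(512))
          .set_scratch_size(1, Kokkos::PerThread(1024)),
    TeamHello());
}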
@ -489,9 +499,10 @@ public:
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
template< class FunctorType , class ... Traits > template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... > , Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Serial , Kokkos::Serial
> >
{ {
@ -499,14 +510,19 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ; typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ; typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ; const FunctorType m_functor ;
const Policy m_policy ; const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ; const pointer_type m_result_ptr ;
@ -515,15 +531,15 @@ private:
typename std::enable_if< std::is_same< TagType , void >::value >::type typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const exec( pointer_type ptr ) const
{ {
reference_type update = ValueInit::init( m_functor , ptr ); reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
const typename Policy::member_type e = m_policy.end(); const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) { for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( i , update ); m_functor( i , update );
} }
Kokkos::Impl::FunctorFinal< FunctorType , TagType >:: Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( m_functor , ptr ); final( ReducerConditional::select(m_functor , m_reducer) , ptr );
} }
template< class TagType > template< class TagType >
@ -532,15 +548,15 @@ private:
exec( pointer_type ptr ) const exec( pointer_type ptr ) const
{ {
const TagType t{} ; const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr ); reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
const typename Policy::member_type e = m_policy.end(); const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) { for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( t , i , update ); m_functor( t , i , update );
} }
Kokkos::Impl::FunctorFinal< FunctorType , TagType >:: Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( m_functor , ptr ); final( ReducerConditional::select(m_functor , m_reducer) , ptr );
} }
public: public:
@ -549,25 +565,43 @@ public:
void execute() const void execute() const
{ {
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( m_functor ) , 0 ); ( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr ); this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
} }
template< class ViewType > template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor ParallelReduce( const FunctorType & arg_functor ,
, const Policy & arg_policy const Policy & arg_policy ,
, const ViewType & arg_result ) const HostViewType & arg_result_view ,
typename std::enable_if<
Kokkos::is_view< HostViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_policy( arg_policy ) , m_policy( arg_policy )
, m_result_ptr( arg_result.ptr_on_device() ) , m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.ptr_on_device() )
{ {
static_assert( Kokkos::is_view< ViewType >::value static_assert( Kokkos::is_view< HostViewType >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View" ); , "Kokkos::Serial reduce result must be a View" );
static_assert( std::is_same< typename ViewType::memory_space static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Serial reduce result must be a View in HostSpace" );
}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value , Kokkos::HostSpace >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" ); , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
} }
}; };
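The new ReducerType template parameter lets parallel_reduce dispatch on a reducer object that supplies value_type, init/join, and result_view() in place of the functor; ReducerConditional above selects between them. A sketch, assuming a Max reducer living in Kokkos::Experimental at this revision (name and namespace not verified here):

#include <Kokkos_Core.hpp>

double max_entry(Kokkos::View<const double*> x) {
  double result = 0.0;
  Kokkos::parallel_reduce(
    Kokkos::RangePolicy<Kokkos::Serial>(0, x.dimension_0()),
    KOKKOS_LAMBDA(const int i, double& update) {
      if (x(i) > update) update = x(i);
    },
    Kokkos::Experimental::Max<double>(result)); // reducer carries the result view
  return result;
}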
@ -697,15 +731,16 @@ public:
, const Policy & arg_policy ) , const Policy & arg_policy )
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_league( arg_policy.league_size() ) , m_league( arg_policy.league_size() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) ) , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
{ } { }
}; };
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
template< class FunctorType , class ... Properties > template< class FunctorType , class ReducerType , class ... Properties >
class ParallelReduce< FunctorType class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... > , Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::Serial , Kokkos::Serial
> >
{ {
@ -714,30 +749,35 @@ private:
typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ; typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
typedef typename Policy::member_type Member ; typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ; typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ; const FunctorType m_functor ;
const int m_league ; const int m_league ;
const int m_shared ; const ReducerType m_reducer ;
pointer_type m_result_ptr ; pointer_type m_result_ptr ;
const int m_shared ;
template< class TagType > template< class TagType >
inline inline
typename std::enable_if< std::is_same< TagType , void >::value >::type typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const exec( pointer_type ptr ) const
{ {
reference_type update = ValueInit::init( m_functor , ptr ); reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) { for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( Member(ileague,m_league,m_shared) , update ); m_functor( Member(ileague,m_league,m_shared) , update );
} }
Kokkos::Impl::FunctorFinal< FunctorType , TagType >:: Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( m_functor , ptr ); final( ReducerConditional::select(m_functor , m_reducer) , ptr );
} }
template< class TagType > template< class TagType >
@ -747,14 +787,14 @@ private:
{ {
const TagType t{} ; const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr ); reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) { for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( t , Member(ileague,m_league,m_shared) , update ); m_functor( t , Member(ileague,m_league,m_shared) , update );
} }
Kokkos::Impl::FunctorFinal< FunctorType , TagType >:: Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( m_functor , ptr ); final( ReducerConditional::select(m_functor , m_reducer) , ptr );
} }
public: public:
@ -763,7 +803,7 @@ public:
void execute() const void execute() const
{ {
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( m_functor ) , m_shared ); ( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , m_shared );
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr ); this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
} }
@ -771,12 +811,16 @@ public:
template< class ViewType > template< class ViewType >
ParallelReduce( const FunctorType & arg_functor ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy , const Policy & arg_policy
, const ViewType & arg_result , const ViewType & arg_result ,
) typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_league( arg_policy.league_size() ) , m_league( arg_policy.league_size() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) ) , m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() ) , m_result_ptr( arg_result.ptr_on_device() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
{ {
static_assert( Kokkos::is_view< ViewType >::value static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View" ); , "Reduction result on Kokkos::Serial must be a Kokkos::View" );
@ -786,6 +830,21 @@ public:
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" ); , "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
} }
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
}; };
} // namespace Impl } // namespace Impl
@ -1045,6 +1104,10 @@ void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const Func
} }
} }
//----------------------------------------------------------------------------
#include <impl/Kokkos_Serial_Task.hpp>
#endif // defined( KOKKOS_HAVE_SERIAL ) #endif // defined( KOKKOS_HAVE_SERIAL )
#endif /* #define KOKKOS_SERIAL_HPP */ #endif /* #define KOKKOS_SERIAL_HPP */
View File
@ -1,4 +1,3 @@
/* /*
//@HEADER //@HEADER
// ************************************************************************ // ************************************************************************
@ -47,13 +46,655 @@
#ifndef KOKKOS_TASKPOLICY_HPP #ifndef KOKKOS_TASKPOLICY_HPP
#define KOKKOS_TASKPOLICY_HPP #define KOKKOS_TASKPOLICY_HPP
#include <Kokkos_Core_fwd.hpp> //----------------------------------------------------------------------------
#include <Kokkos_MemoryPool.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Kokkos_Core_fwd.hpp>
// If compiling with CUDA then CUDA 8 or better and relocatable device
// code are required to enable the task policy.
// nvcc relocatable device code option: --relocatable-device-code=true
#if ( defined( KOKKOS_COMPILER_NVCC ) )
#if ( 8000 <= CUDA_VERSION ) && \
defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
#define KOKKOS_ENABLE_TASKPOLICY
#endif
#else
#define KOKKOS_ENABLE_TASKPOLICY
#endif
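Client code that uses the task interface should compile away cleanly where the policy is unavailable (e.g. CUDA without relocatable device code), so call sites are worth guarding with the same macro; sketch:

void maybe_run_tasks() {
#if defined( KOKKOS_ENABLE_TASKPOLICY )
  // task-DAG code here: only compiled on non-CUDA builds, or on
  // CUDA >= 8.0 with --relocatable-device-code=true
#endif
}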
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
#include <Kokkos_MemoryPool.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_TaskQueue.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
enum TaskType { TaskTeam = Impl::TaskBase<void,void,void>::TaskTeam
, TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle };
enum TaskPriority { TaskHighPriority = 0
, TaskRegularPriority = 1
, TaskLowPriority = 2 };
template< typename Space >
class TaskPolicy ;
template< typename Space >
void wait( TaskPolicy< Space > const & );
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/*\brief Implementation data for task data management, access, and execution.
*
* CRTP Inheritance structure to allow static_cast from the
* task root type and a task's FunctorType.
*
* TaskBase< Space , ResultType , FunctorType >
* : TaskBase< Space , ResultType , void >
* , FunctorType
* { ... };
*
* TaskBase< Space , ResultType , void >
* : TaskBase< Space , void , void >
* { ... };
*/
template< typename Space , typename ResultType , typename FunctorType >
class TaskBase ;
template< typename Space >
class TaskExec ;
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
namespace Kokkos {
/**
*
* Future< space > // value_type == void
* Future< value > // space == Default
* Future< value , space >
*
*/
template< typename Arg1 /* = void */ , typename Arg2 /* = void */ >
class Future {
private:
template< typename > friend class TaskPolicy ;
template< typename , typename > friend class Future ;
template< typename , typename , typename > friend class Impl::TaskBase ;
enum { Arg1_is_space = Kokkos::Impl::is_space< Arg1 >::value };
enum { Arg2_is_space = Kokkos::Impl::is_space< Arg2 >::value };
enum { Arg1_is_value = ! Arg1_is_space &&
! std::is_same< Arg1 , void >::value };
enum { Arg2_is_value = ! Arg2_is_space &&
! std::is_same< Arg2 , void >::value };
static_assert( ! ( Arg1_is_space && Arg2_is_space )
, "Future cannot be given two spaces" );
static_assert( ! ( Arg1_is_value && Arg2_is_value )
, "Future cannot be given two value types" );
using ValueType =
typename std::conditional< Arg1_is_value , Arg1 ,
typename std::conditional< Arg2_is_value , Arg2 , void
>::type >::type ;
using Space =
typename std::conditional< Arg1_is_space , Arg1 ,
typename std::conditional< Arg2_is_space , Arg2 , void
>::type >::type ;
using task_base = Impl::TaskBase< Space , ValueType , void > ;
using queue_type = Impl::TaskQueue< Space > ;
task_base * m_task ;
KOKKOS_INLINE_FUNCTION explicit
Future( task_base * task ) : m_task(0)
{ if ( task ) queue_type::assign( & m_task , task ); }
//----------------------------------------
public:
using execution_space = typename Space::execution_space ;
using value_type = ValueType ;
//----------------------------------------
KOKKOS_INLINE_FUNCTION
bool is_null() const { return 0 == m_task ; }
KOKKOS_INLINE_FUNCTION
int reference_count() const
{ return 0 != m_task ? m_task->reference_count() : 0 ; }
//----------------------------------------
KOKKOS_INLINE_FUNCTION
~Future() { if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); }
//----------------------------------------
KOKKOS_INLINE_FUNCTION
constexpr Future() noexcept : m_task(0) {}
KOKKOS_INLINE_FUNCTION
Future( Future && rhs )
: m_task( rhs.m_task ) { rhs.m_task = 0 ; }
KOKKOS_INLINE_FUNCTION
Future( const Future & rhs )
: m_task(0)
{ if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); }
KOKKOS_INLINE_FUNCTION
Future & operator = ( Future && rhs )
{
if ( m_task ) queue_type::assign( & m_task , (task_base*)0 );
m_task = rhs.m_task ;
rhs.m_task = 0 ;
return *this ;
}
KOKKOS_INLINE_FUNCTION
Future & operator = ( const Future & rhs )
{
if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
return *this ;
}
//----------------------------------------
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future( Future<A1,A2> && rhs )
: m_task( rhs.m_task )
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
rhs.m_task = 0 ;
}
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future( const Future<A1,A2> & rhs )
: m_task(0)
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
}
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future & operator = ( const Future<A1,A2> & rhs )
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
return *this ;
}
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future & operator = ( Future<A1,A2> && rhs )
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
if ( m_task ) queue_type::assign( & m_task , (task_base*) 0 );
m_task = rhs.m_task ;
rhs.m_task = 0 ;
return *this ;
}
//----------------------------------------
KOKKOS_INLINE_FUNCTION
typename task_base::get_return_type
get() const
{
if ( 0 == m_task ) {
Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()");
}
return m_task->get();
}
};
} // namespace Kokkos
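Futures are reference-counted handles onto queue-owned tasks: default construction yields a null handle, copies bump the task's count, and get() aborts on null, so the usual pattern checks is_null() first. Sketch (Serial backend assumed):

#include <Kokkos_Core.hpp>

#if defined( KOKKOS_ENABLE_TASKPOLICY )
void inspect(const Kokkos::Future<double, Kokkos::Serial>& f) {
  if (f.is_null()) return;           // get() would abort here
  const int refs = f.reference_count();
  const double v = f.get();          // valid once the task has completed
  (void) refs; (void) v;
}
#endif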
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< typename ExecSpace >
class TaskPolicy
{
private:
using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ;
using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ;
using task_base = Impl::TaskBase< ExecSpace , void , void > ;
track_type m_track ;
queue_type * m_queue ;
//----------------------------------------
// Process optional arguments to spawn and respawn functions
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const ) {}
// TaskTeam or TaskSingle
template< typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, TaskType const & arg
, Options const & ... opts )
{
task->m_task_type = arg ;
assign( task , opts ... );
}
// TaskHighPriority or TaskRegularPriority or TaskLowPriority
template< typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, TaskPriority const & arg
, Options const & ... opts )
{
task->m_priority = arg ;
assign( task , opts ... );
}
// Future for a dependence
template< typename A1 , typename A2 , typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, Future< A1 , A2 > const & arg
, Options const & ... opts )
{
// Assign dependence to task->m_next
// which will be processed within subsequent call to schedule.
// Error if the dependence is reset.
if ( 0 != Kokkos::atomic_exchange(& task->m_next, arg.m_task) ) {
Kokkos::abort("TaskPolicy ERROR: resetting task dependence");
}
if ( 0 != arg.m_task ) {
// The future may be destroyed upon returning from this call
// so increment reference count to track this assignment.
Kokkos::atomic_fetch_add( &(arg.m_task->m_ref_count) , 1 );
}
assign( task , opts ... );
}
//----------------------------------------
public:
using execution_policy = TaskPolicy ;
using execution_space = ExecSpace ;
using memory_space = typename queue_type::memory_space ;
using member_type = Kokkos::Impl::TaskExec< ExecSpace > ;
KOKKOS_INLINE_FUNCTION
TaskPolicy() : m_track(), m_queue(0) {}
KOKKOS_INLINE_FUNCTION
TaskPolicy( TaskPolicy && rhs ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicy( TaskPolicy const & rhs ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicy & operator = ( TaskPolicy const & rhs ) = default ;
TaskPolicy( memory_space const & arg_memory_space
, unsigned const arg_memory_pool_capacity
, unsigned const arg_memory_pool_log2_superblock = 12 )
: m_track()
, m_queue(0)
{
typedef Kokkos::Experimental::Impl::SharedAllocationRecord
< memory_space , typename queue_type::Destroy >
record_type ;
record_type * record =
record_type::allocate( arg_memory_space
, "TaskQueue"
, sizeof(queue_type)
);
m_queue = new( record->data() )
queue_type( arg_memory_space
, arg_memory_pool_capacity
, arg_memory_pool_log2_superblock );
record->m_destroy.m_queue = m_queue ;
m_track.assign_allocated_record_to_uninitialized( record );
}
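// [Illustrative sketch, not part of this commit.] Constructing a policy with
// the memory-pool constructor above; the capacity value and the variable
// name "policy" are assumptions for illustration. For the OpenMP back-end
// the queue's memory_space is Kokkos::HostSpace.
//
//   Kokkos::TaskPolicy< Kokkos::OpenMP >
//     policy( Kokkos::HostSpace() , 1 << 20 /* memory pool capacity, bytes */ );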
//----------------------------------------
/**\brief Allocation size for a spawned task */
template< typename FunctorType >
KOKKOS_FUNCTION
size_t spawn_allocation_size() const
{
using task_type = Impl::TaskBase< execution_space
, typename FunctorType::value_type
, FunctorType > ;
return m_queue->allocate_block_size( sizeof(task_type) );
}
/**\brief Allocation size for a when_all aggregate */
KOKKOS_FUNCTION
size_t when_all_allocation_size( int narg ) const
{
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) );
}
//----------------------------------------
/**\brief A task spawns a task with options
*
* 1) High, Normal, or Low priority
* 2) With or without dependence
* 3) Team or Serial
*/
template< typename FunctorType , typename ... Options >
KOKKOS_FUNCTION
Future< typename FunctorType::value_type , ExecSpace >
task_spawn( FunctorType const & arg_functor
, Options const & ... arg_options
) const
{
using value_type = typename FunctorType::value_type ;
using future_type = Future< value_type , execution_space > ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
//----------------------------------------
// Give single-thread back-ends an opportunity to clear
// their queue of ready tasks before allocating a new task
m_queue->iff_single_thread_recursive_execute();
//----------------------------------------
future_type f ;
// Allocate task from memory pool
f.m_task =
reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
if ( f.m_task ) {
// Placement new construction
new ( f.m_task ) task_type( arg_functor );
// Reference count starts at two
// +1 for matching decrement when task is complete
// +1 for future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = sizeof(task_type);
assign( f.m_task , arg_options... );
// Spawning from within the execution space so the
// apply function pointer is guaranteed to be valid
f.m_task->m_apply = task_type::apply ;
m_queue->schedule( f.m_task );
// this task may be updated or executed at any moment
}
return f ;
}
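// [Illustrative sketch, not part of this commit.] From inside an executing
// task, a child may be spawned with any combination of the options handled
// by assign() above; ChildFunctor is a hypothetical functor with a nested
// value_type, and the option names are those noted in the comments above.
//
//   Future< ChildFunctor::value_type , ExecSpace > child =
//     policy.task_spawn( ChildFunctor() , TaskSingle , TaskHighPriority );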
/**\brief The host process spawns a task with options
*
* 1) High, Normal, or Low priority
* 2) With or without dependence
* 3) Team or Serial
*/
template< typename FunctorType , typename ... Options >
inline
Future< typename FunctorType::value_type , ExecSpace >
host_spawn( FunctorType const & arg_functor
, Options const & ... arg_options
) const
{
using value_type = typename FunctorType::value_type ;
using future_type = Future< value_type , execution_space > ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
future_type f ;
// Allocate task from memory pool
f.m_task =
reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) );
if ( f.m_task ) {
// Placement new construction
new( f.m_task ) task_type( arg_functor );
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = sizeof(task_type);
assign( f.m_task , arg_options... );
// Potentially spawning outside execution space so the
// apply function pointer must be obtained from execution space.
// Required for Cuda execution space function pointer.
queue_type::specialization::template
proc_set_apply< FunctorType >( & f.m_task->m_apply );
m_queue->schedule( f.m_task );
}
return f ;
}
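// [Illustrative sketch, not part of this commit.] A typical host-side round
// trip, assuming a hypothetical MyFunctor with value_type = double and an
// operator()( member_type & , double & ):
//
//   Future< double , Kokkos::OpenMP > f = policy.host_spawn( MyFunctor() );
//   Kokkos::wait( policy );   // drain the queue (wait is defined below)
//   double result = f.get();  // aborts if f is null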
/**\brief Return a future that is complete
* when all input futures are complete.
*/
template< typename A1 , typename A2 >
KOKKOS_FUNCTION
Future< ExecSpace >
when_all( int narg , Future< A1 , A2 > const * const arg ) const
{
static_assert
( std::is_same< execution_space
, typename Future< A1 , A2 >::execution_space
>::value
, "Future must have same execution space" );
using future_type = Future< ExecSpace > ;
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
future_type f ;
size_t const size = sizeof(task_base) + narg * sizeof(task_base*);
f.m_task =
reinterpret_cast< task_base * >( m_queue->allocate( size ) );
if ( f.m_task ) {
new( f.m_task ) task_base();
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = size ;
f.m_task->m_dep_count = narg ;
f.m_task->m_task_type = task_base::Aggregate ;
task_base ** const dep = f.m_task->aggregate_dependences();
// Assign dependences to increment their reference count
// The futures may be destroyed upon returning from this call
// so increment reference count to track this assignment.
for ( int i = 0 ; i < narg ; ++i ) {
task_base * const t = dep[i] = arg[i].m_task ;
if ( 0 != t ) {
Kokkos::atomic_fetch_add( &(t->m_ref_count) , 1 );
}
}
m_queue->schedule( f.m_task );
// this when_all may be processed at any moment
}
return f ;
}
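// [Illustrative sketch, not part of this commit.] Aggregating two futures
// and spawning a continuation that runs only after both complete; fa, fb
// and Continuation are hypothetical.
//
//   Future< double , Kokkos::OpenMP > prereq[2] = { fa , fb };
//   Future< Kokkos::OpenMP > all = policy.when_all( 2 , prereq );
//   policy.host_spawn( Continuation() , all );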
/**\brief An executing task respawns itself with options
*
* 1) High, Normal, or Low priority
* 2) With or without dependence
*/
template< class FunctorType , typename ... Options >
KOKKOS_FUNCTION
void respawn( FunctorType * task_self
, Options const & ... arg_options ) const
{
using value_type = typename FunctorType::value_type ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
task_base * const zero = (task_base *) 0 ;
task_base * const lock = (task_base *) task_base::LockTag ;
task_type * const task = static_cast< task_type * >( task_self );
// Precondition:
// task is in Executing state
// therefore m_next == LockTag
//
// Change to m_next == 0 for no dependence
if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
Kokkos::abort("TaskPolicy::respawn ERROR: already respawned");
}
assign( task , arg_options... );
// Postcondition:
// task is in Executing-Respawn state
//   therefore m_next == dependence or 0
}
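// [Illustrative sketch, not part of this commit.] Inside a functor's
// operator(), re-scheduling the current task to run again once a
// hypothetical future 'dep' completes:
//
//   if ( ! done ) { m_policy.respawn( this , dep ); return ; }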
//----------------------------------------
template< typename S >
friend
void Kokkos::wait( Kokkos::TaskPolicy< S > const & );
//----------------------------------------
inline
int allocation_capacity() const noexcept
{ return m_queue->m_memory.get_mem_size(); }
KOKKOS_INLINE_FUNCTION
int allocated_task_count() const noexcept
{ return m_queue->m_count_alloc ; }
KOKKOS_INLINE_FUNCTION
int allocated_task_count_max() const noexcept
{ return m_queue->m_max_alloc ; }
KOKKOS_INLINE_FUNCTION
long allocated_task_count_accum() const noexcept
{ return m_queue->m_accum_alloc ; }
};
template< typename ExecSpace >
inline
void wait( TaskPolicy< ExecSpace > const & policy )
{ policy.m_queue->execute(); }
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
@@ -463,5 +1104,6 @@ void wait( TaskPolicy< ExecSpace > & );
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
-#endif /* #define KOKKOS_TASKPOLICY_HPP */
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_TASKPOLICY_HPP */


@@ -211,6 +211,8 @@ struct VerifyExecutionCanAccessMemorySpace
 #include <Threads/Kokkos_ThreadsTeam.hpp>
 #include <Threads/Kokkos_Threads_Parallel.hpp>
+#include <KokkosExp_MDRangePolicy.hpp>
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------

File diff suppressed because it is too large


@@ -178,9 +178,10 @@ public:
 namespace Kokkos {
 namespace Impl {
-template< class FunctorType , class ... Traits >
+template< class FunctorType , class ReducerType, class ... Traits >
 class ParallelReduce< FunctorType
                     , Kokkos::RangePolicy< Traits ...>
+                    , ReducerType
                     , Kokkos::OpenMP
                     >
 {
@@ -192,15 +193,21 @@ private:
   typedef typename Policy::WorkRange    WorkRange ;
   typedef typename Policy::member_type  Member ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
-  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType, WorkTag > ValueJoin ;
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  // Static Assert WorkTag void if ReducerType not InvalidType
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ValueJoin ;
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
   const FunctorType   m_functor ;
   const Policy        m_policy ;
+  const ReducerType   m_reducer ;
   const pointer_type  m_result_ptr ;
   template< class TagType >
@@ -252,7 +259,7 @@ public:
     OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
     OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
-    OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
+    OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
 #pragma omp parallel
     {
@@ -260,7 +267,7 @@ public:
       const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
       ParallelReduce::template exec_range< WorkTag >
         ( m_functor , range.begin() , range.end()
-        , ValueInit::init( m_functor , exec.scratch_reduce() ) );
+        , ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) );
     }
 /* END #pragma omp parallel */
@@ -269,13 +276,13 @@ public:
     const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
     for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-      ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+      ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
     }
-    Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
+    Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
     if ( m_result_ptr ) {
-      const int n = ValueTraits::value_count( m_functor );
+      const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
       for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
     }
@@ -289,7 +296,7 @@ public:
     OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
     OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
-    OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
+    OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
 #pragma omp parallel
     {
@@ -302,7 +309,7 @@ public:
       long work_index = exec.get_work_index();
-      reference_type update = ValueInit::init( m_functor , exec.scratch_reduce() );
+      reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() );
       while(work_index != -1) {
         const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
         const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
@@ -319,13 +326,13 @@ public:
     const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
     for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-      ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+      ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
     }
-    Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
+    Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
     if ( m_result_ptr ) {
-      const int n = ValueTraits::value_count( m_functor );
+      const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
       for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
     }
@@ -337,18 +344,35 @@ public:
   inline
   ParallelReduce( const FunctorType & arg_functor
                 , Policy       arg_policy
-                , const ViewType & arg_result_view )
+                , const ViewType & arg_result_view
+                , typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
     : m_functor( arg_functor )
     , m_policy(  arg_policy )
-    , m_result_ptr(  arg_result_view.ptr_on_device() )
+    , m_reducer( InvalidType() )
+    , m_result_ptr(  arg_result_view.data() )
     {
-      static_assert( Kokkos::is_view< ViewType >::value
-        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View" );
-      static_assert( std::is_same< typename ViewType::memory_space
+      /*static_assert( std::is_same< typename ViewType::memory_space
                       , Kokkos::HostSpace >::value
-        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
     }
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
 };
 } // namespace Impl
@@ -568,13 +592,13 @@ public:
     const size_t team_reduce_size = Policy::member_type::team_reduce_size();
-    OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size );
+    OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1));
 #pragma omp parallel
     {
       ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type>
         ( m_functor
-        , Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size) );
+        , Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) );
     }
 /* END #pragma omp parallel */
   }
@@ -584,14 +608,15 @@ public:
                 , const Policy      & arg_policy )
     : m_functor( arg_functor )
     , m_policy(  arg_policy )
-    , m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
     {}
 };
-template< class FunctorType , class ... Properties >
+template< class FunctorType , class ReducerType, class ... Properties >
 class ParallelReduce< FunctorType
                     , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
                     , Kokkos::OpenMP
                     >
 {
@@ -602,15 +627,19 @@ private:
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::member_type  Member ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag > ValueInit ;
-  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , WorkTag > ValueJoin ;
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd , WorkTag > ValueJoin ;
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
   const FunctorType  m_functor ;
   const Policy       m_policy ;
+  const ReducerType  m_reducer ;
   const pointer_type m_result_ptr ;
   const int          m_shmem_size ;
@@ -644,7 +673,7 @@ public:
     const size_t team_reduce_size = Policy::member_type::team_reduce_size();
-    OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , team_reduce_size + m_shmem_size );
+    OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size );
 #pragma omp parallel
     {
@@ -652,8 +681,8 @@ public:
       ParallelReduce::template exec_team< WorkTag >
         ( m_functor
-        , Member( exec , m_policy , m_shmem_size )
-        , ValueInit::init( m_functor , exec.scratch_reduce() ) );
+        , Member( exec , m_policy , m_shmem_size, 0 )
+        , ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) );
     }
 /* END #pragma omp parallel */
@@ -665,13 +694,13 @@ public:
     max_active_threads = m_policy.league_size()* m_policy.team_size();
     for ( int i = 1 ; i < max_active_threads ; ++i ) {
-      ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+      ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
     }
-    Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
+    Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
     if ( m_result_ptr ) {
-      const int n = ValueTraits::value_count( m_functor );
+      const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
       for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
     }
@@ -682,12 +711,33 @@ public:
   inline
   ParallelReduce( const FunctorType & arg_functor ,
                   const Policy      & arg_policy ,
-                  const ViewType    & arg_result )
+                  const ViewType    & arg_result ,
+                  typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
     : m_functor( arg_functor )
     , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
    , m_result_ptr( arg_result.ptr_on_device() )
-    , m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
     {}
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
 };
 } // namespace Impl
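// [Illustrative sketch, not part of this commit.] The ReducerConditional
// typedef introduced above selects, at compile time, whether the functor or
// the reducer supplies the reduction's init/join/final operations. A minimal
// analogue of the assumed Kokkos::Impl::if_c facility:
//
//   template< bool Cond , class T , class F >
//   struct if_c {
//     typedef F type ;
//     static const F & select( const T & , const F & f ) { return f ; }
//   };
//   template< class T , class F >
//   struct if_c< true , T , F > {
//     typedef T type ;
//     static const T & select( const T & t , const F & ) { return t ; }
//   };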


@@ -0,0 +1,329 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::OpenMP > ;
//----------------------------------------------------------------------------
TaskExec< Kokkos::OpenMP >::
TaskExec()
: m_self_exec( 0 )
, m_team_exec( 0 )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( 0 )
, m_team_rank( 0 )
, m_team_size( 1 )
{
}
TaskExec< Kokkos::OpenMP >::
TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size )
: m_self_exec( & arg_exec )
, m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
, m_team_rank( arg_exec.pool_rank_rev() % arg_team_size )
, m_team_size( arg_team_size )
{
// This team spans
// m_self_exec->pool_rev( team_size * group_rank )
// m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
sync[0] = int64_t(0) ;
sync[1] = int64_t(0) ;
for ( int i = 0 ; i < m_team_size ; ++i ) {
m_sync_value |= int64_t(1) << (8*i);
m_sync_mask |= int64_t(3) << (8*i);
}
Kokkos::memory_fence();
}
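// [Worked illustration, not part of this commit.] Each team member owns one
// byte in the 64-bit sync words, which is why team_size <= 8 is required.
// For team_size = 4 the loop above yields
//
//   m_sync_value = 0x01010101   // expected first-arrival pattern, 0x01 per byte
//   m_sync_mask  = 0x03030303   // two bits reserved per member
//
// and the XOR with m_sync_mask in team_barrier_impl() alternates each
// member's expected arrival byte between 0x01 and 0x02 on successive steps.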
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const
{
if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small");
}
// Use team shared memory to synchronize.
// Alternate memory locations between barriers to avoid a sequence
// of barriers overtaking one another.
int64_t volatile * const sync =
((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
// This team member sets one byte within the sync variable
int8_t volatile * const sync_self =
((int8_t *) sync) + m_team_rank ;
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
*sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
while ( m_sync_value != *sync ); // wait for team to arrive
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
++m_sync_step ;
if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
m_sync_value ^= m_sync_mask ;
if ( 1000 < m_sync_step ) m_sync_step = 0 ;
}
}
#endif
//----------------------------------------------------------------------------
void TaskQueueSpecialization< Kokkos::OpenMP >::execute
( TaskQueue< Kokkos::OpenMP > * const queue )
{
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using PoolExec = Kokkos::Impl::OpenMPexec ;
using Member = TaskExec< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
// Required: team_size <= 8
const int team_size = PoolExec::pool_size(2); // Threads per core
// const int team_size = PoolExec::pool_size(1); // Threads per NUMA
if ( 8 < team_size ) {
Kokkos::abort("TaskQueue<OpenMP> unsupported team size");
}
#pragma omp parallel
{
PoolExec & self = *PoolExec::get_thread_omp();
Member single_exec ;
Member team_exec( self , team_size );
// Team shared memory
task_root_type * volatile * const task_shared =
(task_root_type **) team_exec.m_team_exec->scratch_thread();
// Barrier across entire OpenMP thread pool to ensure initialization
#pragma omp barrier
// Loop until all queues are empty and no tasks in flight
do {
task_root_type * task = 0 ;
// Each team lead attempts to acquire either a thread team task
// or a single thread task for the team.
if ( 0 == team_exec.team_rank() ) {
task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
}
// Team lead broadcast acquired task to team members:
if ( 1 < team_exec.team_size() ) {
if ( 0 == team_exec.team_rank() ) *task_shared = task ;
// Fence to be sure task_shared is stored before the barrier
Kokkos::memory_fence();
// Whole team waits for every team member to reach this statement
team_exec.team_barrier();
// Fence to be sure task_shared is stored
Kokkos::memory_fence();
task = *task_shared ;
}
#if 0
fprintf( stdout
, "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
, team_exec.m_group_rank
, team_exec.m_team_rank
, uintptr_t(task_shared)
, uintptr_t(task)
);
fflush(stdout);
#endif
if ( 0 == task ) break ; // 0 == m_ready_count
if ( end == task ) {
// All team members wait for the whole team to reach this statement.
// This is necessary to prevent task_shared from being updated
// before it has been read by every thread.
team_exec.team_barrier();
}
else if ( task_root_type::TaskTeam == task->m_task_type ) {
// Thread Team Task
(*task->m_apply)( task , & team_exec );
// The m_apply function performs a barrier
if ( 0 == team_exec.team_rank() ) {
// team member #0 completes the task, which may delete the task
queue->complete( task );
}
}
else {
// Single Thread Task
if ( 0 == team_exec.team_rank() ) {
(*task->m_apply)( task , & single_exec );
queue->complete( task );
}
// All team members wait for the whole team to reach this statement.
// This is not needed to complete the task, but it is necessary to
// prevent task_shared from being updated before it has been read
// by every thread.
team_exec.team_barrier();
}
} while(1);
}
// END #pragma omp parallel
}
void TaskQueueSpecialization< Kokkos::OpenMP >::
iff_single_thread_recursive_execute
( TaskQueue< Kokkos::OpenMP > * const queue )
{
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
if ( 1 == omp_get_num_threads() ) {
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec ;
task_root_type * task = end ;
do {
task = end ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
if ( end == task ) break ;
(*task->m_apply)( task , & single_exec );
queue->complete( task );
} while(1);
}
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */


@@ -0,0 +1,356 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
#define KOKKOS_IMPL_OPENMP_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class TaskQueueSpecialization< Kokkos::OpenMP >
{
public:
using execution_space = Kokkos::OpenMP ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
// Must specify memory space
using memory_space = Kokkos::HostSpace ;
static
void iff_single_thread_recursive_execute( queue_type * const );
// Must provide task queue execution function
static void execute( queue_type * const );
// Must provide mechanism to set function pointer in
// execution space from the host process.
template< typename FunctorType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::OpenMP
, typename FunctorType::value_type
, FunctorType
> ;
*ptr = TaskType::apply ;
}
};
extern template class TaskQueue< Kokkos::OpenMP > ;
//----------------------------------------------------------------------------
template<>
class TaskExec< Kokkos::OpenMP >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
using PoolExec = Kokkos::Impl::OpenMPexec ;
friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ;
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
int64_t m_sync_mask ;
int64_t mutable m_sync_value ;
int mutable m_sync_step ;
int m_group_rank ; ///< Which "team" subset of thread pool
int m_team_rank ; ///< Which thread within a team
int m_team_size ;
TaskExec();
TaskExec( PoolExec & arg_exec , int arg_team_size );
void team_barrier_impl() const ;
public:
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void * team_shared() const
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
int team_shared_size() const
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
/**\brief Whole team enters this function call
*         before any team member returns from
* this function call.
*/
void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
#else
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
#endif
KOKKOS_INLINE_FUNCTION
int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION
int team_size() const { return m_team_size ; }
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
TeamThreadRange
( Impl::TaskExec< Kokkos::OpenMP > & thread
, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
TeamThreadRange
( Impl::TaskExec< Kokkos::OpenMP > & thread
, const iType & start
, const iType & end )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,start,end);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries
, const Lambda& lambda
)
{
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
}
}
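// [Illustrative usage sketch, not part of this commit.] Inside a team task's
// operator(), the construct above distributes i = 0..n-1 across the team;
// member, n, a, x and y are hypothetical.
//
//   Kokkos::parallel_for( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( const int i ) { y[i] += a * x[i] ; } );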
template<typename iType, class Lambda, typename ValueType>
KOKKOS_INLINE_FUNCTION
void parallel_reduce
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries
, const Lambda& lambda
, ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
if ( 1 < loop_boundaries.thread.team_size() ) {
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
loop_boundaries.thread.team_barrier();
shared[team_rank] = result;
loop_boundaries.thread.team_barrier();
// reduce across threads to thread 0
if (team_rank == 0) {
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
shared[0] += shared[i];
}
}
loop_boundaries.thread.team_barrier();
// broadcast result
initialized_result = shared[0];
}
else {
initialized_result = result ;
}
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
if ( 1 < loop_boundaries.thread.team_size() ) {
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
loop_boundaries.thread.team_barrier();
shared[team_rank] = result;
loop_boundaries.thread.team_barrier();
// reduce across threads to thread 0
if (team_rank == 0) {
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
join(shared[0], shared[i]);
}
}
loop_boundaries.thread.team_barrier();
// broadcast result
initialized_result = shared[0];
}
else {
initialized_result = result ;
}
}
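// [Illustrative usage sketch, not part of this commit.] The JoinType
// overload supports non-sum reductions, e.g. a team-wide maximum;
// member, n and v are hypothetical.
//
//   double team_max = 0.0 ;
//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( const int i , double & m ) { if ( m < v[i] ) m = v[i] ; } ,
//     []( double & dst , const double & src ) { if ( dst < src ) dst = src ; } ,
//     team_max );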
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
}
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, local_total;
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
int team_size = loop_boundaries.thread.team_size();
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
// Intra-member scan
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
shared[team_rank] = accum;
loop_boundaries.thread.team_barrier();
// Member 0 performs the scan over the accumulated totals
if (team_rank == 0) {
for( iType i = 1; i < team_size; i+=1) {
shared[i] += shared[i-1];
}
accum = 0; // Member 0 sets accum to 0 in preparation for the inter-member scan
}
loop_boundaries.thread.team_barrier();
// Inter-member scan adding in accumulated totals
if (team_rank != 0) { accum = shared[team_rank-1]; }
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
}
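// [Illustrative usage sketch, not part of this commit.] The scan lambda is
// invoked as lambda(i,val,final): it must add member i's contribution to
// val, and write results only when final is true. Note ValueType is the
// leading template parameter and is not deduced, so it is given explicitly.
// An exclusive prefix sum; member, n, in and out are hypothetical:
//
//   Kokkos::parallel_scan< int >( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( const int i , int & val , const bool final ) {
//       if ( final ) { out[i] = val ; }
//       val += in[i] ;
//     } );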
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda)
{
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */


@@ -49,6 +49,7 @@
 #include <impl/Kokkos_Error.hpp>
 #include <iostream>
 #include <impl/Kokkos_CPUDiscovery.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
 #ifdef KOKKOS_HAVE_OPENMP
@@ -85,16 +86,8 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
 int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
-#if ! KOKKOS_USING_EXP_VIEW
-OpenMPexec::Pool OpenMPexec::m_pool;
-#else
 OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
-#endif
 void OpenMPexec::verify_is_process( const char * const label )
 {
   if ( omp_in_parallel() ) {
@@ -125,16 +118,12 @@ void OpenMPexec::clear_scratch()
 #pragma omp parallel
 {
   const int rank_rev = m_map_rank[ omp_get_thread_num() ];
-#if KOKKOS_USING_EXP_VIEW
   typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
   if ( m_pool[ rank_rev ] ) {
     Record * const r = Record::get_record( m_pool[ rank_rev ] );
     m_pool[ rank_rev ] = 0 ;
     Record::decrement( r );
   }
-#else
-  m_pool.at(rank_rev).clear();
-#endif
 }
 /* END #pragma omp parallel */
 }
@@ -172,8 +161,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
   const int rank_rev = m_map_rank[ omp_get_thread_num() ];
   const int rank     = pool_size - ( rank_rev + 1 );
-#if KOKKOS_USING_EXP_VIEW
   typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
   Record * const r = Record::allocate( Kokkos::HostSpace()
@@ -184,15 +171,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
   m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
-#else
-#pragma omp critical
-  {
-    m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
-  }
-#endif
   new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
 }
 /* END #pragma omp parallel */
@@ -330,6 +308,10 @@ void OpenMP::initialize( unsigned thread_count ,
   }
   // Init the array for used for arbitrarily sized atomics
   Impl::init_lock_array_host_space();
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
 }
 //----------------------------------------------------------------------------
@@ -350,6 +332,10 @@ void OpenMP::finalize()
   if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
     hwloc::unbind_this_thread();
   }
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
 }
 //----------------------------------------------------------------------------


@@ -46,7 +46,6 @@
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_spinwait.hpp>
-#include <impl/Kokkos_AllocationTracker.hpp>
 #include <Kokkos_Atomic.hpp>
 #include <iostream>
@@ -63,38 +62,10 @@ public:
   enum { MAX_THREAD_COUNT = 4096 };
-#if ! KOKKOS_USING_EXP_VIEW
-  struct Pool
-  {
-    Pool() : m_trackers() {}
-    AllocationTracker m_trackers[ MAX_THREAD_COUNT ];
-    OpenMPexec * operator[](int i)
-    {
-      return reinterpret_cast<OpenMPexec *>(m_trackers[i].alloc_ptr());
-    }
-    AllocationTracker & at(int i)
-    {
-      return m_trackers[i];
-    }
-  };
-private:
-  static Pool m_pool; // Indexed by: m_pool_rank_rev
-#else
 private:
   static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
-#endif
   static int          m_pool_topo[ 4 ];
   static int          m_map_rank[ MAX_THREAD_COUNT ];
@@ -145,6 +116,12 @@ public:
   inline long team_work_index() const { return m_team_work_index ; }
+  inline int scratch_reduce_size() const
+    { return m_scratch_reduce_end - m_scratch_exec_end ; }
+  inline int scratch_thread_size() const
+    { return m_scratch_thread_end - m_scratch_reduce_end ; }
   inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
   inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
@@ -157,15 +134,15 @@ public:
   ~OpenMPexec() {}
-  OpenMPexec( const int poolRank
-            , const int scratch_exec_size
-            , const int scratch_reduce_size
-            , const int scratch_thread_size )
-    : m_pool_rank( poolRank )
-    , m_pool_rank_rev( pool_size() - ( poolRank + 1 ) )
-    , m_scratch_exec_end( scratch_exec_size )
-    , m_scratch_reduce_end( m_scratch_exec_end + scratch_reduce_size )
-    , m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size )
+  OpenMPexec( const int arg_poolRank
+            , const int arg_scratch_exec_size
+            , const int arg_scratch_reduce_size
+            , const int arg_scratch_thread_size )
+    : m_pool_rank( arg_poolRank )
+    , m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) )
+    , m_scratch_exec_end( arg_scratch_exec_size )
+    , m_scratch_reduce_end( m_scratch_exec_end + arg_scratch_reduce_size )
+    , m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size )
     , m_barrier_state(0)
     {}
@@ -330,7 +307,7 @@ public:
   Impl::OpenMPexec    & m_exec ;
   scratch_memory_space  m_team_shared ;
-  int                   m_team_shmem ;
+  int                   m_team_scratch_size[2] ;
   int                   m_team_base_rev ;
   int                   m_team_rank_rev ;
   int                   m_team_rank ;
@@ -378,15 +355,15 @@ public:
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& team_shmem() const
-    { return m_team_shared.set_team_thread_mode(1,0) ; }
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& team_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(1,0) ; }
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& thread_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; }
+    { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
   KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
   KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
@@ -568,11 +545,12 @@ public:
   inline
   OpenMPexecTeamMember( Impl::OpenMPexec & exec
                       , const TeamPolicyInternal< OpenMP, Properties ...> & team
-                      , const int shmem_size
+                      , const int shmem_size_L1
+                      , const int shmem_size_L2
                       )
     : m_exec( exec )
     , m_team_shared(0,0)
-    , m_team_shmem( shmem_size )
+    , m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
    , m_team_base_rev(0)
    , m_team_rank_rev(0)
    , m_team_rank(0)
@@ -580,7 +558,7 @@ public:
    , m_league_rank(0)
    , m_league_end(0)
    , m_league_size( team.league_size() )
-    , m_chunk_size( team.chunk_size() )
+    , m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() )
    , m_league_chunk_end(0)
    , m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) ))
    , m_team_alloc( team.team_alloc())
@@ -589,10 +567,9 @@ public:
     const int pool_team_rank_rev   = pool_rank_rev % team.team_alloc();
     const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
     const int pool_num_teams       = OpenMP::thread_pool_size(0)/team.team_alloc();
-    const int chunk_size           = team.chunk_size()>0?team.chunk_size():team.team_iter();
-    const int chunks_per_team      = ( team.league_size() + chunk_size*pool_num_teams-1 ) / (chunk_size*pool_num_teams);
-    int league_iter_end            = team.league_size() - pool_league_rank_rev * chunks_per_team * chunk_size;
-    int league_iter_begin          = league_iter_end - chunks_per_team * chunk_size;
+    const int chunks_per_team      = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams);
+    int league_iter_end            = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size;
+    int league_iter_begin          = league_iter_end - chunks_per_team * m_chunk_size;
     if (league_iter_begin < 0)                league_iter_begin = 0;
     if (league_iter_end>team.league_size())   league_iter_end   = team.league_size();
@@ -611,7 +588,9 @@ public:
       m_team_rank     = m_team_size - ( m_team_rank_rev + 1 );
       m_league_end    = league_iter_end ;
       m_league_rank   = league_iter_begin ;
-      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
+      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
+                                           ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
+                                           0 );
     }
     if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
@@ -627,10 +606,13 @@ public:
   void next_static()
     {
-      if ( ++m_league_rank < m_league_end ) {
+      if ( m_league_rank < m_league_end ) {
        team_barrier();
-        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
+        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
+                                             ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
+                                             0);
      }
+      m_league_rank++;
    }
   bool valid_dynamic() {
@@ -661,10 +643,13 @@ public:
     if(m_invalid_thread)
       return;
+    if ( m_league_rank < m_league_chunk_end ) {
       team_barrier();
-    if ( ++m_league_rank < m_league_chunk_end ) {
-      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
+      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
+                                           ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
+                                           0);
     }
+    m_league_rank++;
   }
   static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
@@ -687,8 +672,10 @@ public:
     m_team_size = p.m_team_size;
     m_team_alloc = p.m_team_alloc;
     m_team_iter = p.m_team_iter;
-    m_team_scratch_size = p.m_team_scratch_size;
-    m_thread_scratch_size = p.m_thread_scratch_size;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
     m_chunk_size = p.m_chunk_size;
     return *this;
   }
@@ -719,8 +706,8 @@ private:
   int m_team_alloc ;
   int m_team_iter ;
-  size_t m_team_scratch_size;
-  size_t m_thread_scratch_size;
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
   int m_chunk_size;
@@ -753,15 +740,19 @@ public:
   inline int team_size() const { return m_team_size ; }
   inline int league_size() const { return m_league_size ; }
-  inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; }
+  inline size_t scratch_size(const int& level, int team_size_ = -1) const {
+    if(team_size_ < 0)
+      team_size_ = m_team_size;
+    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
+  }
   /** \brief  Specify league size, request team size */
   TeamPolicyInternal( typename traits::execution_space &
                     , int league_size_request
                     , int team_size_request
                     , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
     , m_chunk_size(0)
     { init( league_size_request , team_size_request ); }
@@ -769,24 +760,24 @@ public:
                     , int league_size_request
                     , const Kokkos::AUTO_t & /* team_size_request */
                     , int /* vector_length_request */ = 1)
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
     , m_chunk_size(0)
     { init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
   TeamPolicyInternal( int league_size_request
                     , int team_size_request
                     , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
    , m_chunk_size(0)
    { init( league_size_request , team_size_request ); }
   TeamPolicyInternal( int league_size_request
                     , const Kokkos::AUTO_t & /* team_size_request */
                     , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
    , m_chunk_size(0)
    { init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
@@ -803,24 +794,21 @@ public:
   }
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
+    p.m_team_scratch_size[level] = per_team.value;
     return p;
   };
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
     return p;
   };
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
-    (void) level;
     TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value; p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size = per_thread.value; p.m_thread_scratch_size[level] = per_thread.value;
return p; return p;
}; };
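The per-level set_scratch_size overloads above replace the old single scalar request. A minimal usage sketch, assuming the public Kokkos::PerTeam / Kokkos::PerThread helpers of this era forward to the PerTeamValue / PerThreadValue arguments shown in the diff; league_size, team_size and functor are placeholders:

// Request 4 KB of team scratch at level 0 and 512 B per thread at level 1.
// Each call returns a modified copy of the policy, so the calls chain.
auto policy = Kokkos::TeamPolicy< Kokkos::Qthread >( league_size , team_size )
                .set_scratch_size( 0 , Kokkos::PerTeam( 4096 ) )
                .set_scratch_size( 1 , Kokkos::PerThread( 512 ) );
Kokkos::parallel_for( policy , functor );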

View File

@ -104,7 +104,7 @@ namespace Kokkos {
int Qthread::is_initialized() int Qthread::is_initialized()
{ {
Impl::s_number_workers != 0 ; return Impl::s_number_workers != 0 ;
} }
int Qthread::concurrency() int Qthread::concurrency()

View File

@ -145,11 +145,13 @@ public:
//---------------------------------------- //----------------------------------------
/** Reduce across all workers participating in the 'exec_all' */ /** Reduce across all workers participating in the 'exec_all' */
template< class FunctorType , class ArgTag > template< class FunctorType , class ReducerType , class ArgTag >
inline inline
void exec_all_reduce( const FunctorType & func ) const void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
{ {
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 ); const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
@ -160,7 +162,7 @@ public:
Impl::spinwait( fan.m_worker_state , QthreadExec::Active ); Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
ValueJoin::join( func , m_scratch_alloc , fan.m_scratch_alloc ); ValueJoin::join( ReducerConditional::select(func , reduce) , m_scratch_alloc , fan.m_scratch_alloc );
} }
if ( rev_rank ) { if ( rev_rank ) {
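exec_all_reduce now joins with whichever object actually defines the reduction. A standalone sketch of the if_c dispatch it leans on; this analogue is written here for illustration and only approximates the real Kokkos::Impl::if_c:

template< bool Cond , class T , class F > struct if_c ;
template< class T , class F > struct if_c< true , T , F > {
  typedef T type ;
  static const T & select( const T & t , const F & ) { return t ; }
};
template< class T , class F > struct if_c< false , T , F > {
  typedef F type ;
  static const F & select( const T & , const F & f ) { return f ; }
};
// With ReducerType == InvalidType the condition is true, so select()
// returns the functor; otherwise it returns the reducer. Either way
// ValueJoin::join receives the object that defines join().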

View File

@ -130,9 +130,10 @@ public:
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
template< class FunctorType , class ... Traits > template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... > , Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Qthread , Kokkos::Qthread
> >
{ {
@ -141,17 +142,23 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ; typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ; typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; typedef typename ReducerConditional::type ReducerTypeFwd;
// Static assert that WorkTag is void if ReducerType is not InvalidType
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ; const FunctorType m_functor ;
const Policy m_policy ; const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ; const pointer_type m_result_ptr ;
template< class TagType > template< class TagType >
@ -187,9 +194,10 @@ private:
ParallelReduce::template exec_range< WorkTag >( ParallelReduce::template exec_range< WorkTag >(
self.m_functor, range.begin(), range.end(), self.m_functor, range.begin(), range.end(),
ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) ); ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer)
, exec.exec_all_reduce_value() ) );
exec.template exec_all_reduce<FunctorType, WorkTag >( self.m_functor ); exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
} }
public: public:
@ -197,26 +205,39 @@ public:
inline inline
void execute() const void execute() const
{ {
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 ); QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this ); Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result(); const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data ); Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data );
if ( m_result_ptr ) { if ( m_result_ptr ) {
const unsigned n = ValueTraits::value_count( m_functor ); const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
} }
} }
template< class HostViewType > template< class ViewType >
ParallelReduce( const FunctorType & arg_functor ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy , const Policy & arg_policy
, const HostViewType & arg_result_view ) , const ViewType & arg_result_view
, typename std::enable_if<Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type< ReducerType >::value
, void*>::type = NULL)
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_policy( arg_policy ) , m_policy( arg_policy )
, m_result_ptr( arg_result_view.ptr_on_device() ) , m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.data() )
{ }
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{ } { }
}; };
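The two constructors map onto the two user-facing forms of Kokkos::parallel_reduce. A sketch; Sum is assumed to be the experimental reducer type of this Kokkos generation, and functor / n are placeholders:

double total = 0 ;
// Scalar/View result: first constructor, ReducerType deduced as InvalidType.
Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Qthread >( 0 , n ) , functor , total );
// Reducer result: second constructor, which takes its output pointer
// from reducer.result_view().data().
Kokkos::Experimental::Sum< double > reducer( total );
Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Qthread >( 0 , n ) , functor , reducer );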
@ -291,10 +312,12 @@ public:
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
template< class FunctorType , class ... Properties > template< class FunctorType , class ReducerType , class ... Properties >
class ParallelReduce< FunctorType class ParallelReduce< FunctorType
, TeamPolicy< Properties... > , TeamPolicy< Properties... >
, Kokkos::Qthread > , ReducerType
, Kokkos::Qthread
>
{ {
private: private:
@ -303,14 +326,18 @@ private:
typedef typename Policy::work_tag WorkTag ; typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ; typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ; const FunctorType m_functor ;
const Policy m_policy ; const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ; const pointer_type m_result_ptr ;
template< class TagType > template< class TagType >
@ -345,9 +372,10 @@ private:
ParallelReduce::template exec_team< WorkTag > ParallelReduce::template exec_team< WorkTag >
( self.m_functor ( self.m_functor
, Member( exec , self.m_policy ) , Member( exec , self.m_policy )
, ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) ); , ValueInit::init( ReducerConditional::select( self.m_functor , self.m_reducer )
, exec.exec_all_reduce_value() ) );
exec.template exec_all_reduce< FunctorType , WorkTag >( self.m_functor ); exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
} }
public: public:
@ -356,29 +384,43 @@ public:
void execute() const void execute() const
{ {
QthreadExec::resize_worker_scratch QthreadExec::resize_worker_scratch
( /* reduction memory */ ValueTraits::value_size( m_functor ) ( /* reduction memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) ); , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this ); Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result(); const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data ); Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data );
if ( m_result_ptr ) { if ( m_result_ptr ) {
const unsigned n = ValueTraits::value_count( m_functor ); const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
} }
} }
template< class ViewType > template< class ViewType >
ParallelReduce( const FunctorType & arg_functor , ParallelReduce( const FunctorType & arg_functor
const Policy & arg_policy , , const Policy & arg_policy
const ViewType & arg_result ) , const ViewType & arg_result
, typename std::enable_if<Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type< ReducerType >::value
, void*>::type = NULL)
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_policy( arg_policy ) , m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() ) , m_result_ptr( arg_result.ptr_on_device() )
{ } { }
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{ }
}; };
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
@ -395,8 +437,8 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ; typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ; typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;

View File

@ -58,6 +58,8 @@
#include <Kokkos_Atomic.hpp> #include <Kokkos_Atomic.hpp>
#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp> #include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
namespace Kokkos { namespace Kokkos {
@ -122,8 +124,8 @@ Task::~TaskMember()
Task::TaskMember( const function_verify_type arg_verify Task::TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc , const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single , const function_single_type arg_apply_single
, const function_apply_team_type arg_apply_team , const function_team_type arg_apply_team
, volatile int & arg_active_count , volatile int & arg_active_count
, const unsigned arg_sizeof_derived , const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity , const unsigned arg_dependence_capacity
@ -145,8 +147,8 @@ Task::TaskMember( const function_verify_type arg_verify
} }
Task::TaskMember( const function_dealloc_type arg_dealloc Task::TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single , const function_single_type arg_apply_single
, const function_apply_team_type arg_apply_team , const function_team_type arg_apply_team
, volatile int & arg_active_count , volatile int & arg_active_count
, const unsigned arg_sizeof_derived , const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity , const unsigned arg_dependence_capacity
@ -316,12 +318,8 @@ aligned_t Task::qthread_func( void * arg )
, int(Kokkos::Experimental::TASK_STATE_EXECUTING) , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
); );
// It is a single thread's responsibility to close out
// this task's execution.
bool close_out = false ;
if ( task->m_apply_team && ! task->m_apply_single ) { if ( task->m_apply_team && ! task->m_apply_single ) {
const Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ; Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
// Initialize team size and rank with shephered info // Initialize team size and rank with shephered info
Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag ); Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );
@ -344,7 +342,7 @@ fflush(stdout);
if ( member.team_rank() == 0 ) task->closeout(); if ( member.team_rank() == 0 ) task->closeout();
member.team_barrier(); member.team_barrier();
} }
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_apply_single_type>(1) ) { else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
// Team hard-wired to one, no cloning // Team hard-wired to one, no cloning
Kokkos::Impl::QthreadTeamPolicyMember member ; Kokkos::Impl::QthreadTeamPolicyMember member ;
(*task->m_apply_team)( task , member ); (*task->m_apply_team)( task , member );
@ -488,5 +486,6 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
} // namespace Experimental } // namespace Experimental
} // namespace Kokkos } // namespace Kokkos
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */ #endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */

View File

@ -69,6 +69,8 @@
#include <impl/Kokkos_FunctorAdapter.hpp> #include <impl/Kokkos_FunctorAdapter.hpp>
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
namespace Kokkos { namespace Kokkos {
@ -80,17 +82,17 @@ class TaskMember< Kokkos::Qthread , void , void >
{ {
public: public:
typedef void (* function_apply_single_type) ( TaskMember * );
typedef void (* function_apply_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
typedef void (* function_dealloc_type)( TaskMember * );
typedef TaskMember * (* function_verify_type) ( TaskMember * ); typedef TaskMember * (* function_verify_type) ( TaskMember * );
typedef void (* function_single_type) ( TaskMember * );
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
typedef void (* function_dealloc_type)( TaskMember * );
private: private:
const function_dealloc_type m_dealloc ; ///< Deallocation const function_dealloc_type m_dealloc ; ///< Deallocation
const function_verify_type m_verify ; ///< Result type verification const function_verify_type m_verify ; ///< Result type verification
const function_apply_single_type m_apply_single ; ///< Apply function const function_single_type m_apply_single ; ///< Apply function
const function_apply_team_type m_apply_team ; ///< Apply function const function_team_type m_apply_team ; ///< Apply function
int volatile * const m_active_count ; ///< Count of active tasks on this policy int volatile * const m_active_count ; ///< Count of active tasks on this policy
aligned_t m_qfeb ; ///< Qthread full/empty bit aligned_t m_qfeb ; ///< Qthread full/empty bit
TaskMember ** const m_dep ; ///< Dependences TaskMember ** const m_dep ; ///< Dependences
@ -130,8 +132,8 @@ protected :
// Used by TaskMember< Qthread , ResultType , void > // Used by TaskMember< Qthread , ResultType , void >
TaskMember( const function_verify_type arg_verify TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc , const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single , const function_single_type arg_apply_single
, const function_apply_team_type arg_apply_team , const function_team_type arg_apply_team
, volatile int & arg_active_count , volatile int & arg_active_count
, const unsigned arg_sizeof_derived , const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity , const unsigned arg_dependence_capacity
@ -139,8 +141,8 @@ protected :
// Used for TaskMember< Qthread , void , void > // Used for TaskMember< Qthread , void , void >
TaskMember( const function_dealloc_type arg_dealloc TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single , const function_single_type arg_apply_single
, const function_apply_team_type arg_apply_team , const function_team_type arg_apply_team
, volatile int & arg_active_count , volatile int & arg_active_count
, const unsigned arg_sizeof_derived , const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity , const unsigned arg_dependence_capacity
@ -221,7 +223,7 @@ public:
typedef typename DerivedTaskType::functor_type functor_type ; typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename functor_type::value_type value_type ; typedef typename functor_type::value_type value_type ;
const function_apply_single_type flag = reinterpret_cast<function_apply_single_type>( arg_is_team ? 0 : 1 ); const function_single_type flag = reinterpret_cast<function_single_type>( arg_is_team ? 0 : 1 );
DerivedTaskType * const task = DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) ) new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
@ -379,13 +381,13 @@ protected:
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ; typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ; typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_single_type function_apply_single_type ; typedef task_root_type::function_single_type function_single_type ;
typedef task_root_type::function_apply_team_type function_apply_team_type ; typedef task_root_type::function_team_type function_team_type ;
inline inline
TaskMember( const function_dealloc_type arg_dealloc TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single , const function_single_type arg_apply_single
, const function_apply_team_type arg_apply_team , const function_team_type arg_apply_team
, volatile int & arg_active_count , volatile int & arg_active_count
, const unsigned arg_sizeof_derived , const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity , const unsigned arg_dependence_capacity
@ -413,13 +415,13 @@ public:
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ; typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ; typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ; typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_single_type function_apply_single_type ; typedef task_root_type::function_single_type function_single_type ;
typedef task_root_type::function_apply_team_type function_apply_team_type ; typedef task_root_type::function_team_type function_team_type ;
inline inline
TaskMember( const function_dealloc_type arg_dealloc TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single , const function_single_type arg_apply_single
, const function_apply_team_type arg_apply_team , const function_team_type arg_apply_team
, volatile int & arg_active_count , volatile int & arg_active_count
, const unsigned arg_sizeof_derived , const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity , const unsigned arg_dependence_capacity
@ -453,6 +455,7 @@ class TaskPolicy< Kokkos::Qthread >
public: public:
typedef Kokkos::Qthread execution_space ; typedef Kokkos::Qthread execution_space ;
typedef TaskPolicy execution_policy ;
typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ; typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ;
private: private:
@ -489,14 +492,17 @@ public:
, const unsigned arg_task_team_size = 0 /* choose default */ , const unsigned arg_task_team_size = 0 /* choose default */
); );
TaskPolicy() = default ; KOKKOS_FUNCTION TaskPolicy() = default ;
TaskPolicy( TaskPolicy && rhs ) = default ; KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
TaskPolicy( const TaskPolicy & rhs ) = default ; KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
TaskPolicy & operator = ( TaskPolicy && rhs ) = default ; KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ; KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
//---------------------------------------- //----------------------------------------
KOKKOS_INLINE_FUNCTION
int allocated_task_count() const { return m_active_count ; }
template< class ValueType > template< class ValueType >
const Future< ValueType , execution_space > & const Future< ValueType , execution_space > &
spawn( const Future< ValueType , execution_space > & f spawn( const Future< ValueType , execution_space > & f
@ -653,5 +659,6 @@ public:
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #define KOKKOS_QTHREAD_TASK_HPP */ #endif /* #define KOKKOS_QTHREAD_TASK_HPP */

View File

@ -3,26 +3,23 @@
# Cloning repository and branch: # Cloning repository and branch:
git clone https://github.com/stelleg/qthreads qthreads-with-clone git clone git@github.com:Qthreads/qthreads.git qthreads
cd qthreads-with-clone cd qthreads
# Added to .git/config # checkout branch with "cloned_tasks"
#
# [branch "cloned_tasks"]
# remote = origin
# merge = refs/heads/cloned_tasks
#
git branch cloned_tasks git checkout dev-kokkos
git checkout cloned_tasks
git pull # Configure/autogen
sh autogen.sh sh autogen.sh
# configurure with 'hwloc' installation: # configure with 'hwloc' installation:
./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR} ./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR}
# install
make install

View File

@ -53,6 +53,7 @@
#include <Kokkos_Core.hpp> #include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_CPUDiscovery.hpp> #include <impl/Kokkos_CPUDiscovery.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
@ -134,11 +135,7 @@ void ThreadsExec::driver(void)
ThreadsExec::ThreadsExec() ThreadsExec::ThreadsExec()
: m_pool_base(0) : m_pool_base(0)
#if ! KOKKOS_USING_EXP_VIEW
, m_scratch()
#else
, m_scratch(0) , m_scratch(0)
#endif
, m_scratch_reduce_end(0) , m_scratch_reduce_end(0)
, m_scratch_thread_end(0) , m_scratch_thread_end(0)
, m_numa_rank(0) , m_numa_rank(0)
@ -198,8 +195,6 @@ ThreadsExec::~ThreadsExec()
{ {
const unsigned entry = m_pool_size - ( m_pool_rank + 1 ); const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
#if KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ; typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( m_scratch ) { if ( m_scratch ) {
@ -210,12 +205,6 @@ ThreadsExec::~ThreadsExec()
Record::decrement( r ); Record::decrement( r );
} }
#else
m_scratch.clear();
#endif
m_pool_base = 0 ; m_pool_base = 0 ;
m_scratch_reduce_end = 0 ; m_scratch_reduce_end = 0 ;
m_scratch_thread_end = 0 ; m_scratch_thread_end = 0 ;
@ -439,8 +428,6 @@ void * ThreadsExec::root_reduce_scratch()
void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * ) void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
{ {
#if KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ; typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( exec.m_scratch ) { if ( exec.m_scratch ) {
@ -451,19 +438,11 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
Record::decrement( r ); Record::decrement( r );
} }
#else
exec.m_scratch.clear();
#endif
exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ; exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ; exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
if ( s_threads_process.m_scratch_thread_end ) { if ( s_threads_process.m_scratch_thread_end ) {
#if KOKKOS_USING_EXP_VIEW
// Allocate tracked memory: // Allocate tracked memory:
{ {
Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end ); Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end );
@ -475,15 +454,6 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch ); unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch );
#else
exec.m_scratch =
HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end );
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() );
#endif
unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned); unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
// touch on this thread // touch on this thread
@ -520,11 +490,7 @@ void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ; s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
} }
#if KOKKOS_USING_EXP_VIEW
return s_threads_process.m_scratch ; return s_threads_process.m_scratch ;
#else
return s_threads_process.m_scratch.alloc_ptr() ;
#endif
} }
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
@ -758,6 +724,9 @@ void ThreadsExec::initialize( unsigned thread_count ,
// Init the array used for arbitrarily sized atomics // Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space(); Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
} }
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
@ -807,6 +776,10 @@ void ThreadsExec::finalize()
s_threads_process.m_pool_size = 1 ; s_threads_process.m_pool_size = 1 ;
s_threads_process.m_pool_fan_size = 0 ; s_threads_process.m_pool_fan_size = 0 ;
s_threads_process.m_pool_state = ThreadsExec::Inactive ; s_threads_process.m_pool_state = ThreadsExec::Inactive ;
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
} }
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------

View File

@ -49,7 +49,6 @@
#include <utility> #include <utility>
#include <impl/Kokkos_spinwait.hpp> #include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp> #include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Kokkos_Atomic.hpp> #include <Kokkos_Atomic.hpp>
@ -89,11 +88,7 @@ private:
ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
#if ! KOKKOS_USING_EXP_VIEW
Impl::AllocationTracker m_scratch ;
#else
void * m_scratch ; void * m_scratch ;
#endif
int m_scratch_reduce_end ; int m_scratch_reduce_end ;
int m_scratch_thread_end ; int m_scratch_thread_end ;
int m_numa_rank ; int m_numa_rank ;
@ -138,19 +133,10 @@ public:
static int get_thread_count(); static int get_thread_count();
static ThreadsExec * get_thread( const int init_thread_rank ); static ThreadsExec * get_thread( const int init_thread_rank );
#if ! KOKKOS_USING_EXP_VIEW
inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); }
KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; }
#else
inline void * reduce_memory() const { return m_scratch ; } inline void * reduce_memory() const { return m_scratch ; }
KOKKOS_INLINE_FUNCTION void * scratch_memory() const KOKKOS_INLINE_FUNCTION void * scratch_memory() const
{ return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; } { return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; }
#endif
KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; } KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; }
KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; } KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; }

View File

@ -129,15 +129,15 @@ public:
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_shmem() const const execution_space::scratch_memory_space & team_shmem() const
{ return m_team_shared.set_team_thread_mode(1,0) ; } { return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_scratch(int) const const execution_space::scratch_memory_space & team_scratch(int) const
{ return m_team_shared.set_team_thread_mode(1,0) ; } { return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & thread_scratch(int) const const execution_space::scratch_memory_space & thread_scratch(int) const
{ return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; } { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; } KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; } KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
@ -433,10 +433,11 @@ public:
void next_static() void next_static()
{ {
if ( ++m_league_rank < m_league_end ) { if ( m_league_rank < m_league_end ) {
team_barrier(); team_barrier();
set_team_shared(); set_team_shared();
} }
m_league_rank++;
} }
bool valid_dynamic() { bool valid_dynamic() {
@ -468,10 +469,11 @@ public:
if(m_invalid_thread) if(m_invalid_thread)
return; return;
if ( m_league_rank < m_league_chunk_end ) {
team_barrier(); team_barrier();
if ( ++m_league_rank < m_league_chunk_end ) {
set_team_shared(); set_team_shared();
} }
m_league_rank++;
} }
void set_league_shmem( const int arg_league_rank void set_league_shmem( const int arg_league_rank
@ -504,8 +506,8 @@ private:
int m_team_alloc ; int m_team_alloc ;
int m_team_iter ; int m_team_iter ;
size_t m_team_scratch_size; size_t m_team_scratch_size[2];
size_t m_thread_scratch_size; size_t m_thread_scratch_size[2];
int m_chunk_size; int m_chunk_size;
@ -549,8 +551,10 @@ public:
m_team_size = p.m_team_size; m_team_size = p.m_team_size;
m_team_alloc = p.m_team_alloc; m_team_alloc = p.m_team_alloc;
m_team_iter = p.m_team_iter; m_team_iter = p.m_team_iter;
m_team_scratch_size = p.m_team_scratch_size; m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_thread_scratch_size = p.m_thread_scratch_size; m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size; m_chunk_size = p.m_chunk_size;
return *this; return *this;
} }
@ -577,7 +581,12 @@ public:
inline int team_size() const { return m_team_size ; } inline int team_size() const { return m_team_size ; }
inline int team_alloc() const { return m_team_alloc ; } inline int team_alloc() const { return m_team_alloc ; }
inline int league_size() const { return m_league_size ; } inline int league_size() const { return m_league_size ; }
inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; } inline size_t scratch_size(const int& level, int team_size_ = -1 ) const {
if(team_size_ < 0)
team_size_ = m_team_size;
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
}
inline int team_iter() const { return m_team_iter ; } inline int team_iter() const { return m_team_iter ; }
/** \brief Specify league size, request team size */ /** \brief Specify league size, request team size */
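scratch_size() now takes the level explicitly plus an optional prospective team size. A sketch of the intended call patterns (the variable names are invented here):

// Total bytes one team needs across both scratch levels:
const size_t total = policy.scratch_size( 0 ) + policy.scratch_size( 1 );
// Probe what level 0 would cost with an 8-thread team before the
// policy's own team size is final; team_size_ < 0 means "use mine".
const size_t probe = policy.scratch_size( 0 , 8 );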
@ -588,8 +597,8 @@ public:
: m_league_size(0) : m_league_size(0)
, m_team_size(0) , m_team_size(0)
, m_team_alloc(0) , m_team_alloc(0)
, m_team_scratch_size ( 0 ) , m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size ( 0 ) , m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0) , m_chunk_size(0)
{ init(league_size_request,team_size_request); (void) vector_length_request; } { init(league_size_request,team_size_request); (void) vector_length_request; }
@ -601,8 +610,8 @@ public:
: m_league_size(0) : m_league_size(0)
, m_team_size(0) , m_team_size(0)
, m_team_alloc(0) , m_team_alloc(0)
, m_team_scratch_size ( 0 ) , m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size ( 0 ) , m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0) , m_chunk_size(0)
{ init(league_size_request,traits::execution_space::thread_pool_size(2)); } { init(league_size_request,traits::execution_space::thread_pool_size(2)); }
@ -612,8 +621,8 @@ public:
: m_league_size(0) : m_league_size(0)
, m_team_size(0) , m_team_size(0)
, m_team_alloc(0) , m_team_alloc(0)
, m_team_scratch_size ( 0 ) , m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size ( 0 ) , m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0) , m_chunk_size(0)
{ init(league_size_request,team_size_request); } { init(league_size_request,team_size_request); }
@ -623,8 +632,8 @@ public:
: m_league_size(0) : m_league_size(0)
, m_team_size(0) , m_team_size(0)
, m_team_alloc(0) , m_team_alloc(0)
, m_team_scratch_size ( 0 ) , m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size ( 0 ) , m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0) , m_chunk_size(0)
{ init(league_size_request,traits::execution_space::thread_pool_size(2)); } { init(league_size_request,traits::execution_space::thread_pool_size(2)); }
@ -639,26 +648,23 @@ public:
/** \brief set per team scratch size for a specific level of the scratch hierarchy */ /** \brief set per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const { inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
(void) level;
TeamPolicyInternal p = *this; TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value; p.m_team_scratch_size[level] = per_team.value;
return p; return p;
}; };
/** \brief set per thread scratch size for a specific level of the scratch hierarchy */ /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const { inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this; TeamPolicyInternal p = *this;
p.m_thread_scratch_size = per_thread.value; p.m_thread_scratch_size[level] = per_thread.value;
return p; return p;
}; };
/** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */ /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const { inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this; TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value; p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size = per_thread.value; p.m_thread_scratch_size[level] = per_thread.value;
return p; return p;
}; };

View File

@ -264,7 +264,7 @@ public:
, const Policy & arg_policy ) , const Policy & arg_policy )
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_policy( arg_policy ) , m_policy( arg_policy )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{ } { }
}; };
@ -272,9 +272,10 @@ public:
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
/* ParallelReduce with Kokkos::Threads and RangePolicy */ /* ParallelReduce with Kokkos::Threads and RangePolicy */
template< class FunctorType , class ... Traits > template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... > , Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Threads , Kokkos::Threads
> >
{ {
@ -286,14 +287,18 @@ private:
typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ; typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ; const FunctorType m_functor ;
const Policy m_policy ; const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ; const pointer_type m_result_ptr ;
template< class TagType > template< class TagType >
@ -344,9 +349,9 @@ private:
ParallelReduce::template exec_range< WorkTag > ParallelReduce::template exec_range< WorkTag >
( self.m_functor , range.begin() , range.end() ( self.m_functor , range.begin() , range.end()
, ValueInit::init( self.m_functor , exec.reduce_memory() ) ); , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor ); exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
} }
template<class Schedule> template<class Schedule>
@ -362,7 +367,7 @@ private:
exec.barrier(); exec.barrier();
long work_index = exec.get_work_index(); long work_index = exec.get_work_index();
reference_type update = ValueInit::init( self.m_functor , exec.reduce_memory() ); reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
while(work_index != -1) { while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size(); const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end(); const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
@ -372,7 +377,7 @@ private:
work_index = exec.get_work_index(); work_index = exec.get_work_index();
} }
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor ); exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
} }
public: public:
@ -380,7 +385,7 @@ public:
inline inline
void execute() const void execute() const
{ {
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 ); ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
ThreadsExec::start( & ParallelReduce::exec , this ); ThreadsExec::start( & ParallelReduce::exec , this );
@ -391,7 +396,7 @@ public:
const pointer_type data = const pointer_type data =
(pointer_type) ThreadsExec::root_reduce_scratch(); (pointer_type) ThreadsExec::root_reduce_scratch();
const unsigned n = ValueTraits::value_count( m_functor ); const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
} }
} }
@ -399,9 +404,14 @@ public:
template< class HostViewType > template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor , ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy , const Policy & arg_policy ,
const HostViewType & arg_result_view ) const HostViewType & arg_result_view ,
typename std::enable_if<
Kokkos::is_view< HostViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_policy( arg_policy ) , m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.ptr_on_device() ) , m_result_ptr( arg_result_view.ptr_on_device() )
{ {
static_assert( Kokkos::is_view< HostViewType >::value static_assert( Kokkos::is_view< HostViewType >::value
@ -410,14 +420,30 @@ public:
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Threads reduce result must be a View in HostSpace" ); , "Kokkos::Threads reduce result must be a View in HostSpace" );
} }
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
}; };
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
/* ParallelReduce with Kokkos::Threads and TeamPolicy */ /* ParallelReduce with Kokkos::Threads and TeamPolicy */
template< class FunctorType , class ... Properties > template< class FunctorType , class ReducerType, class ... Properties >
class ParallelReduce< FunctorType class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... > , Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::Threads , Kokkos::Threads
> >
{ {
@ -426,14 +452,19 @@ private:
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... > Policy ; typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... > Policy ;
typedef typename Policy::work_tag WorkTag ; typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ; typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ; const FunctorType m_functor ;
const Policy m_policy ; const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ; const pointer_type m_result_ptr ;
const int m_shared ; const int m_shared ;
@ -464,9 +495,9 @@ private:
ParallelReduce::template exec_team< WorkTag > ParallelReduce::template exec_team< WorkTag >
( self.m_functor , Member( & exec , self.m_policy , self.m_shared ) ( self.m_functor , Member( & exec , self.m_policy , self.m_shared )
, ValueInit::init( self.m_functor , exec.reduce_memory() ) ); , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor ); exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
} }
public: public:
@ -474,7 +505,7 @@ public:
inline inline
void execute() const void execute() const
{ {
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , Policy::member_type::team_reduce_size() + m_shared ); ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelReduce::exec , this ); ThreadsExec::start( & ParallelReduce::exec , this );
@ -484,20 +515,41 @@ public:
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch(); const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
const unsigned n = ValueTraits::value_count( m_functor ); const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
} }
} }
template< class ViewType > template< class ViewType >
ParallelReduce( const FunctorType & arg_functor inline
, const Policy & arg_policy ParallelReduce( const FunctorType & arg_functor ,
, const ViewType & arg_result ) const Policy & arg_policy ,
const ViewType & arg_result ,
typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_policy( arg_policy ) , m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() ) , m_result_ptr( arg_result.ptr_on_device() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{ } {}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
}; };
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------

View File

@ -46,9 +46,10 @@
#include <stdio.h> #include <stdio.h>
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <Kokkos_Core.hpp>
#include <Threads/Kokkos_Threads_TaskPolicy.hpp> #include <Threads/Kokkos_Threads_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_PTHREAD ) #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#define QLOCK (reinterpret_cast<void*>( ~((uintptr_t)0) )) #define QLOCK (reinterpret_cast<void*>( ~((uintptr_t)0) ))
#define QDENIED (reinterpret_cast<void*>( ~((uintptr_t)0) - 1 )) #define QDENIED (reinterpret_cast<void*>( ~((uintptr_t)0) - 1 ))
@ -87,9 +88,8 @@ ThreadsTaskPolicyQueue::ThreadsTaskPolicyQueue
, const unsigned arg_task_team_size , const unsigned arg_task_team_size
) )
: m_space( Kokkos::Threads::memory_space() : m_space( Kokkos::Threads::memory_space()
, arg_task_max_size , arg_task_max_size * arg_task_max_count * 1.2
, arg_task_max_size * arg_task_max_count , 16 /* log2(superblock size) */
, 1 /* only one level of memory pool */
) )
, m_team { 0 , 0 , 0 } , m_team { 0 , 0 , 0 }
, m_serial { 0 , 0 , 0 } , m_serial { 0 , 0 , 0 }
@ -624,10 +624,10 @@ ThreadsTaskPolicyQueue::allocate_task
// User created task memory pool with an estimate, // User created task memory pool with an estimate,
// if estimate is to low then report and throw exception. // if estimate is to low then report and throw exception.
if ( m_space.get_min_chunk_size() < size_alloc ) { if ( m_space.get_min_block_size() < size_alloc ) {
fprintf(stderr,"TaskPolicy<Threads> task allocation requires %d bytes on memory pool with %d byte chunk size\n" fprintf(stderr,"TaskPolicy<Threads> task allocation requires %d bytes on memory pool with %d byte chunk size\n"
, int(size_alloc) , int(size_alloc)
, int(m_space.get_min_chunk_size()) , int(m_space.get_min_block_size())
); );
fflush(stderr); fflush(stderr);
Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::task_allocate"); Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::task_allocate");
@ -926,5 +926,5 @@ void Task::clear_dependence()
} /* namespace Experimental */ } /* namespace Experimental */
} /* namespace Kokkos */ } /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */ #endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -50,7 +50,7 @@
#include <Kokkos_Threads.hpp> #include <Kokkos_Threads.hpp>
#include <Kokkos_TaskPolicy.hpp> #include <Kokkos_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_PTHREAD ) #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
@@ -737,10 +737,9 @@ public:
} /* namespace Experimental */
} /* namespace Kokkos */
-#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
//----------------------------------------------------------------------------
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_THREADS_TASKPOLICY_HPP */

View File

@@ -246,8 +246,8 @@ private:
  enum : uintptr_t { DO_NOT_DEREF_FLAG = 0x01ul };
  // The allocation record resides in Host memory space
+ Record    * m_record ;
  uintptr_t   m_record_bits ;
- Record    * m_record ;
public:

View File

@@ -47,8 +47,6 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
-#if KOKKOS_USING_EXP_VIEW
namespace Kokkos {
/* For backward compatibility */
@@ -68,8 +66,6 @@ struct ViewAllocateWithoutInitializing {
} /* namespace Kokkos */
-#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@@ -2604,19 +2604,25 @@ class ViewMapping< DstTraits , SrcTraits ,
    &&
    std::is_same< typename DstTraits::specialize , void >::value
    &&
+   std::is_same< typename SrcTraits::specialize , void >::value
+   &&
+   (
+     std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value
+     ||
+     (
       (
         std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
         std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
         std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
       )
       &&
-   std::is_same< typename SrcTraits::specialize , void >::value
-   &&
       (
         std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
         std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
         std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
       )
+     )
+   )
  )>::type >
{
private:
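Read as a condition, the change broadens assignability: two Views are convertible either when they share an array_layout exactly, or when both layouts are drawn from the LayoutLeft/LayoutRight/LayoutStride set. A hedged illustration of the second branch (assuming a default host execution space):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double**, Kokkos::LayoutLeft> a("a", 10, 3);
    // LayoutLeft -> LayoutStride: both layouts are in the allowed set,
    // so the ViewMapping specialization above enables this conversion.
    Kokkos::View<double**, Kokkos::LayoutStride> s = a;
    (void) s;
  }
  Kokkos::finalize();
  return 0;
}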

View File

@@ -1,848 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Singleton.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Error.hpp>
#include <string>
#include <vector>
#include <sstream>
#include <algorithm>
#include <utility>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <iomanip>
/* Enable clean up of memory leaks */
#define CLEAN_UP_MEMORY_LEAKS 0
namespace Kokkos { namespace Impl {
namespace {
//-----------------------------------------------------------------------------
// AllocationRecord
//-----------------------------------------------------------------------------
//
// Used to track details about an allocation and provide a ref count
// sizeof(AllocationRecord) == 128
struct AllocationRecord
{
enum {
OFFSET = sizeof(AllocatorBase*) // allocator
+ sizeof(void*) // alloc_ptr
+ sizeof(uint64_t) // alloc_size
+ sizeof(AllocatorAttributeBase*) // attribute
+ sizeof(uint32_t) // node_index
+ sizeof(uint32_t) // ref_count
, LABEL_LENGTH = 128 - OFFSET
};
AllocatorBase * const allocator;
void * const alloc_ptr;
const uint64_t alloc_size;
AllocatorAttributeBase * const attribute;
const int32_t node_index;
volatile uint32_t ref_count;
const char label[LABEL_LENGTH];
AllocationRecord( AllocatorBase * const arg_allocator
, void * arg_alloc_ptr
, uint64_t arg_alloc_size
, int32_t arg_node_index
, const std::string & arg_label
)
: allocator(arg_allocator)
, alloc_ptr(arg_alloc_ptr)
, alloc_size(arg_alloc_size)
, attribute(NULL)
, node_index(arg_node_index)
, ref_count(1)
, label() // zero fill
{
const size_t length = static_cast<size_t>(LABEL_LENGTH-1u) < arg_label.size() ? static_cast<size_t>(LABEL_LENGTH-1u) : arg_label.size();
strncpy( const_cast<char *>(label), arg_label.c_str(), length );
}
~AllocationRecord()
{
if (attribute) {
delete attribute;
}
}
uint32_t increment_ref_count()
{
uint32_t old_value = atomic_fetch_add( &ref_count, static_cast<uint32_t>(1) );
return old_value + 1u;
}
uint32_t decrement_ref_count()
{
uint32_t old_value = atomic_fetch_sub( &ref_count, static_cast<uint32_t>(1) );
return old_value - 1u;
}
void print( std::ostream & oss ) const
{
oss << "{ " << allocator->name()
<< " } : \"" << label
<< "\" ref_count(" << ref_count
<< ") memory[ " << alloc_ptr
<< " + " << alloc_size
<< " ]" ;
}
bool set_attribute( AllocatorAttributeBase * attr )
{
bool result = false;
if (attribute == NULL) {
result = NULL == atomic_compare_exchange( const_cast<AllocatorAttributeBase **>(&attribute)
, reinterpret_cast<AllocatorAttributeBase *>(NULL)
, attr );
}
return result;
}
// disallow copy and assignment
AllocationRecord( const AllocationRecord & );
AllocationRecord & operator=(const AllocationRecord &);
};
template <int NumBlocks>
struct Bitset
{
enum { blocks = NumBlocks };
enum { size = blocks * 64 };
enum { block_mask = 63u };
enum { block_shift = 6 };
// used to find free bits in a bitset
static int count_trailing_zeros(uint64_t x)
{
#if defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG ) || defined( KOKKOS_COMPILER_APPLECC )
return x ? __builtin_ctzll(x) : 64;
#elif defined( KOKKOS_COMPILER_INTEL )
enum { shift = 32 };
enum { mask = (static_cast<uint64_t>(1) << shift) - 1u };
return (x & mask) ? _bit_scan_forward(static_cast<int>(x & mask)) :
(x >> shift) ? shift + _bit_scan_forward(static_cast<int>(x >> shift)) :
64 ;
#elif defined( KOKKOS_COMPILER_IBM )
return x ? __cnttz8(x) : 64;
#else
int i = 0;
for (; ((x & (static_cast<uint64_t>(1) << i)) == 0u) && i < 64; ++i ) {}
return i;
#endif
}
Bitset()
: m_bits()
{
for (int i=0; i < blocks; ++i) {
m_bits[i] = 0u;
}
}
bool set( int i )
{
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return !( atomic_fetch_or( m_bits + (i >> block_shift), bit ) & bit );
}
bool reset( int i )
{
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return atomic_fetch_and( m_bits + (i >> block_shift), ~bit ) & bit;
}
bool test( int i )
{
const uint64_t block = m_bits[ i >> block_shift ];
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return block & bit;
}
int find_first_unset() const
{
for (int i=0; i < blocks; ++i) {
const uint64_t block = m_bits[i];
int b = count_trailing_zeros( ~block );
if ( b < 64 ) {
return (i << block_shift) + b;
}
}
return size;
}
volatile uint64_t m_bits[blocks];
};
//-----------------------------------------------------------------------------
// AllocationRecordPool -- singleton class
//
// global_alloc_rec_pool is the ONLY instance of this class
//
//-----------------------------------------------------------------------------
// Record AllocationRecords in a lock-free circular list.
// Each node in the list has a buffer with space for 959 ((15*64)-1) records
// managed by a bitset. Atomics are used to set and reset bits in the bit set.
// The head of the list is atomically updated to the last node found with
// unused space.
//
// Cost to create an allocation record: amortized O(1), worst case O(num nodes)
// Cost to destroy an allocation record: O(1)
//
// Singleton allocations are pushed onto a lock-free stack that is destroyed
// after the circular list of allocation records.
struct AllocationRecordPool
{
enum { BITSET_BLOCKS = 15 };
typedef Bitset<BITSET_BLOCKS> bitset_type;
enum { BUFFER_SIZE = (bitset_type::size - 1) * sizeof(AllocationRecord) };
struct AllocationNode
{
AllocationNode()
: next()
, bitset()
, buffer()
{
// set the first bit to used
bitset.set(0);
}
void * get_buffer( int32_t node_index )
{
return buffer + (node_index-1) * sizeof(AllocationRecord);
}
// return 0 if no space is available in the node
int32_t get_node_index()
{
int32_t node_index = 0;
do {
node_index = bitset.find_first_unset();
// successfully claimed a bit
if ( node_index != bitset.size && bitset.set(node_index) )
{
return node_index;
}
} while ( node_index != bitset.size );
return 0;
}
void clear_node_index( int32_t node_index )
{
bitset.reset(node_index);
}
AllocationNode * next;
bitset_type bitset;
char buffer[BUFFER_SIZE];
};
struct SingletonNode
{
void * buffer;
SingletonNode * next;
Impl::singleton_destroy_function_type destroy;
SingletonNode( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
: buffer(NULL)
, next(NULL)
, destroy(destroy_func)
{
if (size) {
buffer = malloc(size);
create_func(buffer);
}
}
~SingletonNode()
{
if (buffer) {
try {
destroy(buffer);
} catch(...) {}
free(buffer);
}
}
};
AllocationRecordPool()
: head( new AllocationNode() )
, singleton_head(NULL)
{
// setup ring
head->next = head;
}
~AllocationRecordPool()
{
// delete allocation records
{
AllocationNode * start = head;
AllocationNode * curr = start;
std::vector< std::string > string_vec;
do {
AllocationNode * next = curr->next;
#if defined( KOKKOS_DEBUG_PRINT_ALLOCATION_BITSET )
// print node bitset
for (int i=0; i < bitset_type::blocks; ++i ) {
std::cout << std::hex << std::showbase << curr->bitset.m_bits[i] << " ";
}
std::cout << std::endl;
#endif
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
std::ostringstream oss;
alloc_rec->print( oss );
string_vec.push_back( oss.str() );
#if CLEAN_UP_MEMORY_LEAKS
/* Cleaning up memory leaks prevents memory error detection tools
* from reporting the original source of allocation, which can
* impede debugging with such tools.
*/
try {
destroy(alloc_rec);
}
catch(...) {}
#endif
}
}
curr->next = NULL;
delete curr;
curr = next;
} while ( curr != start );
//if ( !string_vec.empty() ) {
// std::sort( string_vec.begin(), string_vec.end() );
//
// std::ostringstream oss;
// oss << "Error: Allocation pool destroyed with the following memory leak(s):\n";
// for (size_t i=0; i< string_vec.size(); ++i)
// {
// oss << " " << string_vec[i] << std::endl;
// }
//
// std::cerr << oss.str() << std::endl;
//}
}
// delete singletons
{
SingletonNode * curr = singleton_head;
while (curr) {
SingletonNode * next = curr->next;
delete curr;
curr = next;
}
}
}
AllocationRecord * create( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label
)
{
AllocationNode * start = volatile_load(&head);
AllocationNode * curr = start;
int32_t node_index = curr->get_node_index();
if (node_index == 0) {
curr = volatile_load(&curr->next);
}
while (node_index == 0 && curr != start)
{
node_index = curr->get_node_index();
if (node_index == 0) {
curr = volatile_load(&curr->next);
}
}
// Need to allocate and insert a new node
if (node_index == 0 && curr == start)
{
AllocationNode * new_node = new AllocationNode();
node_index = new_node->get_node_index();
AllocationNode * next = NULL;
do {
next = volatile_load(&curr->next);
new_node->next = next;
memory_fence();
} while ( next != atomic_compare_exchange( &(curr->next), next, new_node ) );
curr = new_node;
}
void * buffer = curr->get_buffer(node_index);
// try to set head to curr
if ( start != curr )
{
atomic_compare_exchange( & head, start, curr );
}
return new (buffer) AllocationRecord( arg_allocator
, arg_alloc_ptr
, arg_alloc_size
, node_index
, arg_label
);
}
void destroy( AllocationRecord * alloc_rec )
{
if (alloc_rec) {
const int32_t node_index = alloc_rec->node_index;
AllocationNode * node = get_node( alloc_rec );
// deallocate memory
alloc_rec->allocator->deallocate( alloc_rec->alloc_ptr, alloc_rec->alloc_size );
// call destructor
alloc_rec->~AllocationRecord();
// wait for writes to complete
memory_fence();
// clear node index
node->clear_node_index( node_index );
}
}
void * create_singleton( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
{
SingletonNode * node = new SingletonNode( size, create_func, destroy_func );
SingletonNode * next;
// insert new node at the head of the list
do {
next = volatile_load(&singleton_head);
node->next = next;
} while ( next != atomic_compare_exchange( &singleton_head, next, node ) );
return node->buffer;
}
void print_memory( std::ostream & out ) const
{
AllocationNode * start = head;
AllocationNode * curr = start;
std::vector< std::string > string_vec;
do {
AllocationNode * next = curr->next;
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
std::ostringstream oss;
alloc_rec->print( oss );
string_vec.push_back( oss.str() );
}
}
curr = next;
} while ( curr != start );
if ( !string_vec.empty() ) {
std::sort( string_vec.begin(), string_vec.end() );
std::ostringstream oss;
oss << "Tracked Memory:" << std::endl;
for (size_t i=0; i< string_vec.size(); ++i)
{
oss << " " << string_vec[i] << std::endl;
}
out << oss.str() << std::endl;
}
else {
out << "No Tracked Memory" << std::endl;
}
}
// find an AllocationRecord such that
// alloc_ptr <= ptr < alloc_ptr + alloc_size
// otherwise return NULL
AllocationRecord * find( void const * ptr, AllocatorBase const * allocator ) const
{
AllocationNode * start = head;
AllocationNode * curr = start;
char const * const char_ptr = reinterpret_cast<const char *>(ptr);
do {
AllocationNode * next = curr->next;
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
char const * const alloc_ptr = reinterpret_cast<char const *>(alloc_rec->alloc_ptr);
if ( (allocator == alloc_rec->allocator)
&& (alloc_ptr <= char_ptr)
&& (char_ptr < (alloc_ptr + alloc_rec->alloc_size)) )
{
return alloc_rec;
}
}
}
curr = next;
} while ( curr != start );
return NULL;
}
private:
AllocationNode * get_node( AllocationRecord * alloc_rec )
{
return reinterpret_cast<AllocationNode *>( alloc_rec - alloc_rec->node_index);
}
AllocationNode * head;
SingletonNode * singleton_head;
};
// create the global pool for allocation records
AllocationRecordPool global_alloc_rec_pool;
// convert a uintptr_t to an AllocationRecord pointer
inline
AllocationRecord * to_alloc_rec( uintptr_t alloc_rec )
{
return reinterpret_cast<AllocationRecord *>( alloc_rec & ~static_cast<uintptr_t>(1) );
}
} // unnamed namespace
//-----------------------------------------------------------------------------
// Allocation Tracker methods
//-----------------------------------------------------------------------------
// Create a reference counted AllocationTracker
void AllocationTracker::initalize( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label
)
{
if ( arg_allocator && arg_alloc_ptr && arg_alloc_size) {
// create record
AllocationRecord * alloc_rec = global_alloc_rec_pool.create( arg_allocator
, arg_alloc_ptr
, arg_alloc_size
, arg_label
);
m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
}
}
void AllocationTracker::reallocate( size_t size ) const
{
AllocationRecord * rec = to_alloc_rec( m_alloc_rec );
void * the_alloc_ptr = rec->allocator->reallocate( rec->alloc_ptr, rec->alloc_size, size );
if ( NULL != the_alloc_ptr )
{
*const_cast<void **>(&rec->alloc_ptr) = the_alloc_ptr;
*const_cast<uint64_t *>(&rec->alloc_size) = size;
}
else {
Impl::throw_runtime_exception( "Error: unable to reallocate allocation tracker");
}
}
void AllocationTracker::increment_ref_count() const
{
to_alloc_rec( m_alloc_rec )->increment_ref_count();
}
void AllocationTracker::decrement_ref_count() const
{
AllocationRecord * alloc_rec = to_alloc_rec( m_alloc_rec );
uint32_t the_ref_count = alloc_rec->decrement_ref_count();
if (the_ref_count == 0u) {
try {
global_alloc_rec_pool.destroy( alloc_rec );
}
catch(...) {}
}
}
namespace {
struct NullAllocator { static const char * name() { return "Null Allocator"; } };
}
AllocatorBase * AllocationTracker::allocator() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->allocator;
}
return Allocator<NullAllocator>::singleton();
}
void * AllocationTracker::alloc_ptr() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->alloc_ptr;
}
return NULL;
}
size_t AllocationTracker::alloc_size() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->alloc_size;
}
return 0u;
}
size_t AllocationTracker::ref_count() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->ref_count;
}
return 0u;
}
char const * AllocationTracker::label() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->label;
}
return "[Empty Allocation Tracker]";
}
void AllocationTracker::print( std::ostream & oss) const
{
if (m_alloc_rec & REF_COUNT_MASK) {
to_alloc_rec(m_alloc_rec)->print(oss);
}
else {
oss << label();
}
}
bool AllocationTracker::set_attribute( AllocatorAttributeBase * attr ) const
{
bool result = false;
if (m_alloc_rec & REF_COUNT_MASK) {
result = to_alloc_rec(m_alloc_rec)->set_attribute(attr);
}
return result;
}
AllocatorAttributeBase * AllocationTracker::attribute() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->attribute;
}
return NULL;
}
void AllocationTracker::print_tracked_memory( std::ostream & out )
{
global_alloc_rec_pool.print_memory( out );
}
AllocationTracker AllocationTracker::find( void const * ptr, AllocatorBase const * arg_allocator )
{
AllocationRecord * alloc_rec = global_alloc_rec_pool.find(ptr, arg_allocator);
AllocationTracker tracker;
if ( alloc_rec != NULL )
{
if ( tracking_enabled() ) {
alloc_rec->increment_ref_count();
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
}
else {
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec);
}
}
return tracker ;
}
//-----------------------------------------------------------------------------
// static AllocationTracker
//-----------------------------------------------------------------------------
#if defined( KOKKOS_USE_DECENTRALIZED_HOST )
namespace {
// TODO : Detect compiler support for thread local variables
#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
bool g_thread_local_tracking_enabled = true;
#pragma omp threadprivate(g_thread_local_tracking_enabled)
#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
__thread bool g_thread_local_tracking_enabled = true;
#elif defined( KOKKOS_HAVE_OPENMP )
bool g_thread_local_tracking_enabled = true;
#pragma omp threadprivate(g_thread_local_tracking_enabled)
#elif defined( KOKKOS_HAVE_PTHREAD )
__thread bool g_thread_local_tracking_enabled = true;
#elif defined( KOKKOS_HAVE_SERIAL )
bool g_thread_local_tracking_enabled = true;
#endif
} // unnamed namespace
void AllocationTracker::disable_tracking()
{
g_thread_local_tracking_enabled = false;
}
void AllocationTracker::enable_tracking()
{
g_thread_local_tracking_enabled = true;
}
bool AllocationTracker::tracking_enabled()
{
return g_thread_local_tracking_enabled;
}
#else
namespace {
enum TrackingEnum { TRACKING_ENABLED, TRACKING_DISABLED };
volatile TrackingEnum g_tracking_enabled = TRACKING_ENABLED;
}
void AllocationTracker::disable_tracking()
{
if ( TRACKING_ENABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_ENABLED, TRACKING_DISABLED ) ) {
Impl::throw_runtime_exception("Error: Tracking already disabled");
}
}
void AllocationTracker::enable_tracking()
{
if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_DISABLED, TRACKING_ENABLED ) ) {
Impl::throw_runtime_exception("Error: Tracking already enabled");
}
}
bool AllocationTracker::tracking_enabled()
{
return g_tracking_enabled == TRACKING_ENABLED;
}
#endif
//-----------------------------------------------------------------------------
// create singleton free function
//-----------------------------------------------------------------------------
void * create_singleton( size_t size
, Impl::singleton_create_function_type create_func
, Impl::singleton_destroy_function_type destroy_func )
{
return global_alloc_rec_pool.create_singleton( size, create_func, destroy_func );
}
}} // namespace Kokkos::Impl
#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
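The deleted tracker above parked records in a lock-free ring of nodes, each claiming slots through an atomic bitset. A minimal sketch of that claim/release pattern, using std::atomic and the GCC/Clang __builtin_ctzll intrinsic rather than the removed Kokkos wrappers:

#include <atomic>
#include <cstdint>
#include <cstdio>

struct SlotBitset {
  std::atomic<uint64_t> bits{0};

  // Returns a claimed slot in [0,64), or -1 if the word is full.
  int claim() {
    for (;;) {
      uint64_t current = bits.load(std::memory_order_relaxed);
      if (current == ~uint64_t(0)) return -1;      // no free slot
      int slot = __builtin_ctzll(~current);        // first zero bit
      uint64_t mask = uint64_t(1) << slot;
      // fetch_or returns the prior word; we own the slot only if the
      // bit was still clear when we set it.
      if (!(bits.fetch_or(mask) & mask)) return slot;
    }
  }

  void release(int slot) { bits.fetch_and(~(uint64_t(1) << slot)); }
};

int main() {
  SlotBitset b;
  int s0 = b.claim(), s1 = b.claim();
  std::printf("claimed %d and %d\n", s0, s1);  // claimed 0 and 1
  b.release(s0);
  return 0;
}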

View File

@@ -1,574 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ALLOCATION_TRACKER_HPP
#define KOKKOS_ALLOCATION_TRACKER_HPP
#include <Kokkos_Macros.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h>
#include <cstdlib>
#include <string>
#include <iosfwd>
namespace Kokkos { namespace Impl {
//-----------------------------------------------------------------------------
// Create Singleton objects
//-----------------------------------------------------------------------------
typedef void * (*singleton_create_function_type)(void * buffer);
typedef void (*singleton_destroy_function_type)(void *);
void * create_singleton( size_t size
, singleton_create_function_type create_func
, singleton_destroy_function_type destroy_func
);
/// class Singleton
///
/// Default construct a singleton type. This method is used to circumvent
/// order of construction issues. Singleton objects are destroyed after all
/// other allocations in the reverse order of their creation.
template <typename Type>
class Singleton
{
public:
/// Get a pointer to the Singleton. Default construct the singleton if it does not already exist
static Type * get()
{
static Type * singleton = NULL;
if (singleton == NULL) {
Impl::singleton_create_function_type create_func = &create;
Impl::singleton_destroy_function_type destroy_func = &destroy;
singleton = reinterpret_cast<Type*>( Impl::create_singleton( sizeof(Type), create_func, destroy_func ) );
}
return singleton;
}
private:
/// Call the Type destructor
static void destroy(void * ptr)
{
reinterpret_cast<Type*>(ptr)->~Type();
}
/// placement new the Type in buffer
static void * create(void * buffer)
{
return new (buffer) Type();
}
};
//-----------------------------------------------------------------------------
// AllocatorBase
//-----------------------------------------------------------------------------
/// class AllocatorBase
///
/// Abstract base class for all Allocators.
/// Allocators should be singleton objects; use Singleton<Allocator>::get() to create
/// them and avoid order-of-destruction issues
class AllocatorBase
{
public:
/// name of the allocator
/// used to report memory leaks
virtual const char * name() const = 0;
/// Allocate a buffer of size number of bytes
virtual void* allocate(size_t size) const = 0;
/// Deallocate a buffer with size number of bytes
/// The pointer must have been allocated with a call to corresponding allocate
virtual void deallocate(void * ptr, size_t size) const = 0;
/// Changes the size of the memory block pointed to by ptr.
/// Ptr must have been allocated with the corresponding allocate call
/// The function may move the memory block to a new location
/// (whose address is returned by the function).
///
/// The content of the memory block is preserved up to the lesser of the new and
/// old sizes, even if the block is moved to a new location. If the new size is larger,
/// the value of the newly allocated portion is indeterminate.
///
/// If ptr is a null pointer, the function behaves like allocate, assigning a
/// new block of size bytes and returning a pointer to its beginning.
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const = 0;
/// can a texture object be bound to the allocated memory
virtual bool support_texture_binding() const = 0;
/// virtual destructor
virtual ~AllocatorBase() {}
};
/// class AllocatorAttributeBase
class AllocatorAttributeBase
{
public:
virtual ~AllocatorAttributeBase() {}
};
//-----------------------------------------------------------------------------
// Allocator< StaticAllocator > : public AllocatorBase
//-----------------------------------------------------------------------------
// HasStaticName
template<typename T>
class HasStaticName
{
typedef const char * (*static_method)();
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::name>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticName<T>::value, const char *>::type
allocator_name()
{
return T::name();
}
template <typename T>
inline
typename enable_if<!HasStaticName<T>::value, const char *>::type
allocator_name()
{
return "Unnamed Allocator";
}
// HasStaticAllocate
template<typename T>
class HasStaticAllocate
{
typedef void * (*static_method)(size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::allocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticAllocate<T>::value, void *>::type
allocator_allocate(size_t size)
{
return T::allocate(size);
}
template <typename T>
inline
typename enable_if<!HasStaticAllocate<T>::value, void *>::type
allocator_allocate(size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot allocate memory!") );
return NULL;
}
// HasStaticDeallocate
template<typename T>
class HasStaticDeallocate
{
typedef void (*static_method)(void *, size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::deallocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticDeallocate<T>::value, void>::type
allocator_deallocate(void * ptr, size_t size)
{
T::deallocate(ptr,size);
}
template <typename T>
inline
typename enable_if<!HasStaticDeallocate<T>::value, void>::type
allocator_deallocate(void *, size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot deallocate memory!") );
}
// HasStaticReallocate
template<typename T>
class HasStaticReallocate
{
typedef void * (*static_method)(void *, size_t, size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::reallocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticReallocate<T>::value, void *>::type
allocator_reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
return T::reallocate(old_ptr, old_size, new_size);
}
template <typename T>
inline
typename enable_if<!HasStaticReallocate<T>::value, void *>::type
allocator_reallocate(void *, size_t, size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot reallocate memory!") );
return NULL;
}
// HasStaticSupportTextureBinding
template<typename T>
class HasStaticSupportTextureBinding
{
typedef bool (*static_method)();
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::support_texture_binding>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticSupportTextureBinding<T>::value, bool>::type
allocator_support_texture_binding()
{
return T::support_texture_binding();
}
template <typename T>
inline
typename enable_if<!HasStaticSupportTextureBinding<T>::value, bool>::type
allocator_support_texture_binding()
{
return false;
}
template <typename T>
class Allocator : public AllocatorBase
{
public:
virtual const char * name() const
{
return allocator_name<T>();
}
virtual void* allocate(size_t size) const
{
return allocator_allocate<T>(size);
}
virtual void deallocate(void * ptr, size_t size) const
{
allocator_deallocate<T>(ptr,size);
}
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const
{
return allocator_reallocate<T>(old_ptr, old_size, new_size);
}
virtual bool support_texture_binding() const
{
return allocator_support_texture_binding<T>();
}
static AllocatorBase * singleton()
{
return Singleton< Allocator<T> >::get();
}
};
//-----------------------------------------------------------------------------
// AllocationTracker
//-----------------------------------------------------------------------------
// forward declaration for friend classes
struct MallocHelper;
/// class AllocationTracker
/// Will call deallocate from the AllocatorBase when the reference count reaches 0.
/// Reference counting is disabled when the host is in parallel.
class AllocationTracker
{
// use the least significant bit of the AllocationRecord pointer to indicate if the
// AllocationTracker should reference count
enum {
REF_COUNT_BIT = static_cast<uintptr_t>(1)
, REF_COUNT_MASK = ~static_cast<uintptr_t>(1)
};
public:
/// Find an AllocationTracker such that
/// alloc_ptr <= ptr < alloc_ptr + alloc_size
/// O(n) where n is the number of tracked allocations.
template <typename StaticAllocator>
static AllocationTracker find( void const * ptr )
{
return find( ptr, Allocator<StaticAllocator>::singleton() );
}
/// Pretty print all the currently tracked memory
static void print_tracked_memory( std::ostream & out );
/// Default constructor
KOKKOS_INLINE_FUNCTION
AllocationTracker()
: m_alloc_rec(0)
{}
/// Create a AllocationTracker
///
/// Start reference counting the alloc_ptr.
/// When the reference count reaches 0 the allocator's deallocate method
/// will be called with the given size. The alloc_ptr should have been
/// allocated with the allocator's allocate method.
///
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
/// do nothing
template <typename StaticAllocator>
AllocationTracker( StaticAllocator const &
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label = std::string("") )
: m_alloc_rec(0)
{
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
}
/// Create a AllocationTracker
///
/// Start reference counting the alloc_ptr.
/// When the reference count reaches 0 the allocator's deallocate method
/// will be called with the given size. The alloc_ptr should have been
/// allocated with the allocator's allocate method.
///
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
/// do nothing
template <typename StaticAllocator>
AllocationTracker( StaticAllocator const &
, size_t arg_alloc_size
, const std::string & arg_label = std::string("")
)
: m_alloc_rec(0)
{
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
void * arg_alloc_ptr = arg_allocator->allocate( arg_alloc_size );
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
}
/// Copy an AllocationTracker
KOKKOS_INLINE_FUNCTION
AllocationTracker( const AllocationTracker & rhs )
: m_alloc_rec( rhs.m_alloc_rec)
{
#if !defined( __CUDA_ARCH__ )
if ( rhs.ref_counting() && tracking_enabled() ) {
increment_ref_count();
}
else {
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
}
#else
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
#endif
}
/// Copy an AllocationTracker
/// Decrement the reference count of the current tracker if necessary
KOKKOS_INLINE_FUNCTION
AllocationTracker & operator=( const AllocationTracker & rhs )
{
if (this != &rhs) {
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
m_alloc_rec = rhs.m_alloc_rec;
if ( rhs.ref_counting() && tracking_enabled() ) {
increment_ref_count();
}
else {
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
}
#else
m_alloc_rec = rhs.m_alloc_rec & REF_COUNT_MASK;
#endif
}
return * this;
}
/// Destructor
/// Decrement the reference count if necessary
KOKKOS_INLINE_FUNCTION
~AllocationTracker()
{
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
#endif
}
/// Is the tracker valid?
KOKKOS_INLINE_FUNCTION
bool is_valid() const
{
return (m_alloc_rec & REF_COUNT_MASK);
}
/// clear the tracker
KOKKOS_INLINE_FUNCTION
void clear()
{
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
#endif
m_alloc_rec = 0;
}
/// is this tracker currently counting allocations?
KOKKOS_INLINE_FUNCTION
bool ref_counting() const
{
return (m_alloc_rec & REF_COUNT_BIT);
}
AllocatorBase * allocator() const;
/// pointer to the allocated memory
void * alloc_ptr() const;
/// size in bytes of the allocated memory
size_t alloc_size() const;
/// the current reference count
size_t ref_count() const;
/// the label given to the allocation
char const * label() const;
/// pretty print all the tracker's information to the std::ostream
void print( std::ostream & oss) const;
/// set an attribute ptr on the allocation record
/// the arg_attribute pointer will be deleted when the record is destroyed
/// the attribute ptr can only be set once
bool set_attribute( AllocatorAttributeBase * arg_attribute) const;
/// get the attribute ptr from the allocation record
AllocatorAttributeBase * attribute() const;
/// reallocate the memory tracked by this allocation
/// NOT thread-safe
void reallocate( size_t size ) const;
static void disable_tracking();
static void enable_tracking();
static bool tracking_enabled();
private:
static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator );
void initalize( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, std::string const & label );
void increment_ref_count() const;
void decrement_ref_count() const;
friend struct Impl::MallocHelper;
uintptr_t m_alloc_rec;
};
}} // namespace Kokkos::Impl
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
#endif //KOKKOS_ALLOCATION_TRACKER_HPP

View File

@@ -0,0 +1,197 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_ANALYZE_POLICY_HPP
#define KOKKOS_IMPL_ANALYZE_POLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Concepts.hpp>
#include <impl/Kokkos_Tags.hpp>
namespace Kokkos { namespace Impl {
template < typename ExecutionSpace = void
, typename Schedule = void
, typename WorkTag = void
, typename IndexType = void
, typename IterationPattern = void
>
struct PolicyTraitsBase
{
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
using execution_space = ExecutionSpace;
using schedule_type = Schedule;
using work_tag = WorkTag;
using index_type = IndexType;
using iteration_pattern = IterationPattern;
};
template <typename PolicyBase, typename ExecutionSpace>
struct SetExecutionSpace
{
static_assert( is_void<typename PolicyBase::execution_space>::value
, "Kokkos Error: More than one execution space given" );
using type = PolicyTraitsBase< ExecutionSpace
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename Schedule>
struct SetSchedule
{
static_assert( is_void<typename PolicyBase::schedule_type>::value
, "Kokkos Error: More than one schedule type given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, Schedule
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename WorkTag>
struct SetWorkTag
{
static_assert( is_void<typename PolicyBase::work_tag>::value
, "Kokkos Error: More than one work tag given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, WorkTag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename IndexType>
struct SetIndexType
{
static_assert( is_void<typename PolicyBase::index_type>::value
, "Kokkos Error: More than one index type given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, IndexType
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename IterationPattern>
struct SetIterationPattern
{
static_assert( is_void<typename PolicyBase::iteration_pattern>::value
, "Kokkos Error: More than one iteration_pattern given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, IterationPattern
>;
};
template <typename Base, typename... Traits>
struct AnalyzePolicy;
template <typename Base, typename T, typename... Traits>
struct AnalyzePolicy<Base, T, Traits...> : public
AnalyzePolicy<
typename std::conditional< is_execution_space<T>::value , SetExecutionSpace<Base,T>
, typename std::conditional< is_schedule_type<T>::value , SetSchedule<Base,T>
, typename std::conditional< is_index_type<T>::value , SetIndexType<Base,T>
, typename std::conditional< std::is_integral<T>::value , SetIndexType<Base, IndexType<T> >
, typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
, SetWorkTag<Base,T>
>::type >::type >::type >::type>::type::type
, Traits...
>
{};
template <typename Base>
struct AnalyzePolicy<Base>
{
using execution_space = typename std::conditional< is_void< typename Base::execution_space >::value
, DefaultExecutionSpace
, typename Base::execution_space
>::type;
using schedule_type = typename std::conditional< is_void< typename Base::schedule_type >::value
, Schedule< Static >
, typename Base::schedule_type
>::type;
using work_tag = typename Base::work_tag;
using index_type = typename std::conditional< is_void< typename Base::index_type >::value
, IndexType< typename execution_space::size_type >
, typename Base::index_type
>::type
::type // nasty hack to make index_type into an integral_type
; // instead of the wrapped IndexType<T> for backwards compatibility
using iteration_pattern = typename std::conditional< is_void< typename Base::iteration_pattern >::value
, void // TODO set default iteration pattern
, typename Base::iteration_pattern
>::type;
using type = PolicyTraitsBase< execution_space
, schedule_type
, work_tag
, index_type
, iteration_pattern
>;
};
template <typename... Traits>
struct PolicyTraits
: public AnalyzePolicy< PolicyTraitsBase<>, Traits... >::type
{};
}} // namespace Kokkos::Impl
#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP
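What the recursion buys is order-independence: each trait is routed to its slot, duplicates are rejected by the static_asserts, and unfilled slots get defaults (DefaultExecutionSpace, Schedule<Static>, the execution space's size_type). A hedged sketch, assuming a build with Kokkos::Serial enabled:

#include <type_traits>
#include <Kokkos_Core.hpp>

// Traits in arbitrary order; AnalyzePolicy sorts them into slots.
using P = Kokkos::Impl::PolicyTraits< Kokkos::Schedule<Kokkos::Static>
                                    , Kokkos::Serial
                                    , Kokkos::IndexType<int> >;

static_assert( std::is_same< P::execution_space, Kokkos::Serial >::value
             , "execution space routed by is_execution_space" );
// The terminal AnalyzePolicy unwraps IndexType<int> to plain int.
static_assert( std::is_same< P::index_type, int >::value
             , "index type unwrapped to its integral type" );

int main() { return 0; }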

View File

@@ -218,7 +218,17 @@ T atomic_compare_exchange( volatile T * const dest , const T compare ,
  while( !Impl::lock_address_host_space( (void*) dest ) );
  T return_val = *dest;
  if( return_val == compare ) {
-   const T tmp = *dest = val;
+   // Don't use the following line of code here:
+   //
+   //const T tmp = *dest = val;
+   //
+   // Instead, put each assignment in its own statement.  This is
+   // because the overload of T::operator= for volatile *this should
+   // return void, not volatile T&.  See Kokkos #177:
+   //
+   // https://github.com/kokkos/kokkos/issues/177
+   *dest = val;
+   const T tmp = *dest;
#ifndef KOKKOS_COMPILER_CLANG
    (void) tmp;
#endif
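The comment's rationale is quickest to see with a toy value type. A self-contained sketch (the MyScalar type is hypothetical, not a Kokkos type) of why the chained form cannot compile once operator= on a volatile *this returns void:

#include <cstdio>

struct MyScalar {
  int v;
  // Per Kokkos #177: assignment to a volatile lvalue returns void,
  // not volatile MyScalar&.
  void operator=(const MyScalar& rhs) volatile { v = rhs.v; }
};

int main() {
  volatile MyScalar dest = {0};
  MyScalar val = {42};
  // const MyScalar tmp = (dest = val);  // ill-formed: '(dest = val)' is void
  dest = val;                  // first statement: the volatile store
  MyScalar tmp = { dest.v };   // second statement: the read back
  std::printf("%d\n", tmp.v);  // prints 42
  return 0;
}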

View File

@@ -228,7 +228,17 @@ T atomic_exchange( volatile T * const dest ,
{
  while( !Impl::lock_address_host_space( (void*) dest ) );
  T return_val = *dest;
- const T tmp = *dest = val;
+ // Don't use the following line of code here:
+ //
+ //const T tmp = *dest = val;
+ //
+ // Instead, put each assignment in its own statement.  This is
+ // because the overload of T::operator= for volatile *this should
+ // return void, not volatile T&.  See Kokkos #177:
+ //
+ // https://github.com/kokkos/kokkos/issues/177
+ *dest = val;
+ const T tmp = *dest;
#ifndef KOKKOS_COMPILER_CLANG
  (void) tmp;
#endif
@@ -305,7 +315,9 @@ void atomic_assign( volatile T * const dest ,
  // member.  The volatile return value implicitly defines a
  // dereference that some compilers (gcc 4.7.2) warn is being ignored.
  // Suppress warning by casting return to void.
- (void)( *dest = val );
+ //(void)( *dest = val );
+ *dest = val;
  Impl::unlock_address_host_space( (void*) dest );
}
//----------------------------------------------------------------------------

View File

@@ -93,7 +93,7 @@ T atomic_fetch_add( volatile T * const dest ,
    assume.i = oldval.i ;
    newval.t = assume.t + val ;
    oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
- } while ( assumed.i != oldval.i );
+ } while ( assume.i != oldval.i );
  return oldval.t ;
}
@@ -156,9 +156,26 @@
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_USE_ISA_X86_64 )
+KOKKOS_INLINE_FUNCTION
+int atomic_fetch_add( volatile int * dest , const int val )
+{
+  int original = val;
+  __asm__ __volatile__(
+    "lock xadd %1, %0"
+    : "+m" (*dest), "+r" (original)
+    : "m" (*dest), "r" (original)
+    : "memory"
+  );
+  return original;
+}
+
+#else
KOKKOS_INLINE_FUNCTION
int atomic_fetch_add( volatile int * const dest , const int val )
-{ return __sync_fetch_and_add(dest,val); }
+{ return __sync_fetch_and_add(dest, val); }
+#endif
KOKKOS_INLINE_FUNCTION
long int atomic_fetch_add( volatile long int * const dest , const long int val )
@@ -276,7 +293,17 @@ T atomic_fetch_add( volatile T * const dest ,
{
  while( !Impl::lock_address_host_space( (void*) dest ) );
  T return_val = *dest;
- const T tmp = *dest = return_val + val;
+ // Don't use the following line of code here:
+ //
+ //const T tmp = *dest = return_val + val;
+ //
+ // Instead, put each assignment in its own statement.  This is
+ // because the overload of T::operator= for volatile *this should
+ // return void, not volatile T&.  See Kokkos #177:
+ //
+ // https://github.com/kokkos/kokkos/issues/177
+ *dest = return_val + val;
+ const T tmp = *dest;
  (void) tmp;
  Impl::unlock_address_host_space( (void*) dest );
  return return_val;
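Outside the diff, the semantics of the new lock xadd branch can be checked in isolation: the instruction atomically adds the register into memory and leaves the prior memory value in the register, matching __sync_fetch_and_add. A sketch that only builds with GCC or Clang on x86-64 (the asm operands mirror the hunk above):

#include <cstdio>

int fetch_add_xadd( volatile int * dest , const int val )
{
  int original = val;
  __asm__ __volatile__(
    "lock xadd %1, %0"
    : "+m" (*dest), "+r" (original)
    : "m" (*dest), "r" (original)
    : "memory"
  );
  return original;  // the value *dest held before the add
}

int main() {
  volatile int counter = 10;
  int prev = fetch_add_xadd(&counter, 5);
  // Same semantics as the builtin fallback:
  //   int prev = __sync_fetch_and_add(&counter, 5);
  std::printf("prev=%d now=%d\n", prev, counter);  // prev=10 now=15
  return 0;
}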

View File

@@ -73,7 +73,7 @@ T atomic_fetch_sub( volatile T * const dest ,
    assume.i = oldval.i ;
    newval.t = assume.t - val ;
    oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
- } while ( assumed.i != oldval.i );
+ } while ( assume.i != oldval.i );
  return oldval.t ;
}

View File

@@ -48,6 +48,22 @@
namespace Kokkos {
namespace Impl {
+template<class Scalar1, class Scalar2>
+struct MaxOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 > val2 ? val1 : val2);
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct MinOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 < val2 ? val1 : val2);
+  }
+};
template<class Scalar1, class Scalar2>
struct AddOper {
  KOKKOS_FORCEINLINE_FUNCTION
@@ -276,6 +292,18 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
namespace Kokkos {
// Fetch_Oper atomics: return value before operation
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_max(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::MaxOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_min(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::MinOper<T,const T>(),dest,val);
+}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_mul(volatile T * const dest, const T val) {
@@ -326,6 +354,18 @@ T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {
// Oper Fetch atomics: return value after operation
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_max_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::MaxOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_min_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::MinOper<T,const T>(),dest,val);
+}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_mul_fetch(volatile T * const dest, const T val) {
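A hedged usage sketch (not from the diff) of the new atomic_fetch_max: a running maximum updated from concurrent threads. std::thread is used so the example is self-contained; it assumes a host build where Kokkos' header-only atomics are usable without Kokkos::initialize. Inside Kokkos one would call it from a parallel_for instead.

#include <cstdio>
#include <thread>
#include <vector>
#include <Kokkos_Core.hpp>

int main() {
  volatile int global_max = 0;
  std::vector<std::thread> workers;
  for (int t = 1; t <= 4; ++t) {
    workers.emplace_back([&global_max, t] {
      // Returns the value *before* the update; the store happens only
      // when t*10 exceeds the current maximum.
      (void) Kokkos::atomic_fetch_max(&global_max, t * 10);
    });
  }
  for (auto& w : workers) w.join();
  std::printf("max = %d\n", global_max);  // max = 40
  return 0;
}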

View File

@@ -425,42 +425,6 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
  typedef int64_t type;
};
#if ! KOKKOS_USING_EXP_VIEW
class AllocationTracker;
// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics.
template<class ViewTraits>
class ViewDataHandle<
ViewTraits ,
typename enable_if<
( ! is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) &&
( ViewTraits::memory_traits::Atomic )
>::type >
{
private:
// typedef typename if_c<(sizeof(typename ViewTraits::const_value_type)==4) ||
// (sizeof(typename ViewTraits::const_value_type)==8),
// int, Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars >::type
// atomic_view_possible;
typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type;
typedef ViewDataHandle self_type;
public:
enum { ReturnTypeIsReference = false };
typedef Impl::AtomicViewDataHandle<ViewTraits> handle_type;
typedef Impl::AtomicDataElement<ViewTraits> return_type;
KOKKOS_INLINE_FUNCTION
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ )
{
return handle_type(arg_data_ptr);
}
};
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
}} // namespace Kokkos::Impl
#endif

View File

@@ -1,287 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_HostSpace.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h> // uintptr_t
#include <cstdlib> // for malloc, realloc, and free
#include <cstring> // for memcpy
#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
#endif
#include <sstream>
namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
void* MallocAllocator::allocate( size_t size )
{
void * ptr = NULL;
if (size) {
ptr = malloc(size);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void MallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
if (ptr) {
free(ptr);
}
}
void * MallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
{
void * ptr = realloc(old_ptr, new_size);
if (new_size > 0u && ptr == NULL) {
throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
}
return ptr;
}
/*--------------------------------------------------------------------------*/
namespace {
void * raw_aligned_allocate( size_t size, size_t alignment )
{
void * ptr = NULL;
if ( size ) {
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
ptr = _mm_malloc( size , alignment );
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
posix_memalign( & ptr, alignment , size );
#else
// Over-allocate and round up to guarantee proper alignment.
size_t size_padded = size + alignment + sizeof(void *);
void * alloc_ptr = malloc( size_padded );
if (alloc_ptr) {
uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
// offset enough to record the alloc_ptr
address += sizeof(void *);
uintptr_t rem = address % alignment;
uintptr_t offset = rem ? (alignment - rem) : 0u;
address += offset;
ptr = reinterpret_cast<void *>(address);
// record the alloc'd pointer
address -= sizeof(void *);
*reinterpret_cast<void **>(address) = alloc_ptr;
}
#endif
}
return ptr;
}
void raw_aligned_deallocate( void * ptr, size_t /*size*/ )
{
if ( ptr ) {
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
_mm_free( ptr );
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
free( ptr );
#else
// get the alloc'd pointer
void * alloc_ptr = *(reinterpret_cast<void **>(ptr) -1);
free( alloc_ptr );
#endif
}
}
} // anonymous namespace
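The portable #else branch of raw_aligned_allocate above achieves alignment by over-allocating and stashing the pointer that malloc actually returned in the word just below the aligned address, so raw_aligned_deallocate can recover it. A self-contained sketch of that trick (the function names here are illustrative, not the Kokkos API):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Return 'size' bytes aligned to 'alignment'; the word just below the
// returned address records the pointer that malloc actually returned.
void* aligned_alloc_stash(std::size_t size, std::size_t alignment) {
  // worst-case slack: one stashed pointer plus up to (alignment - 1)
  // bytes of rounding, as in the code above
  void* raw = std::malloc(size + alignment + sizeof(void*));
  if (!raw) return nullptr;
  std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(raw) + sizeof(void*);
  std::uintptr_t rem = addr % alignment;
  if (rem) addr += alignment - rem;
  void* aligned = reinterpret_cast<void*>(addr);
  reinterpret_cast<void**>(aligned)[-1] = raw;  // stash the original pointer
  return aligned;
}

void aligned_free_stash(void* aligned) {
  if (aligned) std::free(reinterpret_cast<void**>(aligned)[-1]);  // recover and free it
}

int main() {
  void* p = aligned_alloc_stash(100, 64);
  assert(reinterpret_cast<std::uintptr_t>(p) % 64 == 0);
  std::printf("aligned allocation at %p\n", p);
  aligned_free_stash(p);
}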
void* AlignedAllocator::allocate( size_t size )
{
void * ptr = NULL;
if ( size ) {
ptr = raw_aligned_allocate(size, MEMORY_ALIGNMENT);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void AlignedAllocator::deallocate( void * ptr, size_t size )
{
raw_aligned_deallocate( ptr, size);
}
void * AlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size < new_size) {
ptr = allocate( new_size );
memcpy(ptr, old_ptr, old_size );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
// mmap flags for private anonymous memory allocation
#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
#elif defined( MAP_ANON) && defined( MAP_PRIVATE )
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
#else
#define NO_MMAP
#endif
// huge page tables
#if !defined( NO_MMAP )
#if defined( MAP_HUGETLB )
#define MMAP_FLAGS_HUGE (MMAP_FLAGS | MAP_HUGETLB )
#elif defined( MMAP_FLAGS )
#define MMAP_FLAGS_HUGE MMAP_FLAGS
#endif
// threshold to use huge pages
#define MMAP_USE_HUGE_PAGES (1u << 27)
#endif
// read write access to private memory
#if !defined( NO_MMAP )
#define MMAP_PROTECTION (PROT_READ | PROT_WRITE)
#endif
void* PageAlignedAllocator::allocate( size_t size )
{
void *ptr = NULL;
if (size) {
#if !defined( NO_MMAP )
if ( size < MMAP_USE_HUGE_PAGES ) {
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS, -1 /*file descriptor*/, 0 /*offset*/);
} else {
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS_HUGE, -1 /*file descriptor*/, 0 /*offset*/);
}
if (ptr == MAP_FAILED) {
ptr = NULL;
}
#else
static const size_t page_size = 4096; // TODO: read in from sysconf( _SC_PAGE_SIZE )
ptr = raw_aligned_allocate( size, page_size);
#endif
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void PageAlignedAllocator::deallocate( void * ptr, size_t size )
{
#if !defined( NO_MMAP )
munmap(ptr, size);
#else
raw_aligned_deallocate(ptr, size);
#endif
}
void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = NULL;
#if defined( NO_MMAP ) || defined( __APPLE__ ) || defined( __CYGWIN__ )
if (old_size != new_size) {
ptr = allocate( new_size );
memcpy(ptr, old_ptr, (old_size < new_size ? old_size : new_size) );
deallocate( old_ptr, old_size );
}
else {
ptr = old_ptr;
}
#else
ptr = mremap( old_ptr, old_size, new_size, MREMAP_MAYMOVE );
if (ptr == MAP_FAILED) {
throw_runtime_exception("Error: Page Aligned Allocator could not reallocate memory");
}
#endif
return ptr;
}
}} // namespace Kokkos::Impl
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
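PageAlignedAllocator above maps private anonymous memory (adding MAP_HUGETLB for requests of 128 MiB and up, per MMAP_USE_HUGE_PAGES = 1u << 27) and, outside the NO_MMAP/Apple/Cygwin cases, resizes it with mremap. A stripped-down, Linux-only sketch of that mmap/mremap pattern, with error handling reduced to asserts:

#include <sys/mman.h>  // mmap, munmap, mremap (mremap is Linux-specific)
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cstring>

int main() {
  const std::size_t old_size = std::size_t(1) << 16;  // 64 KiB
  const std::size_t new_size = std::size_t(1) << 20;  // 1 MiB

  // Private anonymous mapping: page-aligned by construction.
  void* p = mmap(nullptr, old_size, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1 /*fd*/, 0 /*offset*/);
  assert(p != MAP_FAILED);
  std::memset(p, 0xAB, old_size);

  // Grow the mapping; MREMAP_MAYMOVE lets the kernel relocate it if it
  // cannot be extended in place.
  void* q = mremap(p, old_size, new_size, MREMAP_MAYMOVE);
  assert(q != MAP_FAILED);
  std::printf("remapped %zu -> %zu bytes at %p\n", old_size, new_size, q);

  munmap(q, new_size);
  return 0;
}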

Some files were not shown because too many files have changed in this diff.