Updating Kokkos lib to 2.03.00

This commit is contained in:
Stan Moore
2017-04-25 13:48:51 -06:00
parent 9f6e126a2f
commit 8910ec6e59
261 changed files with 27816 additions and 17799 deletions

View File

@ -1,5 +1,28 @@
# Change Log
## [2.03.00](https://github.com/kokkos/kokkos/tree/2.03.00) (2017-04-25)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.15...2.03.00)
**Implemented enhancements:**
- UnorderedMap: make it accept Devices or MemorySpaces \(usage sketch after this list\) [\#711](https://github.com/kokkos/kokkos/issues/711)
- sort to accept DynamicView and \[begin,end\) indices [\#691](https://github.com/kokkos/kokkos/issues/691)
- ENABLE Macros should only be used via \#ifdef or \#if defined [\#675](https://github.com/kokkos/kokkos/issues/675)
- Remove impl/Kokkos\_Synchronic\_\* [\#666](https://github.com/kokkos/kokkos/issues/666)
- Turning off IVDEP for Intel 14. [\#638](https://github.com/kokkos/kokkos/issues/638)
- Using an installed Kokkos in a target application using CMake [\#633](https://github.com/kokkos/kokkos/issues/633)
- Create Kokkos Bill of Materials [\#632](https://github.com/kokkos/kokkos/issues/632)
- MDRangePolicy and tagged evaluators [\#547](https://github.com/kokkos/kokkos/issues/547)
- Add PGI support [\#289](https://github.com/kokkos/kokkos/issues/289)
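A minimal sketch of the relaxed `UnorderedMap` template parameter from \#711 above (the key/value types and capacity here are illustrative assumptions, not taken from this commit):

```c++
#include <Kokkos_Core.hpp>
#include <Kokkos_UnorderedMap.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // The third template parameter may now name either a device/execution
    // space or a memory space.
    Kokkos::UnorderedMap<int, double, Kokkos::DefaultHostExecutionSpace> map_dev(128);
    Kokkos::UnorderedMap<int, double, Kokkos::HostSpace>                 map_mem(128);
    map_mem.insert(42, 3.14);  // host-accessible map, so insert directly
  }
  Kokkos::finalize();
  return 0;
}
```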
**Fixed bugs:**
- Output from PerTeam fails [\#733](https://github.com/kokkos/kokkos/issues/733)
- Cuda: architecture flag not added to link line [\#688](https://github.com/kokkos/kokkos/issues/688)
- Getting large chunks of memory for a thread team in a universal way [\#664](https://github.com/kokkos/kokkos/issues/664)
- Kokkos RNG normal\(\) function hangs for small seed value [\#655](https://github.com/kokkos/kokkos/issues/655)
- Kokkos Tests Errors on Shepard/HSW Builds [\#644](https://github.com/kokkos/kokkos/issues/644)
## [2.02.15](https://github.com/kokkos/kokkos/tree/2.02.15) (2017-02-10)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.07...2.02.15)

View File

@ -98,10 +98,10 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_QTHREAD
KOKKOS_HAVE_QTHREAD
"Enable QTHREAD support in Kokkos."
"${TPL_ENABLE_QTHREAD}"
Kokkos_ENABLE_Qthreads
KOKKOS_HAVE_QTHREADS
"Enable Qthreads support in Kokkos."
"${TPL_ENABLE_QTHREADS}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(
@ -213,4 +213,3 @@ TRIBITS_EXCLUDE_FILES(
)
TRIBITS_PACKAGE_POSTPROCESS()

View File

@ -1,39 +1,38 @@
# Default settings common options
# Default settings: common options.
#LAMMPS specific settings:
KOKKOS_PATH=../../lib/kokkos
CXXFLAGS=$(CCFLAGS)
#Options: OpenMP,Serial,Pthreads,Cuda
# Options: Cuda,OpenMP,Pthreads,Qthreads,Serial
KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Pthreads"
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,Power9,KNL,BDW,SKX
# Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,Power9,KNL,BDW,SKX
KOKKOS_ARCH ?= ""
#Options: yes,no
# Options: yes,no
KOKKOS_DEBUG ?= "no"
#Options: hwloc,librt,experimental_memkind
# Options: hwloc,librt,experimental_memkind
KOKKOS_USE_TPLS ?= ""
#Options: c++11,c++1z
# Options: c++11,c++1z
KOKKOS_CXX_STANDARD ?= "c++11"
#Options: aggressive_vectorization,disable_profiling
# Options: aggressive_vectorization,disable_profiling
KOKKOS_OPTIONS ?= ""
#Default settings specific options
#Options: force_uvm,use_ldg,rdc,enable_lambda
# Default settings: specific options.
# Options: force_uvm,use_ldg,rdc,enable_lambda
KOKKOS_CUDA_OPTIONS ?= "enable_lambda"
# Check for general settings
# Check for general settings.
KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l))
KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l))
KOKKOS_INTERNAL_ENABLE_CXX1Z := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++1z" | wc -l))
# Check for external libraries
# Check for external libraries.
KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l))
KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "librt" | wc -l))
KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
# Check for advanced settings
# Check for advanced settings.
KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
KOKKOS_INTERNAL_CUDA_USE_LDG := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "use_ldg" | wc -l))
@ -41,21 +40,21 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | gr
KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l))
KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "enable_lambda" | wc -l))
# Check for Kokkos Host Execution Spaces one of which must be on
# Check for Kokkos Host Execution Spaces, one of which must be on.
KOKKOS_INTERNAL_USE_OPENMP := $(strip $(shell echo $(KOKKOS_DEVICES) | grep OpenMP | wc -l))
KOKKOS_INTERNAL_USE_PTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Pthread | wc -l))
KOKKOS_INTERNAL_USE_QTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthreads | wc -l))
KOKKOS_INTERNAL_USE_SERIAL := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Serial | wc -l))
KOKKOS_INTERNAL_USE_QTHREAD := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthread | wc -l))
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 0)
KOKKOS_INTERNAL_USE_SERIAL := 1
endif
endif
endif
# Check for other Execution Spaces
# Check for other Execution Spaces.
KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
@ -64,15 +63,13 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
KOKKOS_INTERNAL_COMPILER_NVCC_VERSION := $(shell nvcc --version 2>&1 | grep release | cut -d' ' -f5 | cut -d',' -f1 | tr -d .)
endif
# Check OS
# Check OS.
KOKKOS_OS := $(shell uname -s)
KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname -s | grep CYGWIN | wc -l)
KOKKOS_INTERNAL_OS_LINUX := $(shell uname -s | grep Linux | wc -l)
KOKKOS_INTERNAL_OS_DARWIN := $(shell uname -s | grep Darwin | wc -l)
# Check compiler
# Check compiler.
KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version 2>&1 | grep "Intel Corporation" | wc -l)
KOKKOS_INTERNAL_COMPILER_PGI := $(shell $(CXX) --version 2>&1 | grep PGI | wc -l)
KOKKOS_INTERNAL_COMPILER_XL := $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)
@ -95,6 +92,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell clang --version | grep version | cut -d ' ' -f3 | tr -d '.')
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_CLANG_VERSION) -lt 400; echo $$?),0)
$(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher)
@ -103,7 +101,6 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
endif
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
else
@ -114,7 +111,7 @@ else
KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
# OpenMP is turned on by default in Cray compiler environment
# OpenMP is turned on by default in the Cray compiler environment.
KOKKOS_INTERNAL_OPENMP_FLAG :=
else
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
@ -138,9 +135,9 @@ else
endif
endif
# Check for Kokkos Architecture settings
# Check for Kokkos Architecture settings.
#Intel based
# Intel based.
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
@ -148,7 +145,7 @@ KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW |
KOKKOS_INTERNAL_USE_ARCH_SKX := $(strip $(shell echo $(KOKKOS_ARCH) | grep SKX | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
#NVIDIA based
# NVIDIA based.
NVCC_WRAPPER := $(KOKKOS_PATH)/config/nvcc_wrapper
KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler30 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler32 | wc -l))
@ -170,9 +167,9 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
@ -183,33 +180,33 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
endif
#ARM based
# ARM based.
KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv80 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv81 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8-ThunderX | wc -l))
#IBM based
# IBM based.
KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | grep BGQ | wc -l))
KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power7 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power8 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power9 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc))
#AMD based
# AMD based.
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
#Any AVX?
# Any AVX?
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
# Decide what ISA level we are able to support
# Decide what ISA level we are able to support.
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
#Incompatible flags?
# Incompatible flags?
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)>1" | bc ))
KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
@ -220,7 +217,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIGPU), 1)
$(error Defined Multiple GPU architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
endif
#Generating the list of Flags
# Generating the list of Flags.
KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
@ -236,15 +233,19 @@ KOKKOS_LDFLAGS = -L$(shell pwd)
KOKKOS_SRC =
KOKKOS_HEADERS =
#Generating the KokkosCore_config.h file
# Generating the KokkosCore_config.h file.
tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp)
tmp := $(shell echo "Makefile constructed configuration:" >> KokkosCore_config.tmp)
tmp := $(shell date >> KokkosCore_config.tmp)
tmp := $(shell echo "----------------------------------------------*/" >> KokkosCore_config.tmp)
tmp := $(shell echo "/* Execution Spaces */" >> KokkosCore_config.tmp)
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
tmp := $(shell echo '\#define KOKKOS_HAVE_OPENMP 1' >> KokkosCore_config.tmp)
endif
@ -253,12 +254,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_PTHREAD 1" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREADS 1" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
endif
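For reference, a hypothetical excerpt of the generated KokkosCore_config.h for an OpenMP+Serial build, assembled from the echo lines above (the date line is elided):

```c++
/* ---------------------------------------------
Makefile constructed configuration:
----------------------------------------------*/
/* Execution Spaces */
#define KOKKOS_HAVE_OPENMP 1
#define KOKKOS_HAVE_SERIAL 1
```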
ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
@ -279,12 +280,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
KOKKOS_CPPFLAGS += -I$(QTHREAD_PATH)/include
KOKKOS_LDFLAGS += -L$(QTHREAD_PATH)/lib
tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREAD 1" >> KokkosCore_config.tmp )
endif
tmp := $(shell echo "/* General Settings */" >> KokkosCore_config.tmp)
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
@ -341,6 +336,7 @@ endif
tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp)
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp )
endif
@ -365,16 +361,19 @@ ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
$(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.)
endif
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
endif
endif
endif
#Add Architecture flags
# Add Architecture flags.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
KOKKOS_CXXFLAGS +=
KOKKOS_LDFLAGS +=
@ -391,6 +390,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV81 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
KOKKOS_CXXFLAGS +=
KOKKOS_LDFLAGS +=
@ -408,6 +408,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV8_THUNDERX 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
KOKKOS_CXXFLAGS +=
KOKKOS_LDFLAGS +=
@ -424,6 +425,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -mavx
KOKKOS_LDFLAGS += -mavx
@ -435,7 +437,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
KOKKOS_CXXFLAGS += -tp=sandybridge
KOKKOS_LDFLAGS += -tp=sandybridge
else
# Assume that this is really a GNU compiler
# Assume that this is really a GNU compiler.
KOKKOS_CXXFLAGS += -mavx
KOKKOS_LDFLAGS += -mavx
endif
@ -445,10 +447,11 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is really a GNU compiler, or it could be XL on P8
# Assume that this is really a GNU compiler, or it could be XL on P8.
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
endif
@ -456,10 +459,11 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_POWER9 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is really a GNU compiler, or it could be XL on P9
# Assume that this is really a GNU compiler, or it could be XL on P9.
KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
endif
@ -467,6 +471,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX2 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -xCORE-AVX2
KOKKOS_LDFLAGS += -xCORE-AVX2
@ -478,7 +483,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
KOKKOS_CXXFLAGS += -tp=haswell
KOKKOS_LDFLAGS += -tp=haswell
else
# Assume that this is really a GNU compiler
# Assume that this is really a GNU compiler.
KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2
endif
@ -488,6 +493,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512MIC 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -xMIC-AVX512
KOKKOS_LDFLAGS += -xMIC-AVX512
@ -498,7 +504,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is really a GNU compiler
# Assume that this is really a GNU compiler.
KOKKOS_CXXFLAGS += -march=knl
KOKKOS_LDFLAGS += -march=knl
endif
@ -508,6 +514,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512XEON 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -xCORE-AVX512
KOKKOS_LDFLAGS += -xCORE-AVX512
@ -518,7 +525,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Nothing here yet
# Nothing here yet.
KOKKOS_CXXFLAGS += -march=skylake-avx512
KOKKOS_LDFLAGS += -march=skylake-avx512
endif
@ -532,67 +539,79 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
KOKKOS_LDFLAGS += -mmic
endif
#Figure out the architecture flag for Cuda
# Figure out the architecture flag for Cuda.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-arch
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-x cuda --cuda-gpu-arch
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=--cuda-gpu-arch
KOKKOS_CXXFLAGS += -x cuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL60 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
endif
endif
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
else
KOKKOS_INTERNAL_NEW_CONFIG := 1
KOKKOS_INTERNAL_NEW_CONFIG := 1
endif
ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
@ -616,30 +635,34 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
KOKKOS_LIBS += -lcudart -lcuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
KOKKOS_LIBS += -lpthread
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
KOKKOS_LIBS += -lqthread
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
else
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
endif
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
endif
#Explicitly set the GCC Toolchain for Clang
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
KOKKOS_LIBS += -lpthread
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
KOKKOS_CPPFLAGS += -I$(QTHREADS_PATH)/include
KOKKOS_LDFLAGS += -L$(QTHREADS_PATH)/lib
KOKKOS_LIBS += -lqthread
endif
# Explicitly set the GCC Toolchain for Clang.
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)
KOKKOS_INTERNAL_GCC_TOOLCHAIN = $(KOKKOS_INTERNAL_GCC_PATH:/bin/g++=)
@ -647,15 +670,15 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_LDFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
endif
#With Cygwin functions such as fdopen and fileno are not defined
#when strict ansi is enabled. strict ansi gets enabled with --std=c++11
#though. So we hard undefine it here. Not sure if that has any bad side effects
#This is needed for gtest actually, not for Kokkos itself!
# With Cygwin, functions such as fdopen and fileno are not defined
# when strict ANSI is enabled, and strict ANSI gets enabled with --std=c++11.
# So we hard undefine it here; not sure if that has any bad side effects.
# This is needed for gtest, actually, not for Kokkos itself!
ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1)
KOKKOS_CXXFLAGS += -U__STRICT_ANSI__
endif
# Setting up dependencies
# Setting up dependencies.
KokkosCore_config.h:

View File

@ -18,6 +18,8 @@ Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
@ -43,11 +45,11 @@ Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokk
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
Kokkos_QthreadExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthread/Kokkos_QthreadExec.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthread/Kokkos_QthreadExec.cpp
Kokkos_Qthread_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
Kokkos_QthreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
Kokkos_Qthreads_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
@ -59,4 +61,3 @@ endif
Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp

View File

@ -45,31 +45,39 @@ Primary tested compilers on X86 are:
GCC 4.8.4
GCC 4.9.2
GCC 5.1.0
GCC 5.2.0
Intel 14.0.4
Intel 15.0.2
Intel 16.0.1
Intel 17.0.098
Intel 17.1.132
Clang 3.5.2
Clang 3.6.1
Clang 3.7.1
Clang 3.8.1
Clang 3.9.0
PGI 17.1
Primary tested compilers on Power 8 are:
GCC 5.4.0 (OpenMP,Serial)
IBM XL 13.1.3 (OpenMP, Serial) (There is a workaround in place to avoid a compiler bug)
Primary tested compilers on Intel KNL are:
GCC 6.2.0
Intel 16.2.181 (with gcc 4.7.2)
Intel 17.0.098 (with gcc 4.7.2)
Intel 17.1.132 (with gcc 4.9.3)
Intel 17.2.174 (with gcc 4.9.3)
Intel 18.0.061 (beta) (with gcc 4.9.3)
Secondary tested compilers are:
CUDA 7.0 (with gcc 4.7.2)
CUDA 7.5 (with gcc 4.7.2)
CUDA 7.0 (with gcc 4.8.4)
CUDA 7.5 (with gcc 4.8.4)
CUDA 8.0 (with gcc 5.3.0 on X86 and gcc 5.4.0 on Power8)
CUDA/Clang 8.0 using Clang/Trunk compiler
Other compilers working:
X86:
PGI 15.4
Cygwin 2.1.0 64bit with gcc 4.9.3
Known non-working combinations:

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore
LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -547,7 +547,7 @@ namespace Kokkos {
KOKKOS_INLINE_FUNCTION
Random_XorShift64 (uint64_t state, int state_idx = 0)
: state_(state),state_idx_(state_idx){}
: state_(state==0?uint64_t(1318319):state),state_idx_(state_idx){}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
@ -719,6 +719,9 @@ namespace Kokkos {
}
void init(uint64_t seed, int num_states) {
if(seed==0)
seed = uint64_t(1318319);
num_states_ = num_states;
locks_ = lock_type("Kokkos::Random_XorShift64::locks",num_states_);
@ -968,8 +971,9 @@ namespace Kokkos {
inline
void init(uint64_t seed, int num_states) {
if(seed==0)
seed = uint64_t(1318319);
num_states_ = num_states;
locks_ = int_view_type("Kokkos::Random_XorShift1024::locks",num_states_);
state_ = state_data_type("Kokkos::Random_XorShift1024::state",num_states_);
p_ = int_view_type("Kokkos::Random_XorShift1024::p",num_states_);
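The hunks above guard against a degenerate all-zero XorShift state (issue #655: normal() hangs for small seed values). A minimal host-side sketch of the now-safe behavior; the pool type and usage are illustrative:

```c++
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // A seed of 0 is remapped internally to 1318319, so the generator
    // state can never be all zero and normal() no longer hangs.
    Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(0);
    auto gen = pool.get_state();
    double x = gen.normal();
    pool.free_state(gen);
    (void)x;
  }
  Kokkos::finalize();
  return 0;
}
```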

View File

@ -53,69 +53,122 @@ namespace Kokkos {
namespace Impl {
template<class ValuesViewType, int Rank=ValuesViewType::Rank>
template< class DstViewType , class SrcViewType
, int Rank = DstViewType::Rank >
struct CopyOp;
template<class ValuesViewType>
struct CopyOp<ValuesViewType,1> {
template<class DstType, class SrcType>
template< class DstViewType , class SrcViewType >
struct CopyOp<DstViewType,SrcViewType,1> {
KOKKOS_INLINE_FUNCTION
static void copy(DstType& dst, size_t i_dst,
SrcType& src, size_t i_src ) {
static void copy(DstViewType const& dst, size_t i_dst,
SrcViewType const& src, size_t i_src ) {
dst(i_dst) = src(i_src);
}
};
template<class ValuesViewType>
struct CopyOp<ValuesViewType,2> {
template<class DstType, class SrcType>
template< class DstViewType , class SrcViewType >
struct CopyOp<DstViewType,SrcViewType,2> {
KOKKOS_INLINE_FUNCTION
static void copy(DstType& dst, size_t i_dst,
SrcType& src, size_t i_src ) {
for(int j = 0;j< (int) dst.dimension_1(); j++)
static void copy(DstViewType const& dst, size_t i_dst,
SrcViewType const& src, size_t i_src ) {
for(int j = 0;j< (int) dst.extent(1); j++)
dst(i_dst,j) = src(i_src,j);
}
};
template<class ValuesViewType>
struct CopyOp<ValuesViewType,3> {
template<class DstType, class SrcType>
template< class DstViewType , class SrcViewType >
struct CopyOp<DstViewType,SrcViewType,3> {
KOKKOS_INLINE_FUNCTION
static void copy(DstType& dst, size_t i_dst,
SrcType& src, size_t i_src ) {
for(int j = 0; j<dst.dimension_1(); j++)
for(int k = 0; k<dst.dimension_2(); k++)
static void copy(DstViewType const& dst, size_t i_dst,
SrcViewType const& src, size_t i_src ) {
for(int j = 0; j<dst.extent(1); j++)
for(int k = 0; k<dst.extent(2); k++)
dst(i_dst,j,k) = src(i_src,j,k);
}
};
}
template<class KeyViewType, class BinSortOp, class ExecutionSpace = typename KeyViewType::execution_space,
class SizeType = typename KeyViewType::memory_space::size_type>
//----------------------------------------------------------------------------
template< class KeyViewType
, class BinSortOp
, class Space = typename KeyViewType::device_type
, class SizeType = typename KeyViewType::memory_space::size_type
>
class BinSort {
public:
template<class ValuesViewType, class PermuteViewType, class CopyOp>
struct bin_sort_sort_functor {
typedef ExecutionSpace execution_space;
typedef typename ValuesViewType::non_const_type values_view_type;
typedef typename ValuesViewType::const_type const_values_view_type;
Kokkos::View<typename values_view_type::const_data_type,typename values_view_type::array_layout,
typename values_view_type::memory_space,Kokkos::MemoryTraits<Kokkos::RandomAccess> > values;
values_view_type sorted_values;
typename PermuteViewType::const_type sort_order;
bin_sort_sort_functor(const_values_view_type values_, values_view_type sorted_values_, PermuteViewType sort_order_):
values(values_),sorted_values(sorted_values_),sort_order(sort_order_) {}
template< class DstViewType , class SrcViewType >
struct copy_functor {
typedef typename SrcViewType::const_type src_view_type ;
typedef Impl::CopyOp< DstViewType , src_view_type > copy_op ;
DstViewType dst_values ;
src_view_type src_values ;
int dst_offset ;
copy_functor( DstViewType const & dst_values_
, int const & dst_offset_
, SrcViewType const & src_values_
)
: dst_values( dst_values_ )
, src_values( src_values_ )
, dst_offset( dst_offset_ )
{}
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
//printf("Sort: %i %i\n",i,sort_order(i));
CopyOp::copy(sorted_values,i,values,sort_order(i));
// printf("copy: dst(%i) src(%i)\n",i+dst_offset,i);
copy_op::copy(dst_values,i+dst_offset,src_values,i);
}
};
typedef ExecutionSpace execution_space;
template< class DstViewType
, class PermuteViewType
, class SrcViewType
>
struct copy_permute_functor {
// If a Kokkos::View, then we can generate constant random access;
// otherwise we can only use the constant type.
typedef typename std::conditional
< Kokkos::is_view< SrcViewType >::value
, Kokkos::View< typename SrcViewType::const_data_type
, typename SrcViewType::array_layout
, typename SrcViewType::device_type
, Kokkos::MemoryTraits<Kokkos::RandomAccess>
>
, typename SrcViewType::const_type
>::type src_view_type ;
typedef typename PermuteViewType::const_type perm_view_type ;
typedef Impl::CopyOp< DstViewType , src_view_type > copy_op ;
DstViewType dst_values ;
perm_view_type sort_order ;
src_view_type src_values ;
copy_permute_functor( DstViewType const & dst_values_
, PermuteViewType const & sort_order_
, SrcViewType const & src_values_
)
: dst_values( dst_values_ )
, sort_order( sort_order_ )
, src_values( src_values_ )
{}
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
// printf("copy_permute: dst(%i) src(%i)\n",i,sort_order(i));
copy_op::copy(dst_values,i,src_values,sort_order(i));
}
};
typedef typename Space::execution_space execution_space;
typedef BinSortOp bin_op_type;
struct bin_count_tag {};
@ -124,84 +177,137 @@ public:
struct bin_sort_bins_tag {};
public:
typedef SizeType size_type;
typedef size_type value_type;
typedef Kokkos::View<size_type*, execution_space> offset_type;
typedef Kokkos::View<const int*, execution_space> bin_count_type;
typedef Kokkos::View<size_type*, Space> offset_type;
typedef Kokkos::View<const int*, Space> bin_count_type;
typedef typename KeyViewType::const_type const_key_view_type ;
typedef Kokkos::View<typename KeyViewType::const_data_type,
// If a Kokkos::View, then we can generate constant random access;
// otherwise we can only use the constant type.
typedef typename std::conditional
< Kokkos::is_view< KeyViewType >::value
, Kokkos::View< typename KeyViewType::const_data_type,
typename KeyViewType::array_layout,
typename KeyViewType::memory_space> const_key_view_type;
typedef Kokkos::View<typename KeyViewType::const_data_type,
typename KeyViewType::array_layout,
typename KeyViewType::memory_space,
Kokkos::MemoryTraits<Kokkos::RandomAccess> > const_rnd_key_view_type;
typename KeyViewType::device_type,
Kokkos::MemoryTraits<Kokkos::RandomAccess> >
, const_key_view_type
>::type const_rnd_key_view_type;
typedef typename KeyViewType::non_const_value_type non_const_key_scalar;
typedef typename KeyViewType::const_value_type const_key_scalar;
typedef Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic_type ;
private:
const_key_view_type keys;
const_rnd_key_view_type keys_rnd;
public:
BinSortOp bin_op;
offset_type bin_offsets;
BinSortOp bin_op ;
offset_type bin_offsets ;
bin_count_atomic_type bin_count_atomic ;
bin_count_type bin_count_const ;
offset_type sort_order ;
Kokkos::View<int*, ExecutionSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic;
bin_count_type bin_count_const;
offset_type sort_order;
bool sort_within_bins;
int range_begin ;
int range_end ;
bool sort_within_bins ;
public:
// Constructor: takes the keys, the binning_operator and optionally whether to sort within bins (default false)
BinSort(const_key_view_type keys_, BinSortOp bin_op_,
bool sort_within_bins_ = false)
:keys(keys_),keys_rnd(keys_), bin_op(bin_op_) {
BinSort() {}
bin_count_atomic = Kokkos::View<int*, ExecutionSpace >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
//----------------------------------------
// Constructor: takes the keys, the binning_operator and optionally whether to sort within bins (default false)
BinSort( const_key_view_type keys_
, int range_begin_
, int range_end_
, BinSortOp bin_op_
, bool sort_within_bins_ = false
)
: keys(keys_)
, keys_rnd(keys_)
, bin_op(bin_op_)
, bin_offsets()
, bin_count_atomic()
, bin_count_const()
, sort_order()
, range_begin( range_begin_ )
, range_end( range_end_ )
, sort_within_bins( sort_within_bins_ )
{
bin_count_atomic = Kokkos::View<int*, Space >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
bin_count_const = bin_count_atomic;
bin_offsets = offset_type("Kokkos::SortImpl::BinSortFunctor::bin_offsets",bin_op.max_bins());
sort_order = offset_type("PermutationVector",keys.dimension_0());
sort_within_bins = sort_within_bins_;
sort_order = offset_type("PermutationVector",range_end-range_begin);
}
BinSort( const_key_view_type keys_
, BinSortOp bin_op_
, bool sort_within_bins_ = false
)
: BinSort( keys_ , 0 , keys_.extent(0), bin_op_ , sort_within_bins_ ) {}
//----------------------------------------
// Create the permutation vector, the bin_offset array and the bin_count array. Can be called again if the keys change
void create_permute_vector() {
Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_count_tag> (0,keys.dimension_0()),*this);
Kokkos::parallel_scan(Kokkos::RangePolicy<ExecutionSpace,bin_offset_tag> (0,bin_op.max_bins()) ,*this);
const size_t len = range_end - range_begin ;
Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_count_tag> (0,len),*this);
Kokkos::parallel_scan(Kokkos::RangePolicy<execution_space,bin_offset_tag> (0,bin_op.max_bins()) ,*this);
Kokkos::deep_copy(bin_count_atomic,0);
Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_binning_tag> (0,keys.dimension_0()),*this);
Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_binning_tag> (0,len),*this);
if(sort_within_bins)
Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
}
// Sort a view with respect to the first dimension using the permutation array
template<class ValuesViewType>
void sort(ValuesViewType values) {
ValuesViewType sorted_values = ValuesViewType("Copy",
values.dimension_0(),
values.dimension_1(),
values.dimension_2(),
values.dimension_3(),
values.dimension_4(),
values.dimension_5(),
values.dimension_6(),
values.dimension_7());
void sort( ValuesViewType const & values)
{
typedef
Kokkos::View< typename ValuesViewType::data_type,
typename ValuesViewType::array_layout,
typename ValuesViewType::device_type >
scratch_view_type ;
parallel_for(values.dimension_0(),
bin_sort_sort_functor<ValuesViewType, offset_type,
Impl::CopyOp<ValuesViewType> >(values,sorted_values,sort_order));
const size_t len = range_end - range_begin ;
deep_copy(values,sorted_values);
scratch_view_type
sorted_values("Scratch",
len,
values.extent(1),
values.extent(2),
values.extent(3),
values.extent(4),
values.extent(5),
values.extent(6),
values.extent(7));
{
copy_permute_functor< scratch_view_type /* DstViewType */
, offset_type /* PermuteViewType */
, ValuesViewType /* SrcViewType */
>
functor( sorted_values , sort_order , values );
parallel_for( Kokkos::RangePolicy<execution_space>(0,len),functor);
}
{
copy_functor< ValuesViewType , scratch_view_type >
functor( values , range_begin , sorted_values );
parallel_for( Kokkos::RangePolicy<execution_space>(0,len),functor);
}
}
// Get the permutation vector
@ -217,9 +323,11 @@ public:
bin_count_type get_bin_count() const {return bin_count_const;}
public:
KOKKOS_INLINE_FUNCTION
void operator() (const bin_count_tag& tag, const int& i) const {
bin_count_atomic(bin_op.bin(keys,i))++;
const int j = range_begin + i ;
bin_count_atomic(bin_op.bin(keys,j))++;
}
KOKKOS_INLINE_FUNCTION
@ -232,10 +340,11 @@ public:
KOKKOS_INLINE_FUNCTION
void operator() (const bin_binning_tag& tag, const int& i) const {
const int bin = bin_op.bin(keys,i);
const int j = range_begin + i ;
const int bin = bin_op.bin(keys,j);
const int count = bin_count_atomic(bin)++;
sort_order(bin_offsets(bin) + count) = i;
sort_order(bin_offsets(bin) + count) = j ;
}
KOKKOS_INLINE_FUNCTION
@ -262,13 +371,19 @@ public:
}
};
//----------------------------------------------------------------------------
template<class KeyViewType>
struct BinOp1D {
const int max_bins_;
const double mul_;
int max_bins_;
double mul_;
typename KeyViewType::const_value_type range_;
typename KeyViewType::const_value_type min_;
BinOp1D():max_bins_(0),mul_(0.0),
range_(typename KeyViewType::const_value_type()),
min_(typename KeyViewType::const_value_type()) {}
//Construct BinOp with number of bins, minimum value and maximum value
BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
typename KeyViewType::const_value_type max )
@ -302,12 +417,14 @@ struct BinOp3D {
typename KeyViewType::non_const_value_type range_[3];
typename KeyViewType::non_const_value_type min_[3];
BinOp3D() {}
BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
typename KeyViewType::const_value_type max[] )
{
max_bins_[0] = max_bins__[0]+1;
max_bins_[1] = max_bins__[1]+1;
max_bins_[2] = max_bins__[2]+1;
max_bins_[0] = max_bins__[0];
max_bins_[1] = max_bins__[1];
max_bins_[2] = max_bins__[2];
mul_[0] = 1.0*max_bins__[0]/(max[0]-min[0]);
mul_[1] = 1.0*max_bins__[1]/(max[1]-min[1]);
mul_[2] = 1.0*max_bins__[2]/(max[2]-min[2]);
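Putting the pieces together, a hedged sketch of driving BinSort by hand with a 1-D binning operator (the key range and bin count are assumptions for illustration):

```c++
#include <Kokkos_Core.hpp>
#include <Kokkos_Sort.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    typedef Kokkos::View<float*> KeyView;
    typedef Kokkos::BinOp1D<KeyView> CompType;

    KeyView keys("keys", 1000);
    // ... fill keys with values in [0, 100) ...

    // 64 bins over [0, 100); also sort keys within each bin.
    Kokkos::BinSort<KeyView, CompType> bin_sort(keys, CompType(64, 0.0f, 100.0f), true);
    bin_sort.create_permute_vector();
    bin_sort.sort(keys);
  }
  Kokkos::finalize();
  return 0;
}
```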
@ -364,7 +481,7 @@ bool try_std_sort(ViewType view) {
possible = possible && (ViewType::Rank == 1);
possible = possible && (stride[0] == 1);
if(possible) {
std::sort(view.ptr_on_device(),view.ptr_on_device()+view.dimension_0());
std::sort(view.data(),view.data()+view.extent(0));
}
return possible;
}
@ -386,7 +503,8 @@ struct min_max_functor {
}
template<class ViewType>
void sort(ViewType view, bool always_use_kokkos_sort = false) {
void sort( ViewType const & view , bool const always_use_kokkos_sort = false)
{
if(!always_use_kokkos_sort) {
if(Impl::try_std_sort(view)) return;
}
@ -394,14 +512,37 @@ void sort(ViewType view, bool always_use_kokkos_sort = false) {
Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
parallel_reduce(Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.dimension_0()),
parallel_reduce(Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.extent(0)),
Impl::min_max_functor<ViewType>(view),reducer);
if(result.min_val == result.max_val) return;
BinSort<ViewType, CompType> bin_sort(view,CompType(view.dimension_0()/2,result.min_val,result.max_val),true);
BinSort<ViewType, CompType> bin_sort(view,CompType(view.extent(0)/2,result.min_val,result.max_val),true);
bin_sort.create_permute_vector();
bin_sort.sort(view);
}
template<class ViewType>
void sort( ViewType view
, size_t const begin
, size_t const end
)
{
typedef Kokkos::RangePolicy<typename ViewType::execution_space> range_policy ;
typedef BinOp1D<ViewType> CompType;
Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
parallel_reduce( range_policy( begin , end )
, Impl::min_max_functor<ViewType>(view),reducer );
if(result.min_val == result.max_val) return;
BinSort<ViewType, CompType>
bin_sort(view,begin,end,CompType((end-begin)/2,result.min_val,result.max_val),true);
bin_sort.create_permute_vector();
bin_sort.sort(view);
}
}
#endif
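A short usage sketch of the new subrange overload added above (view size and bounds are illustrative):

```c++
#include <Kokkos_Core.hpp>
#include <Kokkos_Sort.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<int*> v("v", 100);
    // ... fill v ...

    Kokkos::sort(v, 10, 90);  // new: sort only the half-open range [10, 90)
    Kokkos::sort(v);          // existing: sort the whole view
  }
  Kokkos::finalize();
  return 0;
}
```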

View File

@ -44,6 +44,7 @@
#include <gtest/gtest.h>
#include<Kokkos_Core.hpp>
#include<Kokkos_DynamicView.hpp>
#include<Kokkos_Random.hpp>
#include<Kokkos_Sort.hpp>
@ -192,17 +193,81 @@ void test_3D_sort(unsigned int n) {
double epsilon = 1e-10;
unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
if ( sort_fails )
printf("3D Sort Sum: %f %f Fails: %u\n",sum_before,sum_after,sort_fails);
ASSERT_EQ(sort_fails,0);
ASSERT_EQ(equal_sum,1);
}
//----------------------------------------------------------------------------
template<class ExecutionSpace, typename KeyType>
void test_dynamic_view_sort(unsigned int n )
{
typedef typename ExecutionSpace::memory_space memory_space ;
typedef Kokkos::Experimental::DynamicView<KeyType*,ExecutionSpace> KeyDynamicViewType;
typedef Kokkos::View<KeyType*,ExecutionSpace> KeyViewType;
const size_t upper_bound = 2 * n ;
typename KeyDynamicViewType::memory_pool
pool( memory_space() , 2 * n * sizeof(KeyType) );
KeyDynamicViewType keys("Keys",pool,upper_bound);
keys.resize_serial(n);
KeyViewType keys_view("KeysTmp", n );
// Test sorting array with all numbers equal
Kokkos::deep_copy(keys_view,KeyType(1));
Kokkos::Experimental::deep_copy(keys,keys_view);
Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
Kokkos::fill_random(keys_view,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
Kokkos::Experimental::deep_copy(keys,keys_view);
double sum_before = 0.0;
double sum_after = 0.0;
unsigned int sort_fails = 0;
Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_before);
Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
Kokkos::Experimental::deep_copy( keys_view , keys );
Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_after);
Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys_view),sort_fails);
double ratio = sum_before/sum_after;
double epsilon = 1e-10;
unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
if ( sort_fails != 0 || equal_sum != 1 ) {
std::cout << " N = " << n
<< " ; sum_before = " << sum_before
<< " ; sum_after = " << sum_after
<< " ; ratio = " << ratio
<< std::endl ;
}
ASSERT_EQ(sort_fails,0);
ASSERT_EQ(equal_sum,1);
}
//----------------------------------------------------------------------------
template<class ExecutionSpace, typename KeyType>
void test_sort(unsigned int N)
{
test_1D_sort<ExecutionSpace,KeyType>(N*N*N, true);
test_1D_sort<ExecutionSpace,KeyType>(N*N*N, false);
test_3D_sort<ExecutionSpace,KeyType>(N);
test_dynamic_view_sort<ExecutionSpace,KeyType>(N*N);
}
}

View File

@ -140,6 +140,9 @@ do
#strip off -pedantic because it produces endless warnings about #LINE added by the preprocessor
-pedantic|-Wpedantic|-ansi)
;;
#strip off -Woverloaded-virtual to avoid "cc1: warning: command line option -Woverloaded-virtual is valid for C++/ObjC++ but not for C"
-Woverloaded-virtual)
;;
#strip -Xcompiler because we add it
-Xcompiler)
if [ $first_xcompiler_arg -eq 1 ]; then
@ -190,7 +193,7 @@ do
object_files_xlinker="$object_files_xlinker -Xlinker $1"
;;
#Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
*.dylib)
@*|*.dylib)
object_files="$object_files -Xlinker $1"
object_files_xlinker="$object_files_xlinker -Xlinker $1"
;;

View File

@ -63,8 +63,7 @@
# Source: https://code.google.com/p/qthreads
#
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREADS
REQUIRED_HEADERS qthread.h
REQUIRED_LIBS_NAMES "qthread"
)

View File

@ -63,8 +63,7 @@
# Source: https://code.google.com/p/qthreads
#
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREADS
REQUIRED_HEADERS qthread.h
REQUIRED_LIBS_NAMES "qthread"
)

View File

@ -6,7 +6,7 @@
#-----------------------------------------------------------------------------
# Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
#
# Cuda, OpenMP, Threads, Qthread, hwloc
# Cuda, OpenMP, Threads, Qthreads, hwloc
#
# module loaded on 'kokkos-dev.sandia.gov' for this build
#
@ -82,13 +82,13 @@ CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=ON"
#-----------------------------------------------------------------------------
# Qthread
# Qthreads
QTHREAD_BASE_DIR="/home/projects/qthreads/2014-07-08/host/gnu/4.7.3"
QTHREADS_BASE_DIR="/home/projects/qthreads/2014-07-08/host/gnu/4.7.3"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_QTHREAD:BOOL=ON"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREAD_INCLUDE_DIRS:FILEPATH=${QTHREAD_BASE_DIR}/include"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREAD_LIBRARY_DIRS:FILEPATH=${QTHREAD_BASE_DIR}/lib"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_QTHREADS:BOOL=ON"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREADS_INCLUDE_DIRS:FILEPATH=${QTHREADS_BASE_DIR}/include"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREADS_LIBRARY_DIRS:FILEPATH=${QTHREADS_BASE_DIR}/lib"
#-----------------------------------------------------------------------------
# C++11
@ -108,6 +108,3 @@ rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
#-----------------------------------------------------------------------------

View File

@ -5,3 +5,4 @@ tag: 2.02.00 date: 10:30:2016 master: 6c90a581 develop: ca3dd56e
tag: 2.02.01 date: 11:01:2016 master: 9c698c86 develop: b0072304
tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966
tag: 2.02.15 date: 02:10:2017 master: 8c64cd93 develop: 28dea8b6
tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641

View File

@ -6,7 +6,7 @@
set -o pipefail
# Determine current machine
# Determine current machine.
MACHINE=""
HOSTNAME=$(hostname)
@ -45,10 +45,11 @@ CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limi
INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
CUDA_WARNING_FLAGS=""
# Default. Machine specific can override
# Default. Machine specific can override.
DEBUG=False
ARGS=""
CUSTOM_BUILD_LIST=""
QTHREADS_PATH=""
DRYRUN=False
BUILD_ONLY=False
declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
@ -60,74 +61,78 @@ PRINT_HELP=False
OPT_FLAG=""
KOKKOS_OPTIONS=""
#
# Handle arguments
# Handle arguments.
#
while [[ $# > 0 ]]
do
key="$1"
case $key in
--kokkos-path*)
KOKKOS_PATH="${key#*=}"
;;
--build-list*)
CUSTOM_BUILD_LIST="${key#*=}"
;;
--debug*)
DEBUG=True
;;
--build-only*)
BUILD_ONLY=True
;;
--test-script*)
TEST_SCRIPT=True
;;
--skip-hwloc*)
SKIP_HWLOC=True
;;
--num*)
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
;;
--dry-run*)
DRYRUN=True
;;
--spot-check*)
SPOT_CHECK=True
;;
--arch*)
ARCH_FLAG="--arch=${key#*=}"
;;
--opt-flag*)
OPT_FLAG="${key#*=}"
;;
--with-cuda-options*)
KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
;;
--help*)
PRINT_HELP=True
;;
*)
# args, just append
ARGS="$ARGS $1"
;;
esac
shift
key="$1"
case $key in
--kokkos-path*)
KOKKOS_PATH="${key#*=}"
;;
--qthreads-path*)
QTHREADS_PATH="${key#*=}"
;;
--build-list*)
CUSTOM_BUILD_LIST="${key#*=}"
;;
--debug*)
DEBUG=True
;;
--build-only*)
BUILD_ONLY=True
;;
--test-script*)
TEST_SCRIPT=True
;;
--skip-hwloc*)
SKIP_HWLOC=True
;;
--num*)
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
;;
--dry-run*)
DRYRUN=True
;;
--spot-check*)
SPOT_CHECK=True
;;
--arch*)
ARCH_FLAG="--arch=${key#*=}"
;;
--opt-flag*)
OPT_FLAG="${key#*=}"
;;
--with-cuda-options*)
KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
;;
--help*)
PRINT_HELP=True
;;
*)
# args, just append
ARGS="$ARGS $1"
;;
esac
shift
done
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
# set kokkos path
# Set kokkos path.
if [ -z "$KOKKOS_PATH" ]; then
KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
else
# Ensure KOKKOS_PATH is abs path
# Ensure KOKKOS_PATH is abs path.
KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
fi
#
# Machine specific config
# Machine specific config.
#
if [ "$MACHINE" = "sems" ]; then
@ -153,21 +158,17 @@ if [ "$MACHINE" = "sems" ]; then
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
fi
elif [ "$MACHINE" = "white" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
@ -177,7 +178,7 @@ elif [ "$MACHINE" = "white" ]; then
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
# Don't do pthread on white
# Don't do pthread on white.
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
@ -185,9 +186,11 @@ elif [ "$MACHINE" = "white" ]; then
"ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
"cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
if [ -z "$ARCH_FLAG" ]; then
ARCH_FLAG="--arch=Power8,Kepler37"
fi
NUM_JOBS_TO_RUN_IN_PARALLEL=2
elif [ "$MACHINE" = "bowman" ]; then
@ -300,14 +303,14 @@ elif [ "$MACHINE" = "apollo" ]; then
if [ -z "$ARCH_FLAG" ]; then
ARCH_FLAG="--arch=SNB,Kepler35"
fi
NUM_JOBS_TO_RUN_IN_PARALLEL=2
else
echo "Unhandled machine $MACHINE" >&2
exit 1
fi
export OMP_NUM_THREADS=4
declare -i NUM_RESULTS_TO_KEEP=7
@ -315,76 +318,78 @@ declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
if [ "$PRINT_HELP" = "True" ]; then
echo "test_all_sandia <ARGS> <OPTIONS>:"
echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
echo " Defaults to root repo containing this script"
echo "--debug: Run tests in debug. Defaults to False"
echo "--test-script: Test this script, not Kokkos"
echo "--skip-hwloc: Do not do hwloc tests"
echo "--num=N: Number of jobs to run in parallel"
echo "--spot-check: Minimal test set to issue pull request"
echo "--dry-run: Just print what would be executed"
echo "--build-only: Just do builds, don't run anything"
echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
echo "--arch=ARCHITECTURE: overwrite architecture flags"
echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
echo "--build-list=BUILD,BUILD,BUILD..."
echo " Provide a comma-separated list of builds instead of running all builds"
echo " Valid items:"
echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
echo ""
echo "test_all_sandia <ARGS> <OPTIONS>:"
echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
echo " Defaults to root repo containing this script"
echo "--debug: Run tests in debug. Defaults to False"
echo "--test-script: Test this script, not Kokkos"
echo "--skip-hwloc: Do not do hwloc tests"
echo "--num=N: Number of jobs to run in parallel"
echo "--spot-check: Minimal test set to issue pull request"
echo "--dry-run: Just print what would be executed"
echo "--build-only: Just do builds, don't run anything"
echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
echo "--arch=ARCHITECTURE: overwrite architecture flags"
echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
echo "--build-list=BUILD,BUILD,BUILD..."
echo " Provide a comma-separated list of builds instead of running all builds"
echo " Valid items:"
echo " OpenMP, Pthread, Qthreads, Serial, OpenMP_Serial, Pthread_Serial"
echo " Qthreads_Serial, Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
echo ""
echo "ARGS: list of expressions matching compilers to test"
echo " supported compilers sems"
for COMPILER_DATA in "${COMPILERS[@]}"; do
echo "ARGS: list of expressions matching compilers to test"
echo " supported compilers sems"
for COMPILER_DATA in "${COMPILERS[@]}"; do
ARR=($COMPILER_DATA)
COMPILER=${ARR[0]}
echo " $COMPILER"
done
echo ""
done
echo ""
echo "Examples:"
echo " Run all tests"
echo " % test_all_sandia"
echo ""
echo " Run all gcc tests"
echo " % test_all_sandia gcc"
echo ""
echo " Run all gcc/4.7.2 and all intel tests"
echo " % test_all_sandia gcc/4.7.2 intel"
echo ""
echo " Run all tests in debug"
echo " % test_all_sandia --debug"
echo ""
echo " Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
echo " % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
echo ""
echo "If you want to kill the tests, do:"
echo " hit ctrl-z"
echo " % kill -9 %1"
echo
exit 0
echo "Examples:"
echo " Run all tests"
echo " % test_all_sandia"
echo ""
echo " Run all gcc tests"
echo " % test_all_sandia gcc"
echo ""
echo " Run all gcc/4.7.2 and all intel tests"
echo " % test_all_sandia gcc/4.7.2 intel"
echo ""
echo " Run all tests in debug"
echo " % test_all_sandia --debug"
echo ""
echo " Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
echo " % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
echo ""
echo "If you want to kill the tests, do:"
echo " hit ctrl-z"
echo " % kill -9 %1"
echo
exit 0
fi
# set build type
# Set build type.
if [ "$DEBUG" = "True" ]; then
BUILD_TYPE=debug
else
BUILD_TYPE=release
fi
# If no args provided, do all compilers
# If no args provided, do all compilers.
if [ -z "$ARGS" ]; then
ARGS='?'
fi
# Process args to figure out which compilers to test
# Process args to figure out which compilers to test.
COMPILERS_TO_TEST=""
for ARG in $ARGS; do
for COMPILER_DATA in "${COMPILERS[@]}"; do
ARR=($COMPILER_DATA)
COMPILER=${ARR[0]}
if [[ "$COMPILER" = $ARG* ]]; then
if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then
COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER"
@ -395,8 +400,35 @@ for ARG in $ARGS; do
done
done
# Check if Qthreads build requested.
HAVE_QTHREADS_BUILD="False"
if [ -n "$CUSTOM_BUILD_LIST" ]; then
if [[ "$CUSTOM_BUILD_LIST" = *Qthreads* ]]; then
HAVE_QTHREADS_BUILD="True"
fi
else
for COMPILER_DATA in "${COMPILERS[@]}"; do
ARR=($COMPILER_DATA)
BUILD_LIST=${ARR[2]}
if [[ "$BUILD_LIST" = *Qthreads* ]]; then
HAVE_QTHREADS_BUILD="True"
fi
done
fi
# Ensure Qthreads path is set if Qthreads build is requested.
if [ "$HAVE_QTHREADS_BUILD" = "True" ]; then
if [ -z "$QTHREADS_PATH" ]; then
echo "Need to supply Qthreads path (--qthreads-path) when testing Qthreads backend." >&2
exit 1
else
# Strip trailing slashes from path.
QTHREADS_PATH=$(echo $QTHREADS_PATH | sed 's/\/*$//')
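      # Illustrative example: --qthreads-path=/opt/qthreads/// becomes /opt/qthreads.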
fi
fi
#
# Functions
# Functions.
#
# get_compiler_name <COMPILER>
@ -409,7 +441,7 @@ get_compiler_version() {
echo $1 | cut -d/ -f2
}
# Do not call directly
# Do not call directly.
get_compiler_data() {
local compiler=$1
local item=$2
@ -419,13 +451,14 @@ get_compiler_data() {
local compiler_data
for compiler_data in "${COMPILERS[@]}" ; do
local arr=($compiler_data)
if [ "$compiler" = "${arr[0]}" ]; then
echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g"
return 0
fi
done
# Not found
# Not found.
echo "Unreconized compiler $compiler" >&2
exit 1
}
@ -459,14 +492,14 @@ run_cmd() {
# report_and_log_test_results <SUCCESS> <DESC> <COMMENT>
report_and_log_test_result() {
# Use sane var names
# Use sane var names.
local success=$1; local desc=$2; local comment=$3;
if [ "$success" = "0" ]; then
echo " PASSED $desc"
echo $comment > $PASSED_DIR/$desc
else
# For failures, comment should be the name of the phase that failed
# For failures, comment should be the name of the phase that failed.
echo " FAILED $desc" >&2
echo $comment > $FAILED_DIR/$desc
cat ${desc}.${comment}.log
@ -494,16 +527,16 @@ setup_env() {
# single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE>
single_build_and_test() {
# Use sane var names
# Use sane var names.
local compiler=$1; local build=$2; local build_type=$3;
# set up env
# Set up env.
mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type"
cd $ROOT_DIR/$compiler/"${build}-$build_type"
local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
# Set up flags
# Set up flags.
local compiler_warning_flags=$(get_compiler_warning_flags $compiler)
local compiler_exe=$(get_compiler_exe_name $compiler)
@ -511,6 +544,14 @@ single_build_and_test() {
local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
fi
if [[ "$build" = *Qthreads* ]]; then
if [[ "$build_type" = hwloc* ]]; then
local extra_args="$extra_args --qthreads-path=${QTHREADS_PATH}_hwloc"
else
local extra_args="$extra_args --qthreads-path=$QTHREADS_PATH"
fi
fi
if [[ "$OPT_FLAG" = "" ]]; then
OPT_FLAG="-O3"
fi
@ -522,11 +563,6 @@ single_build_and_test() {
local cxxflags="$OPT_FLAG $compiler_warning_flags"
fi
if [[ "$compiler" == cuda* ]]; then
cxxflags="--keep --keep-dir=$(pwd) $cxxflags"
export TMPDIR=$(pwd)
fi
if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
fi
@ -538,6 +574,7 @@ single_build_and_test() {
if [ "$TEST_SCRIPT" = "True" ]; then
local rand=$[ 1 + $[ RANDOM % 10 ]]
sleep $rand
if [ $rand -gt 5 ]; then
run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
fi
@ -547,6 +584,7 @@ single_build_and_test() {
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
local -i build_end_time=$(date +%s)
comment="build_time=$(($build_end_time-$build_start_time))"
if [[ "$BUILD_ONLY" == False ]]; then
run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
local -i run_end_time=$(date +%s)
@ -576,7 +614,7 @@ run_in_background() {
local compiler=$1
local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
# don't override command line input
# Don't override command line input.
# if [[ "$BUILD_ONLY" == True ]]; then
# num_jobs=8
# else
@ -591,7 +629,7 @@ run_in_background() {
# build_and_test_all <COMPILER>
build_and_test_all() {
# Get compiler data
# Get compiler data.
local compiler=$1
if [ -z "$CUSTOM_BUILD_LIST" ]; then
local compiler_build_list=$(get_compiler_build_list $compiler)
@ -599,13 +637,13 @@ build_and_test_all() {
local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ')
fi
# do builds
# Do builds.
local build
for build in $compiler_build_list
do
run_in_background $compiler $build $BUILD_TYPE
# If not cuda, do a hwloc test too
# If not cuda, do a hwloc test too.
if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
run_in_background $compiler $build "hwloc-$BUILD_TYPE"
fi
@ -655,7 +693,7 @@ wait_summarize_and_exit() {
}
#
# Main
# Main.
#
ROOT_DIR=$(get_test_root_dir)

View File

@ -60,7 +60,7 @@ class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
{
public:
typedef ViewTraits< DataType , P ... > traits ;
typedef Kokkos::ViewTraits< DataType , P ... > traits ;
private:
@ -123,30 +123,41 @@ public:
enum { Rank = 1 };
KOKKOS_INLINE_FUNCTION constexpr size_t size() const
KOKKOS_INLINE_FUNCTION
size_t size() const noexcept
{
return
Kokkos::Impl::MemorySpaceAccess
uintptr_t n = 0 ;
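// The runtime chunk count is stored one past the chunk pointer array, at
// m_chunks[m_chunk_max].  Read it directly when this execution space can
// access the view's memory space; otherwise (host code querying a device
// view) deep-copy that single word back to the host below.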
if ( Kokkos::Impl::MemorySpaceAccess
< Kokkos::Impl::ActiveExecutionMemorySpace
, typename traits::memory_space
>::accessible
? // Runtime size is at the end of the chunk pointer array
(*reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max ))
<< m_chunk_shift
: 0 ;
>::accessible ) {
n = *reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max );
}
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
else {
Kokkos::Impl::DeepCopy< Kokkos::HostSpace
, typename traits::memory_space
, Kokkos::HostSpace::execution_space >
( & n
, reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max )
, sizeof(uintptr_t) );
}
#endif
return n << m_chunk_shift ;
}
template< typename iType >
KOKKOS_INLINE_FUNCTION constexpr
KOKKOS_INLINE_FUNCTION
size_t extent( const iType & r ) const
{ return r == 0 ? size() : 1 ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION constexpr
KOKKOS_INLINE_FUNCTION
size_t extent_int( const iType & r ) const
{ return r == 0 ? size() : 1 ; }
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return size(); }
KOKKOS_INLINE_FUNCTION size_t dimension_0() const { return size(); }
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return 1 ; }
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return 1 ; }
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return 1 ; }
@ -270,10 +281,18 @@ public:
}
/** \brief Resizing in serial can grow or shrink the array size. */
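// This overload is enabled (via enable_if) only when the host can access
// the view's memory space; a device-dispatched overload follows below.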
template< typename IntType >
inline
void resize_serial( size_t n )
typename std::enable_if
< std::is_integral<IntType>::value &&
Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace
, typename traits::memory_space
>::accessible
>::type
resize_serial( IntType const & n )
{
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
typedef typename traits::value_type value_type ;
typedef value_type * pointer_type ;
const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
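// NC: number of chunks required to hold n elements, rounded up.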
@ -286,8 +305,8 @@ public:
if ( *pc < NC ) {
while ( *pc < NC ) {
m_chunks[*pc] =
m_pool.allocate( sizeof(traits::value_type) << m_chunk_shift );
m_chunks[*pc] = reinterpret_cast<pointer_type>
( m_pool.allocate( sizeof(value_type) << m_chunk_shift ) );
++*pc ;
}
}
@ -295,12 +314,90 @@ public:
while ( NC + 1 <= *pc ) {
--*pc ;
m_pool.deallocate( m_chunks[*pc]
, sizeof(traits::value_type) << m_chunk_shift );
, sizeof(value_type) << m_chunk_shift );
m_chunks[*pc] = 0 ;
}
}
}
//----------------------------------------
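// When the host cannot access the view's memory space (e.g. CudaSpace),
// resize_serial cannot manipulate the chunk array directly.  This functor
// performs the same grow/shrink loop, but is dispatched as a one-element
// parallel_for on the view's execution space (see resize_serial below).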
struct ResizeSerial {
memory_pool m_pool ;
typename traits::value_type ** m_chunks ;
uintptr_t * m_pc ;
uintptr_t m_nc ;
unsigned m_chunk_shift ;
KOKKOS_INLINE_FUNCTION
void operator()( int ) const
{
typedef typename traits::value_type value_type ;
typedef value_type * pointer_type ;
if ( *m_pc < m_nc ) {
while ( *m_pc < m_nc ) {
m_chunks[*m_pc] = reinterpret_cast<pointer_type>
( m_pool.allocate( sizeof(value_type) << m_chunk_shift ) );
++*m_pc ;
}
}
else {
while ( m_nc + 1 <= *m_pc ) {
--*m_pc ;
m_pool.deallocate( m_chunks[*m_pc]
, sizeof(value_type) << m_chunk_shift );
m_chunks[*m_pc] = 0 ;
}
}
}
ResizeSerial( memory_pool const & arg_pool
, typename traits::value_type ** arg_chunks
, uintptr_t * arg_pc
, uintptr_t arg_nc
, unsigned arg_chunk_shift
)
: m_pool( arg_pool )
, m_chunks( arg_chunks )
, m_pc( arg_pc )
, m_nc( arg_nc )
, m_chunk_shift( arg_chunk_shift )
{}
};
template< typename IntType >
inline
typename std::enable_if
< std::is_integral<IntType>::value &&
! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace
, typename traits::memory_space
>::accessible
>::type
resize_serial( IntType const & n )
{
const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
if ( m_chunk_max < NC ) {
Kokkos::abort("DynamicView::resize_serial exceeded maximum size");
}
// Must dispatch the resize as a kernel on the view's execution space.
typedef Kokkos::RangePolicy< typename traits::execution_space > Range ;
uintptr_t * const pc =
reinterpret_cast<uintptr_t*>( m_chunks + m_chunk_max );
Kokkos::Impl::ParallelFor<ResizeSerial,Range>
closure( ResizeSerial( m_pool, m_chunks, pc, NC, m_chunk_shift )
, Range(0,1) );
closure.execute();
traits::execution_space::fence();
}
//----------------------------------------------------------------------
~DynamicView() = default ;
@ -311,15 +408,17 @@ public:
DynamicView & operator = ( const DynamicView & ) = default ;
template< class RT , class ... RP >
KOKKOS_INLINE_FUNCTION
DynamicView( const DynamicView<RT,RP...> & rhs )
: m_pool( rhs.m_pool )
, m_track( rhs.m_track )
, m_chunks( rhs.m_chunks )
, m_chunks( (typename traits::value_type **) rhs.m_chunks )
, m_chunk_shift( rhs.m_chunk_shift )
, m_chunk_mask( rhs.m_chunk_mask )
, m_chunk_max( rhs.m_chunk_max )
{
typedef typename DynamicView<RT,RP...>::traits SrcTraits ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynamicView copy construction" );
}
//----------------------------------------------------------------------
@ -400,8 +499,6 @@ public:
, m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
, m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
{
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
// A functor to deallocate all of the chunks upon final destruction
typedef typename traits::memory_space memory_space ;

View File

@ -230,16 +230,17 @@ public:
typedef typename Impl::remove_const<declared_value_type>::type value_type;
typedef typename Impl::add_const<value_type>::type const_value_type;
typedef Device execution_space;
typedef Device device_type;
typedef typename Device::execution_space execution_space;
typedef Hasher hasher_type;
typedef EqualTo equal_to_type;
typedef uint32_t size_type;
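// Device may now be a full device type or a memory space (see issue #711);
// the views and deep copies below are therefore expressed in terms of
// device_type rather than execution_space.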
//map_types
typedef UnorderedMap<declared_key_type,declared_value_type,execution_space,hasher_type,equal_to_type> declared_map_type;
typedef UnorderedMap<key_type,value_type,execution_space,hasher_type,equal_to_type> insertable_map_type;
typedef UnorderedMap<const_key_type,value_type,execution_space,hasher_type,equal_to_type> modifiable_map_type;
typedef UnorderedMap<const_key_type,const_value_type,execution_space,hasher_type,equal_to_type> const_map_type;
typedef UnorderedMap<declared_key_type,declared_value_type,device_type,hasher_type,equal_to_type> declared_map_type;
typedef UnorderedMap<key_type,value_type,device_type,hasher_type,equal_to_type> insertable_map_type;
typedef UnorderedMap<const_key_type,value_type,device_type,hasher_type,equal_to_type> modifiable_map_type;
typedef UnorderedMap<const_key_type,const_value_type,device_type,hasher_type,equal_to_type> const_map_type;
static const bool is_set = std::is_same<void,value_type>::value;
static const bool has_const_key = std::is_same<const_key_type,declared_key_type>::value;
@ -264,18 +265,18 @@ private:
typedef typename Impl::if_c< is_set, int, declared_value_type>::type impl_value_type;
typedef typename Impl::if_c< is_insertable_map
, View< key_type *, execution_space>
, View< const key_type *, execution_space, MemoryTraits<RandomAccess> >
, View< key_type *, device_type>
, View< const key_type *, device_type, MemoryTraits<RandomAccess> >
>::type key_type_view;
typedef typename Impl::if_c< is_insertable_map || is_modifiable_map
, View< impl_value_type *, execution_space>
, View< const impl_value_type *, execution_space, MemoryTraits<RandomAccess> >
, View< impl_value_type *, device_type>
, View< const impl_value_type *, device_type, MemoryTraits<RandomAccess> >
>::type value_type_view;
typedef typename Impl::if_c< is_insertable_map
, View< size_type *, execution_space>
, View< const size_type *, execution_space, MemoryTraits<RandomAccess> >
, View< size_type *, device_type>
, View< const size_type *, device_type, MemoryTraits<RandomAccess> >
>::type size_type_view;
typedef typename Impl::if_c< is_insertable_map
@ -285,7 +286,7 @@ private:
enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 };
enum { num_scalars = 3 };
typedef View< int[num_scalars], LayoutLeft, execution_space> scalars_view;
typedef View< int[num_scalars], LayoutLeft, device_type> scalars_view;
public:
//! \name Public member functions
@ -757,7 +758,7 @@ public:
Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes);
typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, typename SDevice::memory_space > raw_deep_copy;
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, typename SDevice::memory_space > raw_deep_copy;
raw_deep_copy(tmp.m_hash_lists.ptr_on_device(), src.m_hash_lists.ptr_on_device(), sizeof(size_type)*src.m_hash_lists.dimension_0());
raw_deep_copy(tmp.m_next_index.ptr_on_device(), src.m_next_index.ptr_on_device(), sizeof(size_type)*src.m_next_index.dimension_0());
@ -781,21 +782,21 @@ private: // private member functions
void set_flag(int flag) const
{
typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
const int true_ = true;
raw_deep_copy(m_scalars.ptr_on_device() + flag, &true_, sizeof(int));
}
void reset_flag(int flag) const
{
typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
const int false_ = false;
raw_deep_copy(m_scalars.ptr_on_device() + flag, &false_, sizeof(int));
}
bool get_flag(int flag) const
{
typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename execution_space::memory_space > raw_deep_copy;
typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > raw_deep_copy;
int result = false;
raw_deep_copy(&result, m_scalars.ptr_on_device() + flag, sizeof(int));
return result;

View File

@ -3,38 +3,49 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
SET(SOURCES
UnitTestMain.cpp
TestCuda.cpp
)
SET(LIBRARIES kokkoscore)
IF(Kokkos_ENABLE_Pthread)
LIST( APPEND SOURCES
TestThreads.cpp
)
ENDIF()
IF(Kokkos_ENABLE_Serial)
LIST( APPEND SOURCES
TestSerial.cpp
)
ENDIF()
IF(Kokkos_ENABLE_OpenMP)
LIST( APPEND SOURCES
TestOpenMP.cpp
)
ENDIF()
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest
SOURCES ${SOURCES}
UnitTest_Threads
SOURCES TestThreads.cpp UnitTestMain.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
ENDIF()
IF(Kokkos_ENABLE_Serial)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_Serial
SOURCES TestSerial.cpp UnitTestMain.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
ENDIF()
IF(Kokkos_ENABLE_OpenMP)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_OpenMP
SOURCES TestOpenMP.cpp UnitTestMain.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
ENDIF()
IF(Kokkos_ENABLE_Cuda)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_Cuda
SOURCES TestCuda.cpp UnitTestMain.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
ENDIF()

View File

@ -64,6 +64,7 @@ struct TestDynamicView
typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type;
typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
typedef typename view_type::const_type const_view_type ;
typedef typename Kokkos::TeamPolicy<execution_space>::member_type member_type ;
typedef double value_type;
@ -136,6 +137,8 @@ struct TestDynamicView
view_type da("A",pool,arg_total_size);
const_view_type ca(da);
// printf("TestDynamicView::run(%d) construct test functor\n",arg_total_size);
TestDynamicView functor(da,arg_total_size);

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREAD DLlib
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREADS DLlib
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -30,7 +30,7 @@
#cmakedefine KOKKOS_HAVE_PTHREAD
#cmakedefine KOKKOS_HAVE_SERIAL
#cmakedefine KOKKOS_HAVE_QTHREAD
#cmakedefine KOKKOS_HAVE_QTHREADS
#cmakedefine KOKKOS_HAVE_Winthread
#cmakedefine KOKKOS_HAVE_OPENMP
#cmakedefine KOKKOS_HAVE_HWLOC

View File

@ -60,4 +60,3 @@ clean: kokkos-clean
gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc

View File

@ -52,6 +52,8 @@
#include <impl/Kokkos_Timer.hpp>
#include <PerfTestMDRange.hpp>
#include <PerfTestHexGrad.hpp>
#include <PerfTestBlasKernels.hpp>
#include <PerfTestGramSchmidt.hpp>
@ -72,6 +74,14 @@ class cuda : public ::testing::Test {
}
};
//TEST_F( cuda, mdrange_lr ) {
// EXPECT_NO_THROW( (run_test_mdrange<Kokkos::Cuda , Kokkos::LayoutRight>( 5, 8, "Kokkos::Cuda" )) );
//}
//TEST_F( cuda, mdrange_ll ) {
// EXPECT_NO_THROW( (run_test_mdrange<Kokkos::Cuda , Kokkos::LayoutLeft>( 5, 8, "Kokkos::Cuda" )) );
//}
TEST_F( cuda, hexgrad )
{
EXPECT_NO_THROW( run_test_hexgrad< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) );

View File

@ -60,6 +60,342 @@ namespace Test {
enum { NUMBER_OF_TRIALS = 5 };
template< class DeviceType , class LayoutType >
void run_test_mdrange( int exp_beg , int exp_end, const char deviceTypeName[], int range_offset = 0, int tile_offset = 0 )
// exp_beg = 6 => 2^6 = 64 is the starting range length.
{
#define MDRANGE_PERFORMANCE_OUTPUT_VERBOSE 0
std::string label_mdrange ;
label_mdrange.append( "\"MDRange< double , " );
label_mdrange.append( deviceTypeName );
label_mdrange.append( " >\"" );
std::string label_range_col2 ;
label_range_col2.append( "\"RangeColTwo< double , " );
label_range_col2.append( deviceTypeName );
label_range_col2.append( " >\"" );
std::string label_range_col_all ;
label_range_col_all.append( "\"RangeColAll< double , " );
label_range_col_all.append( deviceTypeName );
label_range_col_all.append( " >\"" );
if ( std::is_same<LayoutType, Kokkos::LayoutRight>::value) {
std::cout << "--------------------------------------------------------------\n"
<< "Performance tests for MDRange Layout Right"
<< "\n--------------------------------------------------------------" << std::endl;
} else {
std::cout << "--------------------------------------------------------------\n"
<< "Performance tests for MDRange Layout Left"
<< "\n--------------------------------------------------------------" << std::endl;
}
for (int i = exp_beg ; i < exp_end ; ++i) {
const int range_length = (1<<i) + range_offset;
std::cout << "\n--------------------------------------------------------------\n"
<< "--------------------------------------------------------------\n"
<< "MDRange Test: range bounds: " << range_length << " , " << range_length << " , " << range_length
<< "\n--------------------------------------------------------------\n"
<< "--------------------------------------------------------------\n";
// << std::endl;
int t0_min = 0, t1_min = 0, t2_min = 0;
double seconds_min = 0.0;
// Test 1: The MDRange in full
{
int t0 = 1, t1 = 1, t2 = 1;
int counter = 1;
#if !defined(KOKKOS_HAVE_CUDA)
int min_bnd = 8;
int tfast = range_length;
#else
int min_bnd = 2;
int tfast = 32;
#endif
while ( tfast >= min_bnd ) {
int tmid = min_bnd;
while ( tmid < tfast ) {
t0 = min_bnd;
t1 = tmid;
t2 = tfast;
int t2_rev = min_bnd;
int t1_rev = tmid;
int t0_rev = tfast;
#if defined(KOKKOS_HAVE_CUDA)
//Note: Product of tile sizes must be < 1024 for Cuda
if ( t0*t1*t2 >= 1024 ) {
printf(" Exceeded Cuda tile limits; onto next range set\n\n");
break;
}
#endif
// Run 1 with tiles LayoutRight style
double seconds_1 = 0;
{ seconds_1 = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, t0, t1, t2) ; }
#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
std::cout << label_mdrange
<< " , " << t0 << " , " << t1 << " , " << t2
<< " , " << seconds_1
<< std::endl ;
#endif
if ( counter == 1 ) {
seconds_min = seconds_1;
t0_min = t0;
t1_min = t1;
t2_min = t2;
}
else {
if ( seconds_1 < seconds_min )
{
seconds_min = seconds_1;
t0_min = t0;
t1_min = t1;
t2_min = t2;
}
}
// Run 2 with tiles LayoutLeft style - reverse order of tile dims
double seconds_1rev = 0;
{ seconds_1rev = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, t0_rev, t1_rev, t2_rev) ; }
#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
std::cout << label_mdrange
<< " , " << t0_rev << " , " << t1_rev << " , " << t2_rev
<< " , " << seconds_1rev
<< std::endl ;
#endif
if ( seconds_1rev < seconds_min )
{
seconds_min = seconds_1rev;
t0_min = t0_rev;
t1_min = t1_rev;
t2_min = t2_rev;
}
++counter;
tmid <<= 1;
} //end inner while
tfast >>=1;
} //end outer while
std::cout << "\n"
<< "--------------------------------------------------------------\n"
<< label_mdrange
<< "\n Min values "
<< "\n Range length per dim (3D): " << range_length
<< "\n TileDims: " << t0_min << " , " << t1_min << " , " << t2_min
<< "\n Min time: " << seconds_min
<< "\n---------------------------------------------------------------"
<< std::endl ;
} //end scope
#if !defined(KOKKOS_HAVE_CUDA)
double seconds_min_c = 0.0;
int t0c_min = 0, t1c_min = 0, t2c_min = 0;
int counter = 1;
{
int min_bnd = 8;
// Test 1_c: MDRange with 0 for the 'inner' tile dim; this case utilizes the full span in that direction and should behave similarly to Collapse<2>.
if ( std::is_same<LayoutType, Kokkos::LayoutRight>::value ) {
for ( unsigned int T0 = min_bnd; T0 < static_cast<unsigned int>(range_length); T0<<=1 ) {
for ( unsigned int T1 = min_bnd; T1 < static_cast<unsigned int>(range_length); T1<<=1 ) {
double seconds_c = 0;
{ seconds_c = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, T0, T1, 0) ; }
#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
std::cout << " MDRange LR with '0' tile - collapse-like \n"
<< label_mdrange
<< " , " << T0 << " , " << T1 << " , " << range_length
<< " , " << seconds_c
<< std::endl ;
#endif
t2c_min = range_length;
if ( counter == 1 ) {
seconds_min_c = seconds_c;
t0c_min = T0;
t1c_min = T1;
}
else {
if ( seconds_c < seconds_min_c )
{
seconds_min_c = seconds_c;
t0c_min = T0;
t1c_min = T1;
}
}
++counter;
}
}
}
else {
for ( unsigned int T1 = min_bnd; T1 <= static_cast<unsigned int>(range_length); T1<<=1 ) {
for ( unsigned int T2 = min_bnd; T2 <= static_cast<unsigned int>(range_length); T2<<=1 ) {
double seconds_c = 0;
{ seconds_c = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, 0, T1, T2) ; }
#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
std::cout << " MDRange LL with '0' tile - collapse-like \n"
<< label_mdrange
<< " , " <<range_length << " < " << T1 << " , " << T2
<< " , " << seconds_c
<< std::endl ;
#endif
t0c_min = range_length;
if ( counter == 1 ) {
seconds_min_c = seconds_c;
t1c_min = T1;
t2c_min = T2;
}
else {
if ( seconds_c < seconds_min_c )
{
seconds_min_c = seconds_c;
t1c_min = T1;
t2c_min = T2;
}
}
++counter;
}
}
}
std::cout
// << "--------------------------------------------------------------\n"
<< label_mdrange
<< " Collapse<2> style: "
<< "\n Min values "
<< "\n Range length per dim (3D): " << range_length
<< "\n TileDims: " << t0c_min << " , " << t1c_min << " , " << t2c_min
<< "\n Min time: " << seconds_min_c
<< "\n---------------------------------------------------------------"
<< std::endl ;
} //end scope test 2
#endif
// Test 2: RangePolicy Collapse2 style
double seconds_2 = 0;
{ seconds_2 = RangePolicyCollapseTwo< DeviceType , double , LayoutType >::test_index_collapse_two(range_length,range_length,range_length) ; }
std::cout << label_range_col2
<< " , " << range_length
<< " , " << seconds_2
<< std::endl ;
// Test 3: RangePolicy Collapse all style - not necessary, always slow
/*
double seconds_3 = 0;
{ seconds_3 = RangePolicyCollapseAll< DeviceType , double , LayoutType >::test_collapse_all(range_length,range_length,range_length) ; }
std::cout << label_range_col_all
<< " , " << range_length
<< " , " << seconds_3
<< "\n---------------------------------------------------------------"
<< std::endl ;
*/
// Compare fastest times... will never be collapse all so ignore it
// seconds_min = tiled MDRange
// seconds_min_c = collapse<2>-like MDRange (tiledim = span for fast dim) - only for non-Cuda, else tile too long
// seconds_2 = collapse<2>-style RangePolicy
// seconds_3 = collapse<3>-style RangePolicy
#if !defined(KOKKOS_HAVE_CUDA)
if ( seconds_min < seconds_min_c ) {
if ( seconds_min < seconds_2 ) {
std::cout << "--------------------------------------------------------------\n"
<< " Fastest run: MDRange tiled\n"
<< " Time: " << seconds_min
<< " Difference: " << seconds_2 - seconds_min
<< " Other times: \n"
<< " MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
<< " Collapse2 Range Policy: " << seconds_2 << "\n"
<< "\n--------------------------------------------------------------"
<< "\n--------------------------------------------------------------"
//<< "\n\n"
<< std::endl;
}
else if ( seconds_min > seconds_2 ) {
std::cout << " Fastest run: Collapse2 RangePolicy\n"
<< " Time: " << seconds_2
<< " Difference: " << seconds_min - seconds_2
<< " Other times: \n"
<< " MDrange Tiled: " << seconds_min << "\n"
<< " MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
<< "\n--------------------------------------------------------------"
<< "\n--------------------------------------------------------------"
//<< "\n\n"
<< std::endl;
}
}
else if ( seconds_min > seconds_min_c ) {
if ( seconds_min_c < seconds_2 ) {
std::cout << "--------------------------------------------------------------\n"
<< " Fastest run: MDRange collapse-like (tiledim = span on fast dim) type\n"
<< " Time: " << seconds_min_c
<< " Difference: " << seconds_2 - seconds_min_c
<< " Other times: \n"
<< " MDrange Tiled: " << seconds_min << "\n"
<< " Collapse2 Range Policy: " << seconds_2 << "\n"
<< "\n--------------------------------------------------------------"
<< "\n--------------------------------------------------------------"
//<< "\n\n"
<< std::endl;
}
else if ( seconds_min_c > seconds_2 ) {
std::cout << " Fastest run: Collapse2 RangePolicy\n"
<< " Time: " << seconds_2
<< " Difference: " << seconds_min_c - seconds_2
<< " Other times: \n"
<< " MDrange Tiled: " << seconds_min << "\n"
<< " MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
<< "\n--------------------------------------------------------------"
<< "\n--------------------------------------------------------------"
//<< "\n\n"
<< std::endl;
}
} // end else if
#else
if ( seconds_min < seconds_2 ) {
std::cout << "--------------------------------------------------------------\n"
<< " Fastest run: MDRange tiled\n"
<< " Time: " << seconds_min
<< " Difference: " << seconds_2 - seconds_min
<< " Other times: \n"
<< " Collapse2 Range Policy: " << seconds_2 << "\n"
<< "\n--------------------------------------------------------------"
<< "\n--------------------------------------------------------------"
//<< "\n\n"
<< std::endl;
}
else if ( seconds_min > seconds_2 ) {
std::cout << " Fastest run: Collapse2 RangePolicy\n"
<< " Time: " << seconds_2
<< " Difference: " << seconds_min - seconds_2
<< " Other times: \n"
<< " MDrange Tiled: " << seconds_min << "\n"
<< "\n--------------------------------------------------------------"
<< "\n--------------------------------------------------------------"
//<< "\n\n"
<< std::endl;
}
#endif
} //end for
#undef MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
}
template< class DeviceType >

View File

@ -66,6 +66,8 @@ const char TestHostDeviceName[] = "Kokkos::Serial" ;
#include <impl/Kokkos_Timer.hpp>
#include <PerfTestMDRange.hpp>
#include <PerfTestHexGrad.hpp>
#include <PerfTestBlasKernels.hpp>
#include <PerfTestGramSchmidt.hpp>
@ -102,6 +104,14 @@ protected:
}
};
//TEST_F( host, mdrange_lr ) {
// EXPECT_NO_THROW( (run_test_mdrange<TestHostDevice , Kokkos::LayoutRight> (5, 8, TestHostDeviceName) ) );
//}
//TEST_F( host, mdrange_ll ) {
// EXPECT_NO_THROW( (run_test_mdrange<TestHostDevice , Kokkos::LayoutLeft> (5, 8, TestHostDeviceName) ) );
//}
TEST_F( host, hexgrad ) {
EXPECT_NO_THROW(run_test_hexgrad< TestHostDevice>( 10, 20, TestHostDeviceName ));
}

View File

@ -0,0 +1,564 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
namespace Test {
template< class DeviceType
, typename ScalarType = double
, typename TestLayout = Kokkos::LayoutRight
>
struct MultiDimRangePerf3D
{
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type;
using iterate_type = Kokkos::Experimental::Iterate;
typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
typedef typename view_type::HostMirror host_view_type;
view_type A;
view_type B;
const long irange;
const long jrange;
const long krange;
MultiDimRangePerf3D(const view_type & A_, const view_type & B_, const long &irange_, const long &jrange_, const long &krange_)
: A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const long i, const long j, const long k) const
{
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+ B(i,j+2,k) + B(i,j+1,k)
+ B(i,j,k+2) + B(i,j,k+1)
+ B(i,j,k) );
}
struct InitZeroTag {};
// struct InitViewTag {};
struct Init
{
Init(const view_type & input_, const long &irange_, const long &jrange_, const long &krange_)
: input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const long i, const long j, const long k) const
{
input(i,j,k) = 1.0;
}
KOKKOS_INLINE_FUNCTION
void operator()(const InitZeroTag&, const long i, const long j, const long k) const
{
input(i,j,k) = 0;
}
view_type input;
const long irange;
const long jrange;
const long krange;
};
static double test_multi_index(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const unsigned int Ti = 1, const unsigned int Tj = 1, const unsigned int Tk = 1, const long iter = 1)
{
//This test performs multidim range over all dims
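// icount/jcount/kcount are the per-dimension extents, Ti/Tj/Tk the MDRange
// tile dimensions; the minimum time over 'iter' runs is returned.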
view_type Atest("Atest", icount, jcount, kcount);
view_type Btest("Btest", icount+2, jcount+2, kcount+2);
typedef MultiDimRangePerf3D<execution_space,ScalarType,TestLayout> FunctorType;
double dt_min = 0;
// LayoutRight
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy_initA({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}});
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy_initB({{0,0,0}},{{icount+2,jcount+2,kcount+2}},{{Ti,Tj,Tk}});
typedef typename Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > MDRangeType;
using tile_type = typename MDRangeType::tile_type;
using point_type = typename MDRangeType::point_type;
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
Kokkos::Experimental::md_parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
execution_space::fence();
Kokkos::Experimental::md_parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
execution_space::fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::Experimental::md_parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
execution_space::fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
//Correctness check - only the first run
if ( 0 == i )
{
long numErrors = 0;
host_view_type Ahost("Ahost", icount, jcount, kcount);
Kokkos::deep_copy(Ahost, Atest);
host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
Kokkos::deep_copy(Bhost, Btest);
// On KNL, this check may vectorize - a print statement is added to prevent that.
// Also, compare against epsilon, as vectorization can change the bitwise answer.
for ( long l = 0; l < static_cast<long>(icount); ++l ) {
for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
ScalarType check = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+ Bhost(l,j+2,k) + Bhost(l,j+1,k)
+ Bhost(l,j,k+2) + Bhost(l,j,k+1)
+ Bhost(l,j,k) );
if ( Ahost(l,j,k) - check != 0 ) {
++numErrors;
std::cout << " Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
<< " multi Ahost = " << Ahost(l,j,k) << " expected = " << check
<< " multi Bhost(ijk) = " << Bhost(l,j,k)
<< " multi Bhost(l+1jk) = " << Bhost(l+1,j,k)
<< " multi Bhost(l+2jk) = " << Bhost(l+2,j,k)
<< " multi Bhost(ij+1k) = " << Bhost(l,j+1,k)
<< " multi Bhost(ij+2k) = " << Bhost(l,j+2,k)
<< " multi Bhost(ijk+1) = " << Bhost(l,j,k+1)
<< " multi Bhost(ijk+2) = " << Bhost(l,j,k+2)
<< std::endl;
//exit(-1);
}
} } }
if ( numErrors != 0 ) { std::cout << "LR multi: errors " << numErrors << " range product " << icount*jcount*kcount << " LL " << jcount*kcount << " LR " << icount*jcount << std::endl; }
//else { std::cout << " multi: No errors!" << std::endl; }
}
} //end for
}
// LayoutLeft
else {
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3,iterate_type::Left,iterate_type::Left>, execution_space > policy_initA({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}});
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3,iterate_type::Left,iterate_type::Left>, execution_space > policy_initB({{0,0,0}},{{icount+2,jcount+2,kcount+2}},{{Ti,Tj,Tk}});
//typedef typename Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > MDRangeType;
//using tile_type = typename MDRangeType::tile_type;
//using point_type = typename MDRangeType::point_type;
//Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}} );
Kokkos::Experimental::md_parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
execution_space::fence();
Kokkos::Experimental::md_parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
execution_space::fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::Experimental::md_parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
execution_space::fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
//Correctness check - only the first run
if ( 0 == i )
{
long numErrors = 0;
host_view_type Ahost("Ahost", icount, jcount, kcount);
Kokkos::deep_copy(Ahost, Atest);
host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
Kokkos::deep_copy(Bhost, Btest);
// On KNL, this check may vectorize - a print statement is added to prevent that.
// Also, compare against epsilon, as vectorization can change the bitwise answer.
for ( long l = 0; l < static_cast<long>(icount); ++l ) {
for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
ScalarType check = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+ Bhost(l,j+2,k) + Bhost(l,j+1,k)
+ Bhost(l,j,k+2) + Bhost(l,j,k+1)
+ Bhost(l,j,k) );
if ( Ahost(l,j,k) - check != 0 ) {
++numErrors;
std::cout << " Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
<< " multi Ahost = " << Ahost(l,j,k) << " expected = " << check
<< " multi Bhost(ijk) = " << Bhost(l,j,k)
<< " multi Bhost(l+1jk) = " << Bhost(l+1,j,k)
<< " multi Bhost(l+2jk) = " << Bhost(l+2,j,k)
<< " multi Bhost(ij+1k) = " << Bhost(l,j+1,k)
<< " multi Bhost(ij+2k) = " << Bhost(l,j+2,k)
<< " multi Bhost(ijk+1) = " << Bhost(l,j,k+1)
<< " multi Bhost(ijk+2) = " << Bhost(l,j,k+2)
<< std::endl;
//exit(-1);
}
} } }
if ( numErrors != 0 ) { std::cout << " LL multi run: errors " << numErrors << " range product " << icount*jcount*kcount << " LL " << jcount*kcount << " LR " << icount*jcount << std::endl; }
//else { std::cout << " multi: No errors!" << std::endl; }
}
} //end for
}
return dt_min;
}
};
template< class DeviceType
, typename ScalarType = double
, typename TestLayout = Kokkos::LayoutRight
>
struct RangePolicyCollapseTwo
{
// RangePolicy for 3D range, but will collapse only 2 dims => like Rank<2> for multi-dim; unroll 2 dims in one-dim
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type;
typedef TestLayout layout;
using iterate_type = Kokkos::Experimental::Iterate;
typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
typedef typename view_type::HostMirror host_view_type;
view_type A;
view_type B;
const long irange;
const long jrange;
const long krange;
RangePolicyCollapseTwo(view_type & A_, const view_type & B_, const long &irange_, const long &jrange_, const long &krange_)
: A(A_), B(B_) , irange(irange_), jrange(jrange_), krange(krange_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const long r) const
{
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
{
//id(i,j,k) = k + j*Nk + i*Nk*Nj = k + Nk*(j + i*Nj) = k + Nk*r
//r = j + i*Nj
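// e.g. with jrange = 4: r = 9 decodes to i = 2, j = 1.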
long i = int(r / jrange);
long j = int( r - i*jrange);
for (int k = 0; k < krange; ++k) {
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+ B(i,j+2,k) + B(i,j+1,k)
+ B(i,j,k+2) + B(i,j,k+1)
+ B(i,j,k) );
}
}
else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
{
//id(i,j,k) = i + j*Ni + k*Ni*Nj = i + Ni*(j + k*Nj) = i + Ni*r
//r = j + k*Nj
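// e.g. with jrange = 4: r = 9 decodes to k = 2, j = 1.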
long k = int(r / jrange);
long j = int( r - k*jrange);
for (int i = 0; i < irange; ++i) {
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+ B(i,j+2,k) + B(i,j+1,k)
+ B(i,j,k+2) + B(i,j,k+1)
+ B(i,j,k) );
}
}
}
struct Init
{
view_type input;
const long irange;
const long jrange;
const long krange;
Init(const view_type & input_, const long &irange_, const long &jrange_, const long &krange_)
: input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const long r) const
{
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
{
long i = int(r / jrange);
long j = int( r - i*jrange);
for (int k = 0; k < krange; ++k) {
input(i,j,k) = 1;
}
}
else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
{
long k = int(r / jrange);
long j = int( r - k*jrange);
for (int i = 0; i < irange; ++i) {
input(i,j,k) = 1;
}
}
}
};
static double test_index_collapse_two(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
{
// This test refers to collapsing two dims while using the RangePolicy
view_type Atest("Atest", icount, jcount, kcount);
view_type Btest("Btest", icount+2, jcount+2, kcount+2);
typedef RangePolicyCollapseTwo<execution_space,ScalarType,TestLayout> FunctorType;
long collapse_index_rangeA = 0;
long collapse_index_rangeB = 0;
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
collapse_index_rangeA = icount*jcount;
collapse_index_rangeB = (icount+2)*(jcount+2);
// std::cout << " LayoutRight " << std::endl;
} else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value ) {
collapse_index_rangeA = kcount*jcount;
collapse_index_rangeB = (kcount+2)*(jcount+2);
// std::cout << " LayoutLeft " << std::endl;
} else {
std::cout << " LayoutRight or LayoutLeft required - will pass 0 as range instead " << std::endl;
exit(-1);
}
Kokkos::RangePolicy<execution_space> policy(0, (collapse_index_rangeA) );
Kokkos::RangePolicy<execution_space> policy_initB(0, (collapse_index_rangeB) );
double dt_min = 0;
Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
execution_space::fence();
Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
execution_space::fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
execution_space::fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
//Correctness check - first iteration only
if ( 0 == i )
{
long numErrors = 0;
host_view_type Ahost("Ahost", icount, jcount, kcount);
Kokkos::deep_copy(Ahost, Atest);
host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
Kokkos::deep_copy(Bhost, Btest);
// On KNL, this check may vectorize - a print statement is added to prevent that.
// Also, compare against epsilon, as vectorization can change the bitwise answer.
for ( long l = 0; l < static_cast<long>(icount); ++l ) {
for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
ScalarType check = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+ Bhost(l,j+2,k) + Bhost(l,j+1,k)
+ Bhost(l,j,k+2) + Bhost(l,j,k+1)
+ Bhost(l,j,k) );
if ( Ahost(l,j,k) - check != 0 ) {
++numErrors;
std::cout << " Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
<< " flat Ahost = " << Ahost(l,j,k) << " expected = " << check << std::endl;
//exit(-1);
}
} } }
if ( numErrors != 0 ) { std::cout << " RP collapse2: errors " << numErrors << " range product " << icount*jcount*kcount << " LL " << jcount*kcount << " LR " << icount*jcount << std::endl; }
//else { std::cout << " RP collapse2: Pass! " << std::endl; }
}
}
return dt_min;
}
};
template< class DeviceType
, typename ScalarType = double
, typename TestLayout = Kokkos::LayoutRight
>
struct RangePolicyCollapseAll
{
// RangePolicy for 3D range, but will collapse all dims
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type;
typedef TestLayout layout;
typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
typedef typename view_type::HostMirror host_view_type;
view_type A;
view_type B;
const long irange;
const long jrange;
const long krange;
RangePolicyCollapseAll(view_type & A_, const view_type & B_, const long &irange_, const long &jrange_, const long &krange_)
: A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const long r) const
{
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
{
long i = int(r / (jrange*krange));
long j = int(( r - i*jrange*krange)/krange);
long k = int(r - i*jrange*krange - j*krange);
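// e.g. with jrange = krange = 4: r = 27 decodes to i = 1, j = 2, k = 3.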
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+ B(i,j+2,k) + B(i,j+1,k)
+ B(i,j,k+2) + B(i,j,k+1)
+ B(i,j,k) );
}
else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
{
long k = int(r / (irange*jrange));
long j = int(( r - k*irange*jrange)/irange);
long i = int(r - k*irange*jrange - j*irange);
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+ B(i,j+2,k) + B(i,j+1,k)
+ B(i,j,k+2) + B(i,j,k+1)
+ B(i,j,k) );
}
}
struct Init
{
view_type input;
const long irange;
const long jrange;
const long krange;
Init(const view_type & input_, const long &irange_, const long &jrange_, const long &krange_)
: input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const long r) const
{
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
{
long i = int(r / (jrange*krange));
long j = int(( r - i*jrange*krange)/krange);
long k = int(r - i*jrange*krange - j*krange);
input(i,j,k) = 1;
}
else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
{
long k = int(r / (irange*jrange));
long j = int(( r - k*irange*jrange)/irange);
long i = int(r - k*irange*jrange - j*irange);
input(i,j,k) = 1;
}
}
};
static double test_collapse_all(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
{
//This test refers to collapsing all dims using the RangePolicy
view_type Atest("Atest", icount, jcount, kcount);
view_type Btest("Btest", icount+2, jcount+2, kcount+2);
typedef RangePolicyCollapseAll<execution_space,ScalarType,TestLayout> FunctorType;
const long flat_index_range = icount*jcount*kcount;
Kokkos::RangePolicy<execution_space> policy(0, flat_index_range );
Kokkos::RangePolicy<execution_space> policy_initB(0, (icount+2)*(jcount+2)*(kcount+2) );
double dt_min = 0;
Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
execution_space::fence();
Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
execution_space::fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
execution_space::fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
//Correctness check - first iteration only
if ( 0 == i )
{
long numErrors = 0;
host_view_type Ahost("Ahost", icount, jcount, kcount);
Kokkos::deep_copy(Ahost, Atest);
host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
Kokkos::deep_copy(Bhost, Btest);
// On KNL, this check may vectorize - a print statement is added to prevent that.
// Also, compare against epsilon, as vectorization can change the bitwise answer.
for ( long l = 0; l < static_cast<long>(icount); ++l ) {
for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
ScalarType check = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+ Bhost(l,j+2,k) + Bhost(l,j+1,k)
+ Bhost(l,j,k+2) + Bhost(l,j,k+1)
+ Bhost(l,j,k) );
if ( Ahost(l,j,k) - check != 0 ) {
++numErrors;
std::cout << " Callapse ALL Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
<< " flat Ahost = " << Ahost(l,j,k) << " expected = " << check << std::endl;
//exit(-1);
}
} } }
if ( numErrors != 0 ) { std::cout << " RP collapse all: errors " << numErrors << " range product " << icount*jcount*kcount << " LL " << jcount*kcount << " LR " << icount*jcount << std::endl; }
//else { std::cout << " RP collapse all: Pass! " << std::endl; }
}
}
return dt_min;
}
};
} //end namespace Test

View File

@ -92,13 +92,13 @@ LIST(APPEND SOURCES ${SOURCES_CUDA} )
INSTALL(FILES ${HEADERS_CUDA} DESTINATION ${TRILINOS_INCDIR}/Cuda/)
#-----------------------------------------------------------------------------
FILE(GLOB HEADERS_QTHREAD Qthread/*.hpp)
FILE(GLOB SOURCES_QTHREAD Qthread/*.cpp)
FILE(GLOB HEADERS_QTHREADS Qthreads/*.hpp)
FILE(GLOB SOURCES_QTHREADS Qthreads/*.cpp)
LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREAD} )
LIST(APPEND SOURCES ${SOURCES_QTHREAD} )
LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREADS} )
LIST(APPEND SOURCES ${SOURCES_QTHREADS} )
INSTALL(FILES ${HEADERS_QTHREAD} DESTINATION ${TRILINOS_INCDIR}/Qthread/)
INSTALL(FILES ${HEADERS_QTHREADS} DESTINATION ${TRILINOS_INCDIR}/Qthreads/)
#-----------------------------------------------------------------------------
@ -109,5 +109,3 @@ TRIBITS_ADD_LIBRARY(
SOURCES ${SOURCES}
DEPLIBS
)

File diff suppressed because it is too large

View File

@ -131,6 +131,7 @@ namespace Impl {
int* atomic;
int* scratch;
int* threadid;
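// Capacity of the lock arrays; set at initialization to
// Kokkos::Cuda::concurrency() so device code can wrap its search
// for a free lock slot.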
int n;
};
}
}
@ -250,6 +251,7 @@ struct CudaParallelLaunch< DriverType , true > {
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
@ -292,6 +294,7 @@ struct CudaParallelLaunch< DriverType , false > {
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif

View File

@ -59,7 +59,7 @@
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
@ -375,7 +375,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
SharedAllocationRecord< Kokkos::CudaSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
SharedAllocationHeader header ;
@ -395,7 +395,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::fence(); //Make sure I can access the label ...
Kokkos::Profiling::deallocateData(
@ -412,7 +412,7 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::CudaHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label,
@ -442,7 +442,7 @@ SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
, m_tex_obj( 0 )
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
@ -479,7 +479,7 @@ SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
, m_tex_obj( 0 )
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
@ -510,7 +510,7 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
)
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
@ -883,6 +883,7 @@ void init_lock_arrays_cuda_space() {
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());

View File

@ -536,6 +536,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
}
@ -620,9 +621,9 @@ void CudaInternal::finalize()
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
atomic_lock_array_cuda_space_ptr(false);
scratch_lock_array_cuda_space_ptr(false);
threadid_lock_array_cuda_space_ptr(false);
atomic_lock_array_cuda_space_ptr(true);
scratch_lock_array_cuda_space_ptr(true);
threadid_lock_array_cuda_space_ptr(true);
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
@ -700,7 +701,7 @@ void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
{
Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
@ -739,7 +740,7 @@ void Cuda::finalize()
{
Impl::CudaInternal::singleton().finalize();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}

View File

@ -61,7 +61,7 @@
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Kokkos_Vectorization.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <typeinfo>
#endif
@ -586,13 +586,35 @@ public:
void operator()(void) const
{
// Iterate this block through the league
int threadid = 0;
if ( m_scratch_size[1]>0 ) {
__shared__ int base_thread_id;
if (threadIdx.x==0 && threadIdx.y==0 ) {
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
int done = 0;
while (!done) {
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
if(!done) {
threadid += blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
}
}
base_thread_id = threadid;
}
__syncthreads();
threadid = base_thread_id;
}
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >(
typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
, m_shmem_begin
, m_shmem_size
, m_scratch_ptr[1]
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
, m_scratch_size[1]
, league_rank
, m_league_size ) );
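
The new threadid logic above gives each resident block a private slice of level-1 team scratch by claiming a slot in the global atomic lock array. A simplified standalone sketch with hypothetical names (g_locks, acquire_scratch_slot); the real code additionally rounds the starting index up to a multiple of the block's thread count.

__device__ int g_locks[1024];                     // stands in for kokkos_impl_cuda_lock_arrays.atomic
__device__ int acquire_scratch_slot( int n ) {
  int id = ( blockIdx.x * blockDim.z + threadIdx.z ) % n;  // initial guess for this block
  while ( 0 != atomicCAS( &g_locks[id], 0, 1 ) ) {         // spin until a free slot is won
    id += blockDim.x * blockDim.y;                         // probe the next stripe
    if ( id > n ) id = 0;
  }
  return id;  // index into the scratch allocation; release later with atomicExch(&g_locks[id],0)
}
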
@ -946,11 +968,32 @@ public:
__device__ inline
void operator() () const {
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
int threadid = 0;
if ( m_scratch_size[1]>0 ) {
__shared__ int base_thread_id;
if (threadIdx.x==0 && threadIdx.y==0 ) {
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
int done = 0;
while (!done) {
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
if(!done) {
threadid += blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
}
}
base_thread_id = threadid;
}
__syncthreads();
threadid = base_thread_id;
}
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0), threadid );
}
__device__ inline
void run(const DummySHMEMReductionType&) const
void run(const DummySHMEMReductionType&, const int& threadid) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
@ -964,7 +1007,7 @@ public:
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
, m_shmem_size
, m_scratch_ptr[1]
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
, m_scratch_size[1]
, league_rank
, m_league_size )
@ -992,7 +1035,7 @@ public:
}
__device__ inline
void run(const DummyShflReductionType&) const
void run(const DummyShflReductionType&, const int& threadid) const
{
value_type value;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
@ -1003,7 +1046,7 @@ public:
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
, m_shmem_size
, m_scratch_ptr[1]
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
, m_scratch_size[1]
, league_rank
, m_league_size )
@ -1128,9 +1171,9 @@ public:
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory"));
}
if ( m_team_size >
Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length()) {
if ( unsigned(m_team_size) >
unsigned(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
}
@ -1621,14 +1664,25 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Cuda
#endif
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
/** \brief Intra-thread vector parallel_reduce.
*
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
* Calls lambda(iType i, ValueType & val) for each i in [0..N).
*
* The range [0..N) is mapped to all vector lanes of
* the calling thread and a reduction of val is performed using +=
* and output into result.
*
* The identity value for the += operator is assumed to be the default
* constructed value.
*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
void parallel_reduce
( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
const & loop_boundaries
, Lambda const & lambda
, ValueType & result )
{
#ifdef __CUDA_ARCH__
result = ValueType();
@ -1636,52 +1690,42 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::C
lambda(i,result);
}
if (loop_boundaries.increment > 1)
result += shfl_down(result, 1,loop_boundaries.increment);
if (loop_boundaries.increment > 2)
result += shfl_down(result, 2,loop_boundaries.increment);
if (loop_boundaries.increment > 4)
result += shfl_down(result, 4,loop_boundaries.increment);
if (loop_boundaries.increment > 8)
result += shfl_down(result, 8,loop_boundaries.increment);
if (loop_boundaries.increment > 16)
result += shfl_down(result, 16,loop_boundaries.increment);
Impl::cuda_intra_warp_vector_reduce(
Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >( & result ) );
result = shfl(result,0,loop_boundaries.increment);
#endif
}
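
A minimal usage sketch of this += form, assuming a surrounding TeamPolicy kernel and Views x and y (N, league_size, team_size, and vector_length are all illustrative): each calling thread accumulates a dot product across its vector lanes.

Kokkos::parallel_for( Kokkos::TeamPolicy<>( league_size, team_size, vector_length ),
  KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type & team ) {
    double dot = 0;
    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, N ),
      [=] ( const int i, double & val ) { val += x(i) * y(i); }, dot );
  });
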
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
/** \brief Intra-thread vector parallel_reduce.
*
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
* Calls lambda(iType i, ValueType & val) for each i in [0..N).
*
* The range [0..N) is mapped to all vector lanes of
* the calling thread and a reduction of val is performed
* using JoinType::operator()(ValueType& val, const ValueType& update)
* and output into result.
*
* The input value of result must be the identity value for the
* reduction operation; e.g., ( 0 , += ) or ( 1 , *= ).
*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
void parallel_reduce
( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
const & loop_boundaries
, Lambda const & lambda
, JoinType const & join
, ValueType & result )
{
#ifdef __CUDA_ARCH__
ValueType result = init_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
if (loop_boundaries.increment > 1)
join( result, shfl_down(result, 1,loop_boundaries.increment));
if (loop_boundaries.increment > 2)
join( result, shfl_down(result, 2,loop_boundaries.increment));
if (loop_boundaries.increment > 4)
join( result, shfl_down(result, 4,loop_boundaries.increment));
if (loop_boundaries.increment > 8)
join( result, shfl_down(result, 8,loop_boundaries.increment));
if (loop_boundaries.increment > 16)
join( result, shfl_down(result, 16,loop_boundaries.increment));
Impl::cuda_intra_warp_vector_reduce(
Impl::Reducer< ValueType , JoinType >( join , & result ) );
init_result = shfl(result,0,loop_boundaries.increment);
#endif
}
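
The JoinType form under the same assumptions, here a max-reduction; per the new doc comment, result must start at the identity of the join, so the most negative double (DBL_MAX comes from <cfloat>).

double team_max = -DBL_MAX;   // identity element for max over doubles
Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, N ),
  [=] ( const int i, double & val ) { if ( x(i) > val ) val = x(i); },
  []  ( double & dst, const double & src ) { if ( src > dst ) dst = src; },
  team_max );
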

View File

@ -55,15 +55,163 @@
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
template< typename T >
__device__ inline
void cuda_shfl( T & out , T const & in , int lane ,
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
*reinterpret_cast<int*>(&out) =
__shfl( *reinterpret_cast<int const *>(&in) , lane , width );
}
//Shfl based reductions
template< typename T >
__device__ inline
void cuda_shfl( T & out , T const & in , int lane ,
typename std::enable_if
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
, int >::type width )
{
enum : int { N = sizeof(T) / sizeof(int) };
for ( int i = 0 ; i < N ; ++i ) {
reinterpret_cast<int*>(&out)[i] =
__shfl( reinterpret_cast<int const *>(&in)[i] , lane , width );
}
}
//----------------------------------------------------------------------------
template< typename T >
__device__ inline
void cuda_shfl_down( T & out , T const & in , int delta ,
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
*reinterpret_cast<int*>(&out) =
__shfl_down( *reinterpret_cast<int const *>(&in) , delta , width );
}
template< typename T >
__device__ inline
void cuda_shfl_down( T & out , T const & in , int delta ,
typename std::enable_if
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
, int >::type width )
{
enum : int { N = sizeof(T) / sizeof(int) };
for ( int i = 0 ; i < N ; ++i ) {
reinterpret_cast<int*>(&out)[i] =
__shfl_down( reinterpret_cast<int const *>(&in)[i] , delta , width );
}
}
//----------------------------------------------------------------------------
template< typename T >
__device__ inline
void cuda_shfl_up( T & out , T const & in , int delta ,
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
*reinterpret_cast<int*>(&out) =
__shfl_up( *reinterpret_cast<int const *>(&in) , delta , width );
}
template< typename T >
__device__ inline
void cuda_shfl_up( T & out , T const & in , int delta ,
typename std::enable_if
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
, int >::type width )
{
enum : int { N = sizeof(T) / sizeof(int) };
for ( int i = 0 ; i < N ; ++i ) {
reinterpret_cast<int*>(&out)[i] =
__shfl_up( reinterpret_cast<int const *>(&in)[i] , delta , width );
}
}
//----------------------------------------------------------------------------
/** \brief Reduce within a warp over blockDim.x, the "vector" dimension.
*
* This will be called within a nested, intra-team parallel operation.
* Use shuffle operations to avoid conflicts with shared memory usage.
*
* Requires:
* blockDim.x is power of 2
* blockDim.x <= 32 (one warp)
*
* Cannot use "butterfly" pattern because floating point
* addition is non-associative. Therefore, must broadcast
* the final result.
*/
template< class Reducer >
__device__ inline
void cuda_intra_warp_vector_reduce( Reducer const & reducer )
{
static_assert(
std::is_reference< typename Reducer::reference_type >::value , "" );
if ( 1 < blockDim.x ) {
typename Reducer::value_type tmp ;
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
cuda_shfl_down( tmp , reducer.reference() , i , blockDim.x );
if ( threadIdx.x < i ) { reducer.join( reducer.data() , & tmp ); }
}
// Broadcast from root "lane" to all other "lanes"
cuda_shfl( reducer.reference() , reducer.reference() , 0 , blockDim.x );
}
}
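
A worked trace of the reduction tree, assuming blockDim.x == 4 and lane values {1, 2, 3, 4}; the final broadcast guarantees every lane holds the bitwise-identical sum, which matters precisely because floating point addition is non-associative.

// i = 2: lane 0 joins lane 2 (1+3 = 4), lane 1 joins lane 3 (2+4 = 6)
// i = 1: lane 0 joins lane 1 (4+6 = 10)
// broadcast from lane 0: every lane now holds 10
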
/** \brief Inclusive scan over blockDim.x, the "vector" dimension.
*
* This will be called within a nested, intra-team parallel operation.
* Use shuffle operations to avoid conflicts with shared memory usage.
*
* Algorithm is concurrent bottom-up reductions in triangular pattern
* where each CUDA thread is the root of a reduction tree from the
* zeroth CUDA thread to itself.
*
* Requires:
* blockDim.x is power of 2
* blockDim.x <= 32 (one warp)
*/
template< typename ValueType >
__device__ inline
void cuda_intra_warp_vector_inclusive_scan( ValueType & local )
{
ValueType tmp ;
// Bottom up:
// [t] += [t-1] if t >= 1
// [t] += [t-2] if t >= 2
// [t] += [t-4] if t >= 4
// ...
for ( int i = 1 ; i < blockDim.x ; i <<= 1 ) {
cuda_shfl_up( tmp , local , i , blockDim.x );
if ( i <= threadIdx.x ) { local += tmp ; }
}
}
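
A worked trace of the scan, assuming blockDim.x == 4 and lane values {a, b, c, d}:

// i = 1: lanes become { a, a+b, b+c, c+d }        (each lane t >= 1 adds the value from lane t-1)
// i = 2: lanes become { a, a+b, a+b+c, a+b+c+d }  (lanes t >= 2 add the value from lane t-2)
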
//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) threads with same threadIdx.y have same value
@ -98,7 +246,10 @@ inline void cuda_inter_warp_reduction( ValueType& value,
const int max_active_thread = blockDim.y) {
#define STEP_WIDTH 4
__shared__ char sh_result[sizeof(ValueType)*STEP_WIDTH];
// Depending on the ValueType, __shared__ memory must be aligned up to 8-byte boundaries.
// The reason not to use ValueType directly is that, for types with constructors,
// it could lead to race conditions.
__shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
ValueType* result = (ValueType*) & sh_result;
const unsigned step = 32 / blockDim.x;
unsigned shift = STEP_WIDTH;
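
The sizing arithmetic above, worked for a hypothetical 12-byte ValueType: (12 + 7) / 8 = 2 doubles per slot, times STEP_WIDTH = 4 slots, so the buffer is 8 doubles (64 bytes) and is naturally 8-byte aligned because it is declared as an array of double.

// hypothetical instantiation: sizeof(ValueType) == 12, STEP_WIDTH == 4
// (12 + 7) / 8 = 2 doubles per slot  ->  double sh_result[2 * 4]   // 64 bytes, 8-byte aligned
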

View File

@ -91,7 +91,7 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
task.ptr = Queue::pop_task( & queue->m_ready[i][j] );
task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
}
}

View File

@ -61,6 +61,8 @@ void set_cuda_task_base_apply_function_pointer
}
template< class > class TaskExec ;
template<>
class TaskQueueSpecialization< Kokkos::Cuda >
{
@ -69,6 +71,7 @@ public:
using execution_space = Kokkos::Cuda ;
using memory_space = Kokkos::CudaUVMSpace ;
using queue_type = TaskQueue< execution_space > ;
using member_type = TaskExec< Kokkos::Cuda > ;
static
void iff_single_thread_recursive_execute( queue_type * const ) {}
@ -79,13 +82,15 @@ public:
static
void execute( queue_type * const );
template< typename FunctorType >
template< typename TaskType >
static
void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr )
typename TaskType::function_type
get_function_pointer()
{
using TaskType = TaskBase< execution_space
, typename FunctorType::value_type
, FunctorType > ;
using function_type = typename TaskType::function_type ;
function_type * const ptr =
(function_type*) cuda_internal_scratch_unified( sizeof(function_type) );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
@ -93,6 +98,8 @@ public:
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
return *ptr ;
}
};
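
A reduced sketch of what get_function_pointer() above does, with hypothetical names (fn_t, device_apply, capture, fetch_device_fn): device function addresses are only meaningful on the device, so a one-thread kernel writes the address into unified memory and the host reads it back after a synchronize.

using fn_t = void (*)( void * );
__device__ void device_apply( void * ) {}              // illustrative target
__global__ void capture( fn_t * out ) { *out = & device_apply; }

fn_t fetch_device_fn( fn_t * unified_scratch ) {
  capture<<< 1, 1 >>>( unified_scratch );              // device writes its own address
  cudaDeviceSynchronize();
  return *unified_scratch;   // a device-only pointer, usable by later kernel launches
}
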
@ -435,18 +442,26 @@ void parallel_reduce
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename ValueType, typename iType, class Lambda >
template< typename iType, class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda) {
const Closure & closure )
{
// Extract value_type from closure
ValueType accum = 0 ;
ValueType val, y, local_total;
using value_type =
typename Kokkos::Impl::FunctorAnalysis
< Kokkos::Impl::FunctorPatternInterface::SCAN
, void
, Closure >::value_type ;
value_type accum = 0 ;
value_type val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
closure(i,val,false);
// intra-blockDim.y exclusive scan on 'val'
// accum = accumulated, sum in total for this iteration
@ -458,7 +473,7 @@ void parallel_scan
}
// pass accum to all threads
local_total = shfl_warp_broadcast<ValueType>(val,
local_total = shfl_warp_broadcast<value_type>(val,
threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
Impl::CudaTraits::WarpSize);
@ -467,7 +482,7 @@ void parallel_scan
if ( threadIdx.y == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
closure(i,val,true);
accum += local_total;
}
}
@ -478,18 +493,26 @@ void parallel_scan
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
template< typename iType, class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda)
const Closure & closure )
{
ValueType accum = 0 ;
ValueType val, y, local_total;
// Extract value_type from closure
using value_type =
typename Kokkos::Impl::FunctorAnalysis
< Kokkos::Impl::FunctorPatternInterface::SCAN
, void
, Closure >::value_type ;
value_type accum = 0 ;
value_type val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
closure(i,val,false);
// intra-blockDim.x exclusive scan on 'val'
// accum = accumulated, sum in total for this iteration
@ -501,14 +524,14 @@ void parallel_scan
}
// pass accum to all threads
local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x);
local_total = shfl_warp_broadcast<value_type>(val, blockDim.x-1, blockDim.x);
// make EXCLUSIVE scan by shifting values over one
val = Kokkos::shfl_up(val, 1, blockDim.x);
if ( threadIdx.x == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
closure(i,val,true);
accum += local_total;
}
}
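
A usage sketch for these scans, assuming Views counts and offsets and a surrounding team (names illustrative); the closure's value_type, here int, is now deduced through FunctorAnalysis rather than supplied as a template parameter.

Kokkos::parallel_scan( Kokkos::TeamThreadRange( team, N ),
  [=] ( const int i, int & partial, const bool final ) {
    if ( final ) offsets(i) = partial;   // write the exclusive prefix sum
    partial += counts(i);
  });
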

View File

@ -44,36 +44,47 @@
#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>
#include <initializer_list>
#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_ENABLE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
#define KOKKOS_IMPL_MDRANGE_IVDEP
#include<impl/KokkosExp_Host_IterateTile.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
#endif
namespace Kokkos { namespace Experimental {
// ------------------------------------------------------------------ //
enum class Iterate
{
Default, // Default for the device
Left, // Left indices stride fastest
Right, // Right indices stride fastest
Flat, // Do not tile, only valid for inner direction
};
template <typename ExecSpace>
struct default_outer_direction
{
using type = Iterate;
#if defined( KOKKOS_ENABLE_CUDA)
static constexpr Iterate value = Iterate::Left;
#else
static constexpr Iterate value = Iterate::Right;
#endif
};
template <typename ExecSpace>
struct default_inner_direction
{
using type = Iterate;
#if defined( KOKKOS_ENABLE_CUDA)
static constexpr Iterate value = Iterate::Left;
#else
static constexpr Iterate value = Iterate::Right;
#endif
};
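
What the two directions mean for a rank-2 range, sketched as the equivalent loop nests (f is the user functor); per the #if blocks above, both directions default to Left when Cuda is enabled and to Right otherwise.

// Iterate::Right - the rightmost index runs fastest:
//   for (i0 ...) for (i1 ...) f(i0, i1);
// Iterate::Left  - the leftmost index runs fastest:
//   for (i1 ...) for (i0 ...) f(i0, i1);
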
@ -86,7 +97,7 @@ struct Rank
{
static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
static_assert( N < 4u, "Kokkos Error: Unsupported rank...");
static_assert( N < 7u, "Kokkos Error: Unsupported rank...");
using iteration_pattern = Rank<N, OuterDir, InnerDir>;
@ -96,498 +107,236 @@ struct Rank
};
// multi-dimensional iteration pattern
template <typename... Properties>
struct MDRangePolicy
: public Kokkos::Impl::PolicyTraits<Properties ...>
{
using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
using range_policy = RangePolicy<Properties...>;
static_assert( !std::is_same<range_policy,void>::value
using impl_range_policy = RangePolicy< typename traits::execution_space
, typename traits::schedule_type
, typename traits::index_type
> ;
static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
, "Kokkos Error: MD iteration pattern not defined" );
using iteration_pattern = typename range_policy::iteration_pattern;
using work_tag = typename range_policy::work_tag;
using iteration_pattern = typename traits::iteration_pattern;
using work_tag = typename traits::work_tag;
static constexpr int rank = iteration_pattern::rank;
static constexpr int outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat)
(iteration_pattern::outer_direction != Iterate::Default)
? iteration_pattern::outer_direction
: default_outer_direction< typename range_policy::execution_space>::value );
: default_outer_direction< typename traits::execution_space>::value );
static constexpr int inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename range_policy::execution_space>::value ) ;
: default_inner_direction< typename traits::execution_space>::value ) ;
// Ugly workaround for Intel 14 not handling scoped enums correctly
static constexpr int Flat = static_cast<int>( Iterate::Flat );
static constexpr int Right = static_cast<int>( Iterate::Right );
static constexpr int Left = static_cast<int>( Iterate::Left );
using index_type = typename traits::index_type;
using array_index_type = long;
using point_type = Kokkos::Array<array_index_type,rank>; //was index_type
using tile_type = Kokkos::Array<array_index_type,rank>;
// If point_type or tile_type is not templated on a signed integral type (if it is unsigned),
// then a user who passes an initializer_list of runtime-determined, non-const values of
// signed integral type will receive a compiler error due to an invalid case for
// implicit conversion -
// "conversion from integer or unscoped enumeration type to integer type that cannot represent all values of the original, except where source is a constant expression whose value can be stored exactly in the target type"
// To avoid this, the user must either pass a matching index_type as a template
// parameter to the MDRangePolicy or static_cast the individual values.
using size_type = typename range_policy::index_type;
using index_type = typename std::make_signed<size_type>::type;
template <typename I>
MDRangePolicy( std::initializer_list<I> upper_corner )
MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
: m_lower(lower)
, m_upper(upper)
, m_tile(tile)
, m_num_tiles(1)
{
static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( upper_corner.size() == rank, "Kokkos Error: upper_corner has incorrect rank" );
const auto u = upper_corner.begin();
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(0);
m_dim[i] = static_cast<index_type>(u[i]);
if (inner_direction != Flat) {
// default tile size to 4
m_tile[i] = 4;
} else {
m_tile[i] = 1;
}
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
template <typename IA, typename IB>
MDRangePolicy( std::initializer_list<IA> corner_a
, std::initializer_list<IB> corner_b
// Host
if ( true
#if defined(KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
#endif
)
{
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
index_type span;
for (int i=0; i<rank; ++i) {
span = upper[i] - lower[i];
if ( m_tile[i] <= 0 ) {
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
{
m_tile[i] = 2;
}
else {
m_tile[i] = span;
}
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
}
}
#if defined(KOKKOS_ENABLE_CUDA)
else // Cuda
{
index_type span;
for (int i=0; i<rank; ++i) {
span = upper[i] - lower[i];
if ( m_tile[i] <= 0 ) {
// TODO: determine what is a good default tile size for cuda
// may be rank dependent
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
{
m_tile[i] = 2;
}
else {
m_tile[i] = 16;
}
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
}
index_type total_tile_size_check = 1;
for (int i=0; i<rank; ++i) {
total_tile_size_check *= m_tile[i];
}
if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
printf(" Tile dimensions exceed Cuda limits\n");
Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
//Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
}
}
#endif
}
using A = typename std::make_signed<IA>::type;
using B = typename std::make_signed<IB>::type;
template < typename LT , typename UT , typename TT = array_index_type >
MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
{
#if 0
// This should work, less duplicated code but not yet extensively tested
point_type lower_tmp, upper_tmp;
tile_type tile_tmp;
for ( auto i = 0; i < rank; ++i ) {
lower_tmp[i] = static_cast<array_index_type>(lower.begin()[i]);
upper_tmp[i] = static_cast<array_index_type>(upper.begin()[i]);
tile_tmp[i] = static_cast<array_index_type>(tile.begin()[i]);
}
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
#else
if(m_lower.size()!=rank || m_upper.size() != rank)
Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
for ( auto i = 0; i < rank; ++i ) {
m_lower[i] = static_cast<array_index_type>(lower.begin()[i]);
m_upper[i] = static_cast<array_index_type>(upper.begin()[i]);
if(tile.size()==rank)
m_tile[i] = static_cast<array_index_type>(tile.begin()[i]);
else
m_tile[i] = 0;
}
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
if (inner_direction != Flat) {
// default tile size to 4
m_tile[i] = 4;
} else {
m_tile[i] = 1;
}
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
template <typename IA, typename IB, typename T>
MDRangePolicy( std::initializer_list<IA> corner_a
, std::initializer_list<IB> corner_b
, std::initializer_list<T> tile
// Host
if ( true
#if defined(KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
#endif
)
{
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" );
static_assert( inner_direction != Flat, "Kokkos Error: tiling not support with flat iteration" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
//static_assert( tile.size() == rank, "Kokkos Error: tile has incorrect rank" );
using A = typename std::make_signed<IA>::type;
using B = typename std::make_signed<IB>::type;
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
const auto t = tile.begin();
m_num_tiles = 1;
index_type span;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
m_tile[i] = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 );
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
span = m_upper[i] - m_lower[i];
if ( m_tile[i] <= 0 ) {
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
{
m_tile[i] = 2;
}
else {
m_tile[i] = span;
}
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
}
}
#if defined(KOKKOS_ENABLE_CUDA)
else // Cuda
{
index_type span;
for (int i=0; i<rank; ++i) {
span = m_upper[i] - m_lower[i];
if ( m_tile[i] <= 0 ) {
// TODO: determine what is a good default tile size for cuda
// may be rank dependent
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
{
m_tile[i] = 2;
}
else {
m_tile[i] = 16;
}
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
}
index_type total_tile_size_check = 1;
for (int i=0; i<rank; ++i) {
total_tile_size_check *= m_tile[i];
}
if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
printf(" Tile dimensions exceed Cuda limits\n");
Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
//Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
}
}
#endif
#endif
}
index_type m_offset[rank];
index_type m_dim[rank];
int m_tile[rank];
index_type m_tile_dim[rank];
size_type m_num_tiles; // product of tile dims
point_type m_lower;
point_type m_upper;
tile_type m_tile;
point_type m_tile_end;
index_type m_num_tiles;
};
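
A construction sketch for the reworked policy, with hypothetical extents N0 and N1: lower and upper corners plus an optional per-dimension tile; a tile entry of 0 lets the constructor pick the defaults worked out above (span or 2 on host, 16 or 2 on Cuda).

using Kokkos::Experimental::MDRangePolicy;
using Kokkos::Experimental::Rank;

MDRangePolicy< Rank<2> > policy( {0, 0}, {N0, N1}, {16, 16} );   // explicit 16x16 tiles
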
// ------------------------------------------------------------------ //
namespace Impl {
// Serial, Threads, OpenMP
// use enable_if to overload for Cuda
template < typename MDRange, typename Functor, typename Enable = void >
struct MDForFunctor
{
using work_tag = typename MDRange::work_tag;
using index_type = typename MDRange::index_type;
using size_type = typename MDRange::size_type;
MDRange m_range;
Functor m_func;
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange const& range, Functor const& f )
: m_range(range)
, m_func( f )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange const& range, Functor && f )
: m_range(range)
, m_func( std::forward<Functor>(f) )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange && range, Functor const& f )
: m_range( std::forward<MDRange>(range) )
, m_func( f )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange && range, Functor && f )
: m_range( std::forward<MDRange>(range) )
, m_func( std::forward<Functor>(f) )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDForFunctor const& ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor& operator=( MDForFunctor const& ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDForFunctor && ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor& operator=( MDForFunctor && ) = default;
// Rank-2, Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
m_func( m_range.m_offset[0] + ( t / m_range.m_dim[1] )
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
} else {
m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] )
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
}
}
// Rank-2, Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] )
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
} else {
m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] )
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
}
}
// Rank-2, Not Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
index_type t0, t1;
if ( MDRange::outer_direction == MDRange::Right ) {
t0 = t / m_range.m_tile_dim[1];
t1 = t % m_range.m_tile_dim[1];
} else {
t0 = t % m_range.m_tile_dim[0];
t1 = t / m_range.m_tile_dim[0];
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i1=b1; i1<e1; ++i1) {
m_func( i0, i1 );
}}
} else {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( i0, i1 );
}}
}
}
// Rank-2, Not Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
work_tag tag;
index_type t0, t1;
if ( MDRange::outer_direction == MDRange::Right ) {
t0 = t / m_range.m_tile_dim[1];
t1 = t % m_range.m_tile_dim[1];
} else {
t0 = t % m_range.m_tile_dim[0];
t1 = t / m_range.m_tile_dim[0];
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i1=b1; i1<e1; ++i1) {
m_func( tag, i0, i1 );
}}
} else {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( tag, i0, i1 );
}}
}
}
//---------------------------------------------------------------------------
// Rank-3, Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
m_func( m_range.m_offset[0] + ( t / tmp_prod )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
);
} else {
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
, m_range.m_offset[2] + ( t / tmp_prod )
);
}
}
// Rank-3, Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
m_func( work_tag{}
, m_range.m_offset[0] + ( t / tmp_prod )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
);
} else {
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
m_func( work_tag{}
, m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
, m_range.m_offset[2] + ( t / tmp_prod )
);
}
}
// Rank-3, Not Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
index_type t0, t1, t2;
if ( MDRange::outer_direction == MDRange::Right ) {
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
t0 = t / tmp_prod;
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
} else {
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
t2 = t / tmp_prod;
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i2=b2; i2<e2; ++i2) {
m_func( i0, i1, i2 );
}}}
} else {
for (int i2=b2; i2<e2; ++i2) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( i0, i1, i2 );
}}}
}
}
// Rank-3, Not Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
work_tag tag;
index_type t0, t1, t2;
if ( MDRange::outer_direction == MDRange::Right ) {
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
t0 = t / tmp_prod;
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
} else {
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
t2 = t / tmp_prod;
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i2=b2; i2<e2; ++i2) {
m_func( tag, i0, i1, i2 );
}}}
} else {
for (int i2=b2; i2<e2; ++i2) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( tag, i0, i1, i2 );
}}}
}
}
};
} // namespace Impl
template <typename MDRange, typename Functor>
// ------------------------------------------------------------------ //
//md_parallel_for
// ------------------------------------------------------------------ //
template <typename MDRange, typename Functor, typename Enable = void>
void md_parallel_for( MDRange const& range
, Functor const& f
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::MDForFunctor<MDRange, Functor> g(range, f);
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::range_policy;
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
@ -596,15 +345,132 @@ template <typename MDRange, typename Functor>
void md_parallel_for( const std::string& str
, MDRange const& range
, Functor const& f
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::MDForFunctor<MDRange, Functor> g(range, f);
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::range_policy;
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
// Cuda specialization
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
template <typename MDRange, typename Functor>
void md_parallel_for( const std::string& str
, MDRange const& range
, Functor const& f
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
template <typename MDRange, typename Functor>
void md_parallel_for( MDRange const& range
, Functor const& f
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
#endif
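
An illustrative call, assuming the rank-2 policy sketched earlier and a 2-D View A; on Cuda this dispatches the DeviceIterateTile closure above, on host it runs a flat RangePolicy over tiles.

Kokkos::Experimental::md_parallel_for( policy,
  KOKKOS_LAMBDA ( const int i, const int j ) { A(i,j) = i + j; } );
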
// ------------------------------------------------------------------ //
// ------------------------------------------------------------------ //
//md_parallel_reduce
// ------------------------------------------------------------------ //
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( MDRange const& range
, Functor const& f
, ValueType & v
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
}
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( const std::string& str
, MDRange const& range
, Functor const& f
, ValueType & v
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
}
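
And the reduction counterpart under the same assumptions; the host path wraps the functor and the running value in Impl::MDFunctor and dispatches a flat parallel_reduce over range.m_num_tiles.

double sum = 0;
Kokkos::Experimental::md_parallel_reduce( policy,
  KOKKOS_LAMBDA ( const int i, const int j, double & val ) { val += A(i,j); },
  sum );
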
// Cuda - parallel_reduce not implemented yet
/*
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( MDRange const& range
, Functor const& f
, ValueType & v
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
closure.execute();
}
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( const std::string& str
, MDRange const& range
, Functor const& f
, ValueType & v
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
closure.execute();
}
*/
}} // namespace Kokkos::Experimental
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP

View File

@ -59,8 +59,14 @@ template< class T = void
, class Proxy = void
>
struct Array {
private:
T m_elem[N];
public:
/**
* The elements of this C array shall not be accessed directly. The data
* member has to be declared public to enable aggregate initialization as for
* std::array. We mark it as private in the documentation.
* @private
*/
T m_internal_implementation_private_member_data[N];
public:
typedef T & reference ;
@ -78,25 +84,32 @@ public:
KOKKOS_INLINE_FUNCTION
reference operator[]( const iType & i )
{
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
return m_elem[i];
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
return m_internal_implementation_private_member_data[i];
}
template< typename iType >
KOKKOS_INLINE_FUNCTION
const_reference operator[]( const iType & i ) const
{
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
return m_elem[i];
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
return m_internal_implementation_private_member_data[i];
}
KOKKOS_INLINE_FUNCTION pointer data() { return & m_elem[0] ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return & m_elem[0] ; }
KOKKOS_INLINE_FUNCTION pointer data()
{
return & m_internal_implementation_private_member_data[0];
}
KOKKOS_INLINE_FUNCTION const_pointer data() const
{
return & m_internal_implementation_private_member_data[0];
}
~Array() = default ;
Array() = default ;
Array( const Array & ) = default ;
Array & operator = ( const Array & ) = default ;
// Do not default unless move and move-assignment are also defined
// ~Array() = default ;
// Array() = default ;
// Array( const Array & ) = default ;
// Array & operator = ( const Array & ) = default ;
// Some supported compilers are not sufficiently C++11 compliant
// for default move constructor and move assignment operator.
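
The renamed-but-still-public storage member keeps Kokkos::Array an aggregate, so brace initialization keeps working exactly as for std::array (values illustrative):

Kokkos::Array<int, 3> a = { 1, 2, 3 };   // aggregate initialization
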
@ -124,7 +137,7 @@ public:
KOKKOS_INLINE_FUNCTION
value_type operator[]( const iType & )
{
static_assert( std::is_integral<iType>::value , "Must be integer argument" );
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integer argument" );
return value_type();
}
@ -132,7 +145,7 @@ public:
KOKKOS_INLINE_FUNCTION
value_type operator[]( const iType & ) const
{
static_assert( std::is_integral<iType>::value , "Must be integer argument" );
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integer argument" );
return value_type();
}
@ -181,7 +194,7 @@ public:
KOKKOS_INLINE_FUNCTION
reference operator[]( const iType & i )
{
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
return m_elem[i];
}
@ -189,7 +202,7 @@ public:
KOKKOS_INLINE_FUNCTION
const_reference operator[]( const iType & i ) const
{
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
return m_elem[i];
}
@ -250,7 +263,7 @@ public:
KOKKOS_INLINE_FUNCTION
reference operator[]( const iType & i )
{
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
return m_elem[i*m_stride];
}
@ -258,7 +271,7 @@ public:
KOKKOS_INLINE_FUNCTION
const_reference operator[]( const iType & i ) const
{
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
return m_elem[i*m_stride];
}

View File

@ -102,6 +102,7 @@ KOKKOS_IMPL_IS_CONCEPT( memory_traits )
KOKKOS_IMPL_IS_CONCEPT( execution_space )
KOKKOS_IMPL_IS_CONCEPT( execution_policy )
KOKKOS_IMPL_IS_CONCEPT( array_layout )
KOKKOS_IMPL_IS_CONCEPT( reducer )
namespace Impl {

View File

@ -57,6 +57,10 @@
#include <Kokkos_OpenMP.hpp>
#endif
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <Kokkos_Qthreads.hpp>
#endif
#if defined( KOKKOS_ENABLE_PTHREAD )
#include <Kokkos_Threads.hpp>
#endif
@ -76,6 +80,7 @@
#include <Kokkos_Complex.hpp>
#include <iosfwd>
//----------------------------------------------------------------------------
@ -105,6 +110,9 @@ void finalize_all();
void fence();
/** \brief Print "Bill of Materials" */
void print_configuration( std::ostream & , const bool detail = false );
} // namespace Kokkos
//----------------------------------------------------------------------------
@ -159,4 +167,3 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
//----------------------------------------------------------------------------
#endif

View File

@ -63,7 +63,7 @@ namespace Kokkos {
struct AUTO_t {
KOKKOS_INLINE_FUNCTION
constexpr const AUTO_t & operator()() const { return *this ; }
constexpr const AUTO_t & operator()() const { return *this; }
};
namespace {
@ -73,46 +73,49 @@ constexpr AUTO_t AUTO = Kokkos::AUTO_t();
struct InvalidType {};
}
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Forward declarations for class inter-relationships
namespace Kokkos {
class HostSpace ; ///< Memory space for main process and CPU execution spaces
class HostSpace; ///< Memory space for main process and CPU execution spaces
#ifdef KOKKOS_ENABLE_HBWSPACE
namespace Experimental {
class HBWSpace ; /// Memory space for hbw_malloc from memkind (e.g. for KNL processor)
class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. for KNL processor)
}
#endif
#if defined( KOKKOS_ENABLE_SERIAL )
class Serial ; ///< Execution space main process on CPU
#endif // defined( KOKKOS_ENABLE_SERIAL )
class Serial; ///< Execution space main process on CPU.
#endif
#if defined( KOKKOS_ENABLE_QTHREADS )
class Qthreads; ///< Execution space with Qthreads back-end.
#endif
#if defined( KOKKOS_ENABLE_PTHREAD )
class Threads ; ///< Execution space with pthreads back-end
class Threads; ///< Execution space with pthreads back-end.
#endif
#if defined( KOKKOS_ENABLE_OPENMP )
class OpenMP ; ///< OpenMP execution space
class OpenMP; ///< OpenMP execution space.
#endif
#if defined( KOKKOS_ENABLE_CUDA )
class CudaSpace ; ///< Memory space on Cuda GPU
class CudaUVMSpace ; ///< Memory space on Cuda GPU with UVM
class CudaHostPinnedSpace ; ///< Memory space on Host accessible to Cuda GPU
class Cuda ; ///< Execution space for Cuda GPU
class CudaSpace; ///< Memory space on Cuda GPU
class CudaUVMSpace; ///< Memory space on Cuda GPU with UVM
class CudaHostPinnedSpace; ///< Memory space on Host accessible to Cuda GPU
class Cuda; ///< Execution space for Cuda GPU
#endif
template<class ExecutionSpace, class MemorySpace>
struct Device;
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Set the default execution space.
@ -122,60 +125,66 @@ struct Device;
namespace Kokkos {
#if defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
typedef Cuda DefaultExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Threads DefaultExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultExecutionSpace ;
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
typedef Cuda DefaultExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Threads DefaultExecutionSpace;
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
// typedef Qthreads DefaultExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultExecutionSpace;
#else
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
#endif
#if defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Threads DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_PTHREAD )
typedef Threads DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_SERIAL )
typedef Serial DefaultHostExecutionSpace ;
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Threads DefaultHostExecutionSpace;
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
// typedef Qthreads DefaultHostExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultHostExecutionSpace;
#elif defined( KOKKOS_ENABLE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace;
#elif defined( KOKKOS_ENABLE_PTHREAD )
typedef Threads DefaultHostExecutionSpace;
//#elif defined( KOKKOS_ENABLE_QTHREADS )
// typedef Qthreads DefaultHostExecutionSpace;
#elif defined( KOKKOS_ENABLE_SERIAL )
typedef Serial DefaultHostExecutionSpace;
#else
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
#endif
} // namespace Kokkos
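The selected default is what unqualified dispatches use; a small sketch of that guarantee (assuming `<type_traits>`):

```cpp
#include <Kokkos_Core.hpp>
#include <type_traits>

// An unqualified RangePolicy executes on DefaultExecutionSpace.
static_assert(
  std::is_same< Kokkos::RangePolicy<>::execution_space,
                Kokkos::DefaultExecutionSpace >::value,
  "unqualified policies dispatch to the default execution space" );
```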
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Detect the active execution space and define its memory space.
// This is used to verify whether a running kernel can access
// a given memory space.
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined (KOKKOS_ENABLE_CUDA)
typedef Kokkos::CudaSpace ActiveExecutionMemorySpace ;
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined( KOKKOS_ENABLE_CUDA )
typedef Kokkos::CudaSpace ActiveExecutionMemorySpace;
#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
typedef Kokkos::HostSpace ActiveExecutionMemorySpace ;
typedef Kokkos::HostSpace ActiveExecutionMemorySpace;
#else
typedef void ActiveExecutionMemorySpace ;
typedef void ActiveExecutionMemorySpace;
#endif
template< class ActiveSpace , class MemorySpace >
template< class ActiveSpace, class MemorySpace >
struct VerifyExecutionCanAccessMemorySpace {
enum {value = 0};
};
template< class Space >
struct VerifyExecutionCanAccessMemorySpace< Space , Space >
struct VerifyExecutionCanAccessMemorySpace< Space, Space >
{
enum {value = 1};
KOKKOS_INLINE_FUNCTION static void verify(void) {}
@ -183,27 +192,27 @@ struct VerifyExecutionCanAccessMemorySpace< Space , Space >
};
} // namespace Impl
} // namespace Kokkos
#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \
#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE, DATA_PTR ) \
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify( DATA_PTR )
Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE >::verify( DATA_PTR )
#define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify()
Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE >::verify()
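A minimal sketch of how these verification macros are meant to be used inside a kernel (the function and pointer are hypothetical):

```cpp
// Trap at run time if the active execution space cannot access
// data that lives in Kokkos::HostSpace.
KOKKOS_INLINE_FUNCTION
void touch( const double * ptr )
{
  KOKKOS_RESTRICT_EXECUTION_TO_DATA( Kokkos::HostSpace, ptr );
  // ... ptr may be dereferenced safely here ...
}
```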
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
void fence();
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class Functor
@ -220,18 +229,18 @@ struct FunctorPolicyExecutionSpace;
///
/// This is an implementation detail of parallel_for. Users should
/// skip this and go directly to the nonmember function parallel_for.
template< class FunctorType , class ExecPolicy , class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
> class ParallelFor ;
template< class FunctorType, class ExecPolicy, class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
> class ParallelFor;
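Since the comment points users at the nonmember function, a minimal `parallel_for` sketch (the View and scale factor are illustrative):

```cpp
#include <Kokkos_Core.hpp>

void scale( Kokkos::View< double* > x, const int N, const double a )
{
  // Execute the lambda for i = 0..N-1 on the default execution space.
  Kokkos::parallel_for( N, KOKKOS_LAMBDA( const int i ) {
    x( i ) *= a;
  } );
}
```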
/// \class ParallelReduce
/// \brief Implementation detail of parallel_reduce.
///
/// This is an implementation detail of parallel_reduce. Users should
/// skip this and go directly to the nonmember function parallel_reduce.
template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
> class ParallelReduce ;
template< class FunctorType, class ExecPolicy, class ReducerType = InvalidType, class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
> class ParallelReduce;
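Correspondingly, a minimal `parallel_reduce` sketch (sum of a View's entries; names illustrative):

```cpp
#include <Kokkos_Core.hpp>

double sum( Kokkos::View< const double* > x, const int N )
{
  double result = 0.0;
  // Each iteration adds into a thread-private partial value;
  // Kokkos combines the partials into 'result'.
  Kokkos::parallel_reduce( N, KOKKOS_LAMBDA( const int i, double & partial ) {
    partial += x( i );
  }, result );
  return result;
}
```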
/// \class ParallelScan
/// \brief Implementation detail of parallel_scan.
@ -239,10 +248,12 @@ template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType
/// This is an implementation detail of parallel_scan. Users should
/// skip this and go directly to the documentation of the nonmember
/// template function Kokkos::parallel_scan.
template< class FunctorType , class ExecPolicy , class ExecutionSapce =
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
> class ParallelScan ;
template< class FunctorType, class ExecPolicy, class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
> class ParallelScan;
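And a minimal `parallel_scan` sketch, an exclusive prefix sum (the two-pass protocol signalled by the `final` flag is the part worth seeing):

```cpp
#include <Kokkos_Core.hpp>

void exclusive_prefix_sum( Kokkos::View< int* > x, const int N )
{
  // The functor runs in two passes; 'final' marks the pass in which
  // the running total is valid and may be written back.
  Kokkos::parallel_scan( N, KOKKOS_LAMBDA( const int i, int & update, const bool final ) {
    const int value = x( i );
    if ( final ) x( i ) = update; // exclusive: write the prefix before adding
    update += value;
  } );
}
```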
} // namespace Impl
} // namespace Kokkos
}}
#endif /* #ifndef KOKKOS_CORE_FWD_HPP */

View File

@ -62,7 +62,6 @@
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
@ -295,6 +294,7 @@ struct VerifyExecutionCanAccessMemorySpace
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
#include <Cuda/Kokkos_Cuda_Task.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */

View File

@ -44,14 +44,16 @@
#ifndef KOKKOS_HBWSPACE_HPP
#define KOKKOS_HBWSPACE_HPP
#include <Kokkos_HostSpace.hpp>
/*--------------------------------------------------------------------------*/
#ifdef KOKKOS_ENABLE_HBWSPACE
namespace Kokkos {
namespace Experimental {
namespace Impl {
/// \brief Initialize lock array for arbitrary size atomics.
@ -67,7 +69,7 @@ void init_lock_array_hbw_space();
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
bool lock_address_hbw_space(void* ptr);
bool lock_address_hbw_space( void* ptr );
/// \brief Release lock for the address
///
@ -75,13 +77,16 @@ bool lock_address_hbw_space(void* ptr);
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address.
void unlock_address_hbw_space(void* ptr);
void unlock_address_hbw_space( void* ptr );
} // namespace Impl
} // neamspace Experimental
} // namespace Experimental
} // namespace Kokkos
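A hedged sketch of the lock/unlock pairing these declarations imply (the spin loop and caller are illustrative, not part of the header):

```cpp
// Hypothetical caller: spin until the per-hash lock for ptr is held,
// perform the non-atomic update, then release.
void locked_update( void * ptr )
{
  while ( ! Kokkos::Experimental::Impl::lock_address_hbw_space( ptr ) ) { /* retry */ }
  // ... arbitrarily sized read-modify-write on *ptr ...
  Kokkos::Experimental::Impl::unlock_address_hbw_space( ptr );
}
```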
namespace Kokkos {
namespace Experimental {
/// \class HBWSpace
@ -91,10 +96,9 @@ namespace Experimental {
/// memory means the usual CPU-accessible memory.
class HBWSpace {
public:
//! Tag this class as a Kokkos memory space
typedef HBWSpace memory_space ;
typedef size_t size_type ;
typedef HBWSpace memory_space;
typedef size_t size_type;
/// \typedef execution_space
/// \brief Default execution space for this memory space.
@ -103,21 +107,25 @@ public:
/// useful for things like initializing a View (which happens in
/// parallel using the View's default execution space).
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
typedef Kokkos::OpenMP execution_space;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Kokkos::Threads execution_space ;
typedef Kokkos::Threads execution_space;
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
// typedef Kokkos::Qthreads execution_space;
#elif defined( KOKKOS_ENABLE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
typedef Kokkos::OpenMP execution_space;
#elif defined( KOKKOS_ENABLE_PTHREAD )
typedef Kokkos::Threads execution_space ;
typedef Kokkos::Threads execution_space;
//#elif defined( KOKKOS_ENABLE_QTHREADS )
// typedef Kokkos::Qthreads execution_space;
#elif defined( KOKKOS_ENABLE_SERIAL )
typedef Kokkos::Serial execution_space ;
typedef Kokkos::Serial execution_space;
#else
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
#endif
//! This memory space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef Kokkos::Device< execution_space, memory_space > device_type;
/*--------------------------------*/
/* Functions unique to the HBWSpace */
@ -129,67 +137,68 @@ public:
/**\brief Default memory space instance */
HBWSpace();
HBWSpace( const HBWSpace & rhs ) = default ;
HBWSpace & operator = ( const HBWSpace & ) = default ;
~HBWSpace() = default ;
HBWSpace( const HBWSpace & rhs ) = default;
HBWSpace & operator = ( const HBWSpace & ) = default;
~HBWSpace() = default;
/**\brief Non-default memory space instance to choose allocation mechanism, if available */
enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
enum AllocationMechanism { STD_MALLOC, POSIX_MEMALIGN, POSIX_MMAP, INTEL_MM_ALLOC };
explicit
HBWSpace( const AllocationMechanism & );
/**\brief Allocate untracked memory in the space */
void * allocate( const size_t arg_alloc_size ) const ;
void * allocate( const size_t arg_alloc_size ) const;
/**\brief Deallocate untracked memory in the space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
, const size_t arg_alloc_size ) const;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
private:
AllocationMechanism m_alloc_mech ;
AllocationMechanism m_alloc_mech;
static constexpr const char* m_name = "HBW";
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ;
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
};
} // namespace Experimental
} // namespace Kokkos
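The untracked allocate/deallocate pair declared above is used as follows; a minimal sketch, assuming KOKKOS_ENABLE_HBWSPACE is defined:

```cpp
#include <Kokkos_HBWSpace.hpp>

void raw_hbw_buffer()
{
  // Untracked allocation in high-bandwidth memory: the caller owns the
  // pointer and must pass the same size back to deallocate.
  Kokkos::Experimental::HBWSpace space;
  void * p = space.allocate( 1024 );
  // ... use p ...
  space.deallocate( p, 1024 );
}
```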
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >
: public SharedAllocationRecord< void , void >
class SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >
: public SharedAllocationRecord< void, void >
{
private:
friend Kokkos::Experimental::HBWSpace ;
friend Kokkos::Experimental::HBWSpace;
typedef SharedAllocationRecord< void , void > RecordBase ;
typedef SharedAllocationRecord< void, void > RecordBase;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete;
static void deallocate( RecordBase * );
/**\brief Root record for tracked allocations from this HBWSpace instance */
static RecordBase s_root_record ;
static RecordBase s_root_record;
const Kokkos::Experimental::HBWSpace m_space ;
const Kokkos::Experimental::HBWSpace m_space;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() = default ;
SharedAllocationRecord() = default;
SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
, const std::string & arg_label
@ -212,9 +221,9 @@ public:
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
return new SharedAllocationRecord( arg_space, arg_label, arg_alloc_size );
#else
return (SharedAllocationRecord *) 0 ;
return (SharedAllocationRecord *) 0;
#endif
}
@ -233,88 +242,93 @@ public:
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::Experimental::HBWSpace & , bool detail = false );
static void print_records( std::ostream &, const Kokkos::Experimental::HBWSpace &, bool detail = false );
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::Experimental::HBWSpace >::assignable , "" );
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace, Kokkos::Experimental::HBWSpace >::assignable, "" );
template<>
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace > {
struct MemorySpaceAccess< Kokkos::HostSpace, Kokkos::Experimental::HBWSpace > {
enum { assignable = true };
enum { accessible = true };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace> {
struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace, Kokkos::HostSpace > {
enum { assignable = false };
enum { accessible = true };
enum { deepcopy = true };
};
}}
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<class ExecutionSpace>
struct DeepCopy<Experimental::HBWSpace,Experimental::HBWSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
memcpy( dst , src , n );
template< class ExecutionSpace >
struct DeepCopy< Experimental::HBWSpace, Experimental::HBWSpace, ExecutionSpace > {
DeepCopy( void * dst, const void * src, size_t n ) {
memcpy( dst, src, n );
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
exec.fence();
memcpy( dst , src , n );
memcpy( dst, src, n );
}
};
template<class ExecutionSpace>
struct DeepCopy<HostSpace,Experimental::HBWSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
memcpy( dst , src , n );
template< class ExecutionSpace >
struct DeepCopy< HostSpace, Experimental::HBWSpace, ExecutionSpace > {
DeepCopy( void * dst, const void * src, size_t n ) {
memcpy( dst, src, n );
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
exec.fence();
memcpy( dst , src , n );
memcpy( dst, src, n );
}
};
template<class ExecutionSpace>
struct DeepCopy<Experimental::HBWSpace,HostSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
memcpy( dst , src , n );
template< class ExecutionSpace >
struct DeepCopy< Experimental::HBWSpace, HostSpace, ExecutionSpace > {
DeepCopy( void * dst, const void * src, size_t n ) {
memcpy( dst, src, n );
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
exec.fence();
memcpy( dst , src , n );
memcpy( dst, src, n );
}
};
} // namespace Impl
} // namespace Kokkos
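These specializations are what a user-level `Kokkos::deep_copy` between host and HBW views ultimately dispatches to; a sketch (View shapes illustrative):

```cpp
#include <Kokkos_Core.hpp>

void copy_in( Kokkos::View< double*, Kokkos::Experimental::HBWSpace > dst,
              Kokkos::View< double*, Kokkos::HostSpace > src )
{
  // Lands on DeepCopy< HBWSpace, HostSpace, ... >, i.e. a fenced memcpy.
  Kokkos::deep_copy( dst, src );
}
```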
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace >
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace, Kokkos::Experimental::HBWSpace >
{
enum { value = true };
inline static void verify( void ) { }
@ -322,7 +336,7 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experime
};
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace >
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace, Kokkos::HostSpace >
{
enum { value = true };
inline static void verify( void ) { }
@ -330,8 +344,9 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kok
};
} // namespace Impl
} // namespace Kokkos
#endif
#endif /* #define KOKKOS_HBWSPACE_HPP */
#endif // #ifndef KOKKOS_HBWSPACE_HPP

View File

@ -60,6 +60,7 @@
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
/// \brief Initialize lock array for arbitrary size atomics.
@ -83,9 +84,10 @@ bool lock_address_host_space(void* ptr);
/// from the provided ptr. This function should only be called
/// after previously successfully aquiring a lock with
/// lock_address.
void unlock_address_host_space(void* ptr);
void unlock_address_host_space( void* ptr );
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
@ -97,10 +99,9 @@ namespace Kokkos {
/// memory means the usual CPU-accessible memory.
class HostSpace {
public:
//! Tag this class as a Kokkos memory space
typedef HostSpace memory_space ;
typedef size_t size_type ;
typedef HostSpace memory_space;
typedef size_t size_type;
/// \typedef execution_space
/// \brief Default execution space for this memory space.
@ -109,21 +110,25 @@ public:
/// useful for things like initializing a View (which happens in
/// parallel using the View's default execution space).
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
typedef Kokkos::OpenMP execution_space;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Kokkos::Threads execution_space ;
typedef Kokkos::Threads execution_space;
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
// typedef Kokkos::Qthreads execution_space;
#elif defined( KOKKOS_ENABLE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
typedef Kokkos::OpenMP execution_space;
#elif defined( KOKKOS_ENABLE_PTHREAD )
typedef Kokkos::Threads execution_space ;
typedef Kokkos::Threads execution_space;
//#elif defined( KOKKOS_ENABLE_QTHREADS )
// typedef Kokkos::Qthreads execution_space;
#elif defined( KOKKOS_ENABLE_SERIAL )
typedef Kokkos::Serial execution_space ;
typedef Kokkos::Serial execution_space;
#else
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
#endif
//! This memory space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef Kokkos::Device< execution_space, memory_space > device_type;
/*--------------------------------*/
/* Functions unique to the HostSpace */
@ -135,61 +140,57 @@ public:
/**\brief Default memory space instance */
HostSpace();
HostSpace( HostSpace && rhs ) = default ;
HostSpace( const HostSpace & rhs ) = default ;
HostSpace & operator = ( HostSpace && ) = default ;
HostSpace & operator = ( const HostSpace & ) = default ;
~HostSpace() = default ;
HostSpace( HostSpace && rhs ) = default;
HostSpace( const HostSpace & rhs ) = default;
HostSpace & operator = ( HostSpace && ) = default;
HostSpace & operator = ( const HostSpace & ) = default;
~HostSpace() = default;
/**\brief Non-default memory space instance to choose allocation mechanism, if available */
enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
enum AllocationMechanism { STD_MALLOC, POSIX_MEMALIGN, POSIX_MMAP, INTEL_MM_ALLOC };
explicit
HostSpace( const AllocationMechanism & );
/**\brief Allocate untracked memory in the space */
void * allocate( const size_t arg_alloc_size ) const ;
void * allocate( const size_t arg_alloc_size ) const;
/**\brief Deallocate untracked memory in the space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
, const size_t arg_alloc_size ) const;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
private:
AllocationMechanism m_alloc_mech ;
AllocationMechanism m_alloc_mech;
static constexpr const char* m_name = "Host";
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > ;
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace, void >;
};
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::HostSpace >::assignable , "" );
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::HostSpace >::assignable, "" );
template< typename S >
struct HostMirror {
private:
// If input execution space can access HostSpace then keep it.
// Example: Kokkos::OpenMP can access, Kokkos::Cuda cannot
enum { keep_exe = Kokkos::Impl::MemorySpaceAccess
< typename S::execution_space::memory_space , Kokkos::HostSpace >
::accessible };
< typename S::execution_space::memory_space, Kokkos::HostSpace >::accessible };
// If HostSpace can access memory space then keep it.
// Example: Cannot access Kokkos::CudaSpace, can access Kokkos::CudaUVMSpace
enum { keep_mem = Kokkos::Impl::MemorySpaceAccess
< Kokkos::HostSpace , typename S::memory_space >::accessible };
< Kokkos::HostSpace, typename S::memory_space >::accessible };
public:
@ -202,42 +203,41 @@ public:
, typename S::memory_space >
, Kokkos::HostSpace
>::type
>::type Space ;
>::type Space;
};
} // namespace Impl
} // namespace Kokkos
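`HostMirror` is the trait behind `create_mirror_view`; a sketch of the behavior the two comments describe, assuming CUDA is enabled for the sake of the example:

```cpp
#include <Kokkos_Core.hpp>

void stage( Kokkos::View< double*, Kokkos::CudaSpace > d )
{
  // keep_mem is false for CudaSpace, so the mirror lands in HostSpace.
  auto h = Kokkos::create_mirror_view( d );
  // ... fill h on the host ...
  Kokkos::deep_copy( d, h );
}
```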
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class SharedAllocationRecord< Kokkos::HostSpace , void >
: public SharedAllocationRecord< void , void >
class SharedAllocationRecord< Kokkos::HostSpace, void >
: public SharedAllocationRecord< void, void >
{
private:
friend Kokkos::HostSpace;
friend Kokkos::HostSpace ;
typedef SharedAllocationRecord< void, void > RecordBase;
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete;
static void deallocate( RecordBase * );
/**\brief Root record for tracked allocations from this HostSpace instance */
static RecordBase s_root_record ;
static RecordBase s_root_record;
const Kokkos::HostSpace m_space ;
const Kokkos::HostSpace m_space;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() = default ;
SharedAllocationRecord() = default;
SharedAllocationRecord( const Kokkos::HostSpace & arg_space
, const std::string & arg_label
@ -260,12 +260,13 @@ public:
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
return new SharedAllocationRecord( arg_space, arg_label, arg_alloc_size );
#else
return (SharedAllocationRecord *) 0 ;
return (SharedAllocationRecord *) 0;
#endif
}
/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::HostSpace & arg_space
@ -281,37 +282,37 @@ public:
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::HostSpace & , bool detail = false );
static void print_records( std::ostream &, const Kokkos::HostSpace &, bool detail = false );
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class DstSpace, class SrcSpace, class ExecutionSpace = typename DstSpace::execution_space> struct DeepCopy ;
template< class DstSpace, class SrcSpace, class ExecutionSpace = typename DstSpace::execution_space > struct DeepCopy;
template<class ExecutionSpace>
struct DeepCopy<HostSpace,HostSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
memcpy( dst , src , n );
template< class ExecutionSpace >
struct DeepCopy< HostSpace, HostSpace, ExecutionSpace > {
DeepCopy( void * dst, const void * src, size_t n ) {
memcpy( dst, src, n );
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
exec.fence();
memcpy( dst , src , n );
memcpy( dst, src, n );
}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_HOSTSPACE_HPP */
#endif // #ifndef KOKKOS_HOSTSPACE_HPP

View File

@ -45,22 +45,20 @@
#define KOKKOS_MACROS_HPP
//----------------------------------------------------------------------------
/** Pick up configure/build options via #define macros:
/** Pick up configure / build options via #define macros:
*
* KOKKOS_ENABLE_CUDA Kokkos::Cuda execution and memory spaces
* KOKKOS_ENABLE_PTHREAD Kokkos::Threads execution space
* KOKKOS_ENABLE_QTHREAD Kokkos::Qthread execution space
* KOKKOS_ENABLE_QTHREADS Kokkos::Qthreads execution space
* KOKKOS_ENABLE_OPENMP Kokkos::OpenMP execution space
* KOKKOS_ENABLE_HWLOC HWLOC library is available
* KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK insert array bounds checks, is expensive!
*
* KOKKOS_ENABLE_MPI negotiate MPI/execution space interactions
*
* KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space
* KOKKOS_ENABLE_HWLOC HWLOC library is available.
* KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive!
* KOKKOS_ENABLE_MPI Negotiate MPI/execution space interactions.
* KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space.
*/
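These option macros are meant to be tested for definedness, never for a 0/1 value; a minimal sketch of the intended usage:

```cpp
// Correct: test whether the backend macro is defined at all.
#if defined( KOKKOS_ENABLE_CUDA )
  // CUDA-specific code path
#endif

// Incorrect under this convention: treating the macro as a value.
//#if KOKKOS_ENABLE_CUDA
//#endif
```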
#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
#include <KokkosCore_config.h>
#include <KokkosCore_config.h>
#endif
#include <impl/Kokkos_OldMacros.hpp>
@ -86,7 +84,7 @@
* KOKKOS_ENABLE_INTEL_ATOMICS
* KOKKOS_ENABLE_OPENMP_ATOMICS
*
* A suite of 'KOKKOS_HAVE_PRAGMA_...' are defined for internal use.
* A suite of 'KOKKOS_ENABLE_PRAGMA_...' are defined for internal use.
*
* Macros for marking functions to run in an execution space:
*
@ -98,64 +96,63 @@
//----------------------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// Compiling with a CUDA compiler.
//
// Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
// CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
//
// When generating device code the __CUDA_ARCH__ macro is defined as:
// __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
/* Compiling with a CUDA compiler.
*
* Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
* CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
*
* When generating device code the __CUDA_ARCH__ macro is defined as:
* __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
*/
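Worked instances of the two encodings (the values follow directly from the formulas above):

```cpp
// CUDA_VERSION:  7.5 -> 7*1000 + 5*10 = 7050;  8.0 -> 8000.
// __CUDA_ARCH__: capability 3.5 -> 3*100 + 5*10 = 350;  6.1 -> 610.
#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ >= 350 )
  // device code compiled for Kepler 3.5 or newer
#endif
```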
#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda.h>
#if !defined( CUDA_VERSION )
#error "#include <cuda.h> did not define CUDA_VERSION."
#endif
#if ! defined( CUDA_VERSION )
#error "#include <cuda.h> did not define CUDA_VERSION"
#endif
#if ( CUDA_VERSION < 7000 )
// CUDA supports C++11 in device code starting with version 7.0.
// This includes auto type and device code internal lambdas.
#error "Cuda version 7.0 or greater required."
#endif
#if ( CUDA_VERSION < 7000 )
// CUDA supports C++11 in device code starting with
// version 7.0. This includes auto type and device code internal
// lambdas.
#error "Cuda version 7.0 or greater required"
#endif
#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
// Compiling with CUDA compiler for device code.
#error "Cuda device capability >= 3.0 is required."
#endif
#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
/* Compiling with CUDA compiler for device code. */
#error "Cuda device capability >= 3.0 is required"
#endif
#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
#if ( CUDA_VERSION < 7050 )
#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
#if ( CUDA_VERSION < 7050 )
// CUDA supports C++11 lambdas generated in host code to be given
// to the device starting with version 7.5. But the release candidate (7.5.6)
// still identifies as 7.0
#error "Cuda version 7.5 or greater required for host-to-device Lambda support"
#endif
#if ( CUDA_VERSION < 8000 ) && defined(__NVCC__)
// still identifies as 7.0.
#error "Cuda version 7.5 or greater required for host-to-device Lambda support."
#endif
#if ( CUDA_VERSION < 8000 ) && defined( __NVCC__ )
#define KOKKOS_LAMBDA [=]__device__
#else
#else
#define KOKKOS_LAMBDA [=]__host__ __device__
#if defined( KOKKOS_ENABLE_CXX1Z )
#define KOKKOS_CLASS_LAMBDA [=,*this] __host__ __device__
#endif
#endif
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ ) */
#endif
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
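In practice the macro lets one lambda source serve both builds; a short sketch (bounds and body illustrative):

```cpp
void dispatch()
{
  // With nvcc >= 8.0 KOKKOS_LAMBDA expands to [=] __host__ __device__;
  // in a host-only build it is plain [=], so the same source compiles both ways.
  Kokkos::parallel_for( 100, KOKKOS_LAMBDA( const int i ) { /* ... */ } );
}
```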
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
// Cuda version 8.0 still needs the functor wrapper
#if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ ) && defined(__NVCC__)
#if /* ( CUDA_VERSION < 8000 ) && */ defined( __NVCC__ )
#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
#endif
#endif
/*--------------------------------------------------------------------------*/
/* Language info: C++, CUDA, OPENMP */
//----------------------------------------------------------------------------
// Language info: C++, CUDA, OPENMP
#if defined( KOKKOS_ENABLE_CUDA )
// Compiling Cuda code to 'ptx'
@ -163,20 +160,17 @@
#define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__
#define KOKKOS_INLINE_FUNCTION __device__ __host__ inline
#define KOKKOS_FUNCTION __device__ __host__
#endif /* #if defined( __CUDA_ARCH__ ) */
#endif // #if defined( __CUDA_ARCH__ )
#if defined( _OPENMP )
// Compiling with OpenMP.
// The value of _OPENMP is an integer value YYYYMM
// where YYYY and MM are the year and month designation
// of the supported OpenMP API version.
#endif // #if defined( _OPENMP )
/* Compiling with OpenMP.
* The value of _OPENMP is an integer value YYYYMM
* where YYYY and MM are the year and month designation
* of the supported OpenMP API version.
*/
#endif /* #if defined( _OPENMP ) */
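A worked instance of the YYYYMM encoding (OpenMP 3.1 -> 201107, 4.0 -> 201307, 4.5 -> 201511):

```cpp
#if defined( _OPENMP ) && ( _OPENMP >= 201307 )
  // May rely on OpenMP 4.0 features here (e.g. #pragma omp simd).
#endif
```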
/*--------------------------------------------------------------------------*/
/* Mapping compiler built-ins to KOKKOS_COMPILER_*** macros */
//----------------------------------------------------------------------------
// Mapping compiler built-ins to KOKKOS_COMPILER_*** macros
#if defined( __NVCC__ )
// NVIDIA compiler is being used.
@ -184,29 +178,28 @@
// Host code is compiled again with another compiler.
// Device code is compile to 'ptx'.
#define KOKKOS_COMPILER_NVCC __NVCC__
#else
#if ! defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
#if !defined (KOKKOS_ENABLE_CUDA) // Compiling with clang for Cuda does not work with LAMBDAs either
#if !defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
#if !defined( KOKKOS_ENABLE_CUDA ) // Compiling with clang for Cuda does not work with LAMBDAs either
// CUDA (including version 6.5) does not support giving lambdas as
// arguments to global functions. Thus it's not currently possible
// to dispatch lambdas from the host.
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif
#endif /* #if defined( __NVCC__ ) */
#endif // #if defined( __NVCC__ )
#if !defined (KOKKOS_LAMBDA)
#if !defined( KOKKOS_LAMBDA )
#define KOKKOS_LAMBDA [=]
#endif
#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined (KOKKOS_CLASS_LAMBDA)
#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined( KOKKOS_CLASS_LAMBDA )
#define KOKKOS_CLASS_LAMBDA [=,*this]
#endif
//#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */
//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'.
/* Intel compiler for host code */
// Intel compiler for host code.
#if defined( __INTEL_COMPILER )
#define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
@ -218,7 +211,7 @@
#define KOKKOS_COMPILER_INTEL __ECC
#endif
/* CRAY compiler for host code */
// CRAY compiler for host code
#if defined( _CRAYC )
#define KOKKOS_COMPILER_CRAYC _CRAYC
#endif
@ -234,38 +227,41 @@
#define KOKKOS_COMPILER_APPLECC __APPLE_CC__
#endif
#if defined (__clang__) && !defined (KOKKOS_COMPILER_INTEL)
#if defined( __clang__ ) && !defined( KOKKOS_COMPILER_INTEL )
#define KOKKOS_COMPILER_CLANG __clang_major__*100+__clang_minor__*10+__clang_patchlevel__
#endif
#if ! defined( __clang__ ) && ! defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
#if !defined( __clang__ ) && !defined( KOKKOS_COMPILER_INTEL ) && defined( __GNUC__ )
#define KOKKOS_COMPILER_GNU __GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__
#if ( 472 > KOKKOS_COMPILER_GNU )
#error "Compiling with GCC version earlier than 4.7.2 is not supported."
#endif
#endif
#if defined( __PGIC__ ) && ! defined( __GNUC__ )
#if defined( __PGIC__ ) && !defined( __GNUC__ )
#define KOKKOS_COMPILER_PGI __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__
#if ( 1540 > KOKKOS_COMPILER_PGI )
#error "Compiling with PGI version earlier than 15.4 is not supported."
#endif
#endif
//#endif /* #if ! defined( __CUDA_ARCH__ ) */
//#endif // #if !defined( __CUDA_ARCH__ )
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Intel compiler macros */
//----------------------------------------------------------------------------
// Intel compiler macros
#if defined( KOKKOS_COMPILER_INTEL )
#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#if ( __INTEL_COMPILER > 1400 )
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
#endif
#define KOKKOS_RESTRICT __restrict__
#ifndef KOKKOS_ALIGN
@ -287,12 +283,13 @@
#warning "Compiling with Intel version 13.x probably works but is not officially supported. Official minimal version is 14.0."
#endif
#endif
#if ! defined( KOKKOS_ENABLE_ASM ) && ! defined( _WIN32 )
#if !defined( KOKKOS_ENABLE_ASM ) && !defined( _WIN32 )
#define KOKKOS_ENABLE_ASM 1
#endif
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#if !defined (_WIN32)
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#if !defined( _WIN32 )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#else
#define KOKKOS_FORCEINLINE_FUNCTION inline
@ -302,192 +299,170 @@
#if defined( __MIC__ )
// Compiling for Xeon Phi
#endif
#endif
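These KOKKOS_ENABLE_PRAGMA_* switches are how performance-sensitive loops guard vendor pragmas; a hedged sketch (the axpy loop is illustrative):

```cpp
void axpy( double * KOKKOS_RESTRICT y, const double * KOKKOS_RESTRICT x,
           const double a, const int n )
{
#if defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
  #pragma ivdep
#endif
  for ( int i = 0; i < n; ++i ) y[ i ] += a * x[ i ];
}
```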
/*--------------------------------------------------------------------------*/
/* Cray compiler macros */
//----------------------------------------------------------------------------
// Cray compiler macros
#if defined( KOKKOS_COMPILER_CRAYC )
#endif
/*--------------------------------------------------------------------------*/
/* IBM Compiler macros */
//----------------------------------------------------------------------------
// IBM Compiler macros
#if defined( KOKKOS_COMPILER_IBM )
#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#endif
/*--------------------------------------------------------------------------*/
/* CLANG compiler macros */
//----------------------------------------------------------------------------
// CLANG compiler macros
#if defined( KOKKOS_COMPILER_CLANG )
//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#endif
/*--------------------------------------------------------------------------*/
/* GNU Compiler macros */
//----------------------------------------------------------------------------
// GNU Compiler macros
#if defined( KOKKOS_COMPILER_GNU )
//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#if ! defined( KOKKOS_ENABLE_ASM ) && ! defined( __PGIC__ ) && \
( defined( __amd64 ) || \
defined( __amd64__ ) || \
defined( __x86_64 ) || \
defined( __x86_64__ ) )
#if !defined( KOKKOS_ENABLE_ASM ) && !defined( __PGIC__ ) && \
( defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ ) )
#define KOKKOS_ENABLE_ASM 1
#endif
#endif
/*--------------------------------------------------------------------------*/
//----------------------------------------------------------------------------
#if defined( KOKKOS_COMPILER_PGI )
#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#endif
/*--------------------------------------------------------------------------*/
//----------------------------------------------------------------------------
#if defined( KOKKOS_COMPILER_NVCC )
#if defined(__CUDA_ARCH__ )
#if defined( __CUDA_ARCH__ )
#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
#endif
#endif
//----------------------------------------------------------------------------
/** Define function marking macros if compiler specific macros are undefined: */
// Define function marking macros if compiler specific macros are undefined:
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline
#endif
#if ! defined( KOKKOS_INLINE_FUNCTION )
#define KOKKOS_INLINE_FUNCTION inline
#if !defined( KOKKOS_INLINE_FUNCTION )
#define KOKKOS_INLINE_FUNCTION inline
#endif
#if ! defined( KOKKOS_FUNCTION )
#define KOKKOS_FUNCTION /**/
#endif
//----------------------------------------------------------------------------
///** Define empty macro for restrict if necessary: */
#if ! defined(KOKKOS_RESTRICT)
#define KOKKOS_RESTRICT
#if !defined( KOKKOS_FUNCTION )
#define KOKKOS_FUNCTION /**/
#endif
//----------------------------------------------------------------------------
/** Define Macro for alignment: */
#if ! defined KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 16
#endif
// Define empty macro for restrict if necessary:
#if ! defined(KOKKOS_ALIGN)
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#endif
#if ! defined(KOKKOS_ALIGN_PTR)
#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
#if !defined( KOKKOS_RESTRICT )
#define KOKKOS_RESTRICT
#endif
//----------------------------------------------------------------------------
/** Determine the default execution space for parallel dispatch.
* There is zero or one default execution space specified.
*/
#if 1 < ( ( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
#error "More than one KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* specified" ;
// Define Macro for alignment:
#if !defined KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 16
#endif
/** If default is not specified then chose from enabled execution spaces.
* Priority: CUDA, OPENMP, THREADS, SERIAL
*/
#if defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
#elif defined ( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
#elif defined ( KOKKOS_ENABLE_OPENMP )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
#elif defined ( KOKKOS_ENABLE_PTHREAD )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
#if !defined( KOKKOS_ALIGN )
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#endif
#if !defined( KOKKOS_ALIGN_PTR )
#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
#endif
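A one-line sketch of the alignment macros, assuming a GNU-compatible compiler (the struct is illustrative):

```cpp
// Expands (by default) to struct __attribute__((aligned(16))) Vec3 { ... };
struct KOKKOS_ALIGN( KOKKOS_ALIGN_SIZE ) Vec3 { double x, y, z; };
```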
//----------------------------------------------------------------------------
// Determine the default execution space for parallel dispatch.
// There is zero or one default execution space specified.
#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
#error "More than one KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_* specified."
#endif
// If default is not specified then chose from enabled execution spaces.
// Priority: CUDA, OPENMP, THREADS, QTHREADS, SERIAL
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
#elif defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
#elif defined( KOKKOS_ENABLE_OPENMP )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
#elif defined( KOKKOS_ENABLE_PTHREAD )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
//#elif defined( KOKKOS_ENABLE_QTHREADS )
// #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
#else
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
#endif
//----------------------------------------------------------------------------
/** Determine for what space the code is being compiled: */
// Determine for what space the code is being compiled:
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined (KOKKOS_ENABLE_CUDA)
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
#else
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
#if defined(KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN)
#define KOKKOS_ENABLE_POSIX_MEMALIGN 1
#endif
#if defined( KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN )
#define KOKKOS_ENABLE_POSIX_MEMALIGN 1
#endif
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/**Enable Profiling by default**/
// Enable Profiling by default
#ifndef KOKKOS_ENABLE_PROFILING
#define KOKKOS_ENABLE_PROFILING 1
#define KOKKOS_ENABLE_PROFILING 1
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_MACROS_HPP */
#endif // #ifndef KOKKOS_MACROS_HPP

View File

@ -1294,6 +1294,7 @@ public:
KOKKOS_INLINE_FUNCTION
size_t get_min_block_size() const { return MIN_BLOCK_SIZE; }
KOKKOS_INLINE_FUNCTION
size_t get_mem_size() const { return m_data_size; }
private:

View File

@ -66,7 +66,6 @@
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
@ -196,6 +195,7 @@ struct VerifyExecutionCanAccessMemorySpace
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMP_Task.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP ) */

View File

@ -78,16 +78,14 @@ struct pair
/// This calls the default constructors of T1 and T2. It won't
/// compile if those default constructors are not defined and
/// public.
KOKKOS_FORCEINLINE_FUNCTION
pair()
: first(), second()
{}
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair() = default;
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair(first_type const& f, second_type const& s)
: first(f), second(s)
{}
@ -97,7 +95,7 @@ struct pair
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
@ -107,7 +105,7 @@ struct pair
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair( const volatile pair<U,V> &p)
: first(p.first), second(p.second)
{}
@ -183,7 +181,7 @@ struct pair<T1&, T2&>
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair(first_type f, second_type s)
: first(f), second(s)
{}
@ -193,7 +191,7 @@ struct pair<T1&, T2&>
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
@ -247,7 +245,7 @@ struct pair<T1, T2&>
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair(first_type const& f, second_type s)
: first(f), second(s)
{}
@ -257,7 +255,7 @@ struct pair<T1, T2&>
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
@ -311,7 +309,7 @@ struct pair<T1&, T2>
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair(first_type f, second_type const& s)
: first(f), second(s)
{}
@ -321,7 +319,7 @@ struct pair<T1&, T2>
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
@ -366,31 +364,31 @@ bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
//! Inequality operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(lhs==rhs); }
//! Less-than operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator< (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); }
//! Less-than-or-equal-to operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(rhs<lhs); }
//! Greater-than operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator> (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return rhs<lhs; }
//! Greater-than-or-equal-to operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(lhs<rhs); }
@ -399,7 +397,7 @@ bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
/// This is a "nonmember constructor" for Kokkos::pair. It works just
/// like std::make_pair.
template <class T1,class T2>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair<T1,T2> make_pair (T1 x, T2 y)
{ return ( pair<T1,T2>(x,y) ); }
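With the constexpr qualifiers added above, `Kokkos::pair` works like `std::pair` but inside device code; a small sketch:

```cpp
KOKKOS_INLINE_FUNCTION
double lookup()
{
  // make_pair deduces Kokkos::pair< int, double >; comparisons are lexicographic.
  Kokkos::pair< int, double > p = Kokkos::make_pair( 3, 2.5 );
  return p.first < 4 ? p.second : 0.0;
}
```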
@ -460,23 +458,21 @@ struct pair<T1,void>
first_type first;
enum { second = 0 };
KOKKOS_FORCEINLINE_FUNCTION
pair()
: first()
{}
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair() = default;
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair(const first_type & f)
: first(f)
{}
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair(const first_type & f, int)
: first(f)
{}
template <class U>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair( const pair<U,void> &p)
: first(p.first)
{}
@ -495,32 +491,32 @@ struct pair<T1,void>
//
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return lhs.first==rhs.first; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(lhs==rhs); }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator< (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return lhs.first<rhs.first; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(rhs<lhs); }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator> (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return rhs<lhs; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(lhs<rhs); }
@ -528,3 +524,4 @@ bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
#endif // KOKKOS_PAIR_HPP

View File

@ -52,13 +52,14 @@
#include <Kokkos_View.hpp>
#include <Kokkos_ExecPolicy.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <typeinfo>
#endif
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_FunctorAnalysis.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#ifdef KOKKOS_DEBUG
@ -175,7 +176,7 @@ void parallel_for( const ExecPolicy & policy
, typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
)
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
@ -188,7 +189,7 @@ void parallel_for( const ExecPolicy & policy
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelFor(kpID);
}
@ -207,7 +208,7 @@ void parallel_for( const size_t work_count
execution_space ;
typedef RangePolicy< execution_space > policy ;
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
@ -220,7 +221,7 @@ void parallel_for( const size_t work_count
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelFor(kpID);
}
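The begin/end hooks above are what give kernels names in profiling tools; passing the optional trailing string replaces the `typeid` fallback. A hedged sketch (names illustrative):

```cpp
void axpby( Kokkos::View< double* > y, Kokkos::View< const double* > x,
            const double a, const double b, const int n )
{
  // The label replaces typeid(FunctorType).name() in profiler output.
  Kokkos::parallel_for( n, KOKKOS_LAMBDA( const int i ) {
    y( i ) = a * x( i ) + b * y( i );
  }, "axpby" );
}
```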
@ -417,7 +418,7 @@ void parallel_scan( const ExecutionPolicy & policy
, typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0
)
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
@ -430,7 +431,7 @@ void parallel_scan( const ExecutionPolicy & policy
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
@ -450,7 +451,7 @@ void parallel_scan( const size_t work_count
typedef Kokkos::RangePolicy< execution_space > policy ;
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
@ -463,7 +464,7 @@ void parallel_scan( const size_t work_count
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}

View File

@ -1094,7 +1094,7 @@ namespace Impl {
const PolicyType& policy,
const FunctorType& functor,
ReturnType& return_value) {
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID);
@ -1116,7 +1116,7 @@ namespace Impl {
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelReduce(kpID);
}

View File

@ -41,52 +41,70 @@
//@HEADER
*/
#ifndef KOKKOS_QTHREAD_HPP
#define KOKKOS_QTHREAD_HPP
#ifndef KOKKOS_QTHREADS_HPP
#define KOKKOS_QTHREADS_HPP
#include <Kokkos_Core_fwd.hpp>
#ifdef KOKKOS_ENABLE_QTHREADS
// Defines to enable experimental Qthreads functionality.
#define QTHREAD_LOCAL_PRIORITY
#define CLONED_TASKS
#include <qthread.h>
#include <cstddef>
#include <iosfwd>
#include <Kokkos_Core.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp>
//#include <Kokkos_MemoryTraits.hpp>
//#include <Kokkos_ExecPolicy.hpp>
//#include <Kokkos_TaskScheduler.hpp> // Uncomment when Tasking working.
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class QthreadExec ;
class QthreadsExec;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Execution space supported by Qthread */
class Qthread {
/** \brief Execution space supported by Qthreads */
class Qthreads {
public:
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as an execution space
typedef Qthread execution_space ;
typedef Kokkos::HostSpace memory_space ;
typedef Qthreads execution_space;
typedef Kokkos::HostSpace memory_space;
//! This execution space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef Kokkos::Device< execution_space, memory_space > device_type;
typedef Kokkos::LayoutRight array_layout ;
typedef memory_space::size_type size_type ;
typedef Kokkos::LayoutRight array_layout;
typedef memory_space::size_type size_type;
typedef ScratchMemorySpace< Qthread > scratch_memory_space ;
typedef ScratchMemorySpace< Qthreads > scratch_memory_space;
//@}
/*------------------------------------------------------------------------*/
/** \brief Initialization will construct one or more instances */
static Qthread & instance( int = 0 );
static Qthreads & instance( int = 0 );
/** \brief Set the execution space to a "sleep" state.
*
@ -128,26 +146,24 @@ public:
static void finalize();
/** \brief Print configuration information to the given output stream. */
static void print_configuration( std::ostream & , const bool detail = false );
static void print_configuration( std::ostream &, const bool detail = false );
int shepherd_size() const ;
int shepherd_worker_size() const ;
int shepherd_size() const;
int shepherd_worker_size() const;
};
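
For orientation, a minimal usage sketch of the renamed back-end, assuming Kokkos was configured with Qthreads enabled; scale() and its kernel body are illustrative, while RangePolicy and parallel_for are the established API:

    #include <Kokkos_Core.hpp>

    void scale( double * x , const int n )
    {
      // Run a range kernel explicitly on the Qthreads execution space.
      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Qthreads >( 0 , n )
                          , KOKKOS_LAMBDA ( const int i ) { x[i] *= 2.0 ; } );
    }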
/*--------------------------------------------------------------------------*/
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<>
struct MemorySpaceAccess
< Kokkos::Qthread::memory_space
, Kokkos::Qthread::scratch_memory_space
< Kokkos::Qthreads::memory_space
, Kokkos::Qthreads::scratch_memory_space
>
{
enum { assignable = false };
@ -157,27 +173,26 @@ struct MemorySpaceAccess
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::Qthread::memory_space
, Kokkos::Qthread::scratch_memory_space
< Kokkos::Qthreads::memory_space
, Kokkos::Qthreads::scratch_memory_space
>
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
inline static void verify( void ) {}
inline static void verify( const void * ) {}
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
#include <Kokkos_Parallel.hpp>
#include <Qthread/Kokkos_QthreadExec.hpp>
#include <Qthread/Kokkos_Qthread_Parallel.hpp>
#include <Qthreads/Kokkos_QthreadsExec.hpp>
#include <Qthreads/Kokkos_Qthreads_Parallel.hpp>
//#include <Qthreads/Kokkos_Qthreads_Task.hpp> // Uncomment when Tasking working.
//#include <Qthreads/Kokkos_Qthreads_TaskQueue.hpp> // Uncomment when Tasking working.
#endif /* #define KOKKOS_QTHREAD_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif // #define KOKKOS_ENABLE_QTHREADS
#endif // #define KOKKOS_QTHREADS_HPP

View File

@ -56,6 +56,8 @@
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <impl/Kokkos_FunctorAnalysis.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
@ -138,30 +140,15 @@ public:
static void initialize( unsigned threads_count = 1 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 ,
bool allow_asynchronous_threadpool = false) {
(void) threads_count;
(void) use_numa_count;
(void) use_cores_per_numa;
(void) allow_asynchronous_threadpool;
bool allow_asynchronous_threadpool = false);
// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
static int is_initialized() { return 1 ; }
static int is_initialized();
/** \brief Return the maximum amount of concurrency. */
static int concurrency() {return 1;};
//! Free any resources being consumed by the device.
static void finalize() {
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
static void finalize();
//! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
@ -177,10 +164,6 @@ public:
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
//--------------------------------------------------------------------------
static void * scratch_memory_resize( unsigned reduce_size , unsigned shared_size );
//--------------------------------------------------------------------------
};
} // namespace Kokkos
@ -213,22 +196,6 @@ struct VerifyExecutionCanAccessMemorySpace
inline static void verify( const void * ) { }
};
namespace SerialImpl {
struct Sentinel {
void * m_scratch ;
unsigned m_reduce_end ;
unsigned m_shared_end ;
Sentinel();
~Sentinel();
static Sentinel & singleton();
};
inline
unsigned align( unsigned n );
}
} // namespace Impl
} // namespace Kokkos
@ -238,89 +205,26 @@ unsigned align( unsigned n );
namespace Kokkos {
namespace Impl {
class SerialTeamMember {
private:
typedef Kokkos::ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
const scratch_memory_space m_space ;
const int m_league_rank ;
const int m_league_size ;
// Resize thread team data scratch memory
void serial_resize_thread_team_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes );
SerialTeamMember & operator = ( const SerialTeamMember & );
HostThreadTeamData * serial_get_thread_team_data();
public:
} /* namespace Impl */
} /* namespace Kokkos */
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_shmem() const { return m_space ; }
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_scratch(int) const
{ return m_space ; }
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & thread_scratch(int) const
{ return m_space ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast(const ValueType& , const int& ) const {}
template< class ValueType, class JoinOp >
KOKKOS_INLINE_FUNCTION
ValueType team_reduce( const ValueType & value , const JoinOp & ) const
{
return value ;
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering,
* combined with a non-deterministic inter-team accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
{
const Type tmp = global_accum ? *global_accum : Type(0) ;
if ( global_accum ) { *global_accum += value ; }
return tmp ;
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & ) const
{ return Type(0); }
//----------------------------------------
// Execution space specific:
SerialTeamMember( int arg_league_rank
, int arg_league_size
, int arg_shared_size
);
};
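
To make the team_scan contract concrete, a hypothetical team of size 4 in which every rank contributes the value 3 would observe:

    //   rank:              0   1   2   3
    //   team_scan( 3 ):    0   3   6   9    exclusive prefix over lower ranks
    //   reduction total:   team_scan( 3 ) + 3 == 12 on the highest rank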
} // namespace Impl
namespace Kokkos {
namespace Impl {
/*
* < Kokkos::Serial , WorkArgTag >
* < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type >
*
*/
namespace Impl {
template< class ... Properties >
class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<Properties...>
{
@ -441,14 +345,11 @@ public:
return p;
};
typedef Impl::SerialTeamMember member_type ;
typedef Impl::HostThreadTeamMember< Kokkos::Serial > member_type ;
};
} /* namespace Impl */
} /* namespace Kokkos */
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Parallel patterns for Kokkos::Serial with RangePolicy */
@ -521,11 +422,12 @@ private:
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -535,34 +437,25 @@ private:
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
exec( reference_type update ) const
{
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( i , update );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
template< class TagType >
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
exec( reference_type update ) const
{
const TagType t{} ;
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( t , i , update );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
public:
@ -570,10 +463,29 @@ public:
inline
void execute() const
{
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
const size_t pool_reduce_size =
Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) );
const size_t team_reduce_size = 0 ; // Never shrinks
const size_t team_shared_size = 0 ; // Never shrinks
const size_t thread_local_size = 0 ; // Never shrinks
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
serial_resize_thread_team_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
HostThreadTeamData & data = *serial_get_thread_team_data();
pointer_type ptr =
m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
reference_type update =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
this-> template exec< WorkTag >( update );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
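
The resize/init/exec/final sequence above is the machinery behind the ordinary user-facing reduction; a minimal sketch using only the established API, with n and the kernel body illustrative:

    double sum = 0.0 ;
    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Serial >( 0 , n )
                           , KOKKOS_LAMBDA ( const int i , double & update )
                             { update += 1.0 * i ; }
                           , sum );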
template< class HostViewType >
@ -587,7 +499,7 @@ public:
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.ptr_on_device() )
, m_result_ptr( arg_result_view.data() )
{
static_assert( Kokkos::is_view< HostViewType >::value
, "Kokkos::Serial reduce result must be a View" );
@ -623,11 +535,13 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef FunctorAnalysis< FunctorPatternInterface::SCAN , Policy , FunctorType > Analysis ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -635,10 +549,8 @@ private:
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
exec( reference_type update ) const
{
reference_type update = ValueInit::init( m_functor , ptr );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( i , update , true );
@ -648,11 +560,9 @@ private:
template< class TagType >
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
exec( reference_type update ) const
{
const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( t , i , update , true );
@ -664,9 +574,22 @@ public:
inline
void execute() const
{
pointer_type ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( m_functor ) , 0 );
this-> template exec< WorkTag >( ptr );
const size_t pool_reduce_size = Analysis::value_size( m_functor );
const size_t team_reduce_size = 0 ; // Never shrinks
const size_t team_shared_size = 0 ; // Never shrinks
const size_t thread_local_size = 0 ; // Never shrinks
serial_resize_thread_team_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
HostThreadTeamData & data = *serial_get_thread_team_data();
reference_type update =
ValueInit::init( m_functor , pointer_type(data.pool_reduce_local()) );
this-> template exec< WorkTag >( update );
}
inline
@ -696,6 +619,8 @@ class ParallelFor< FunctorType
{
private:
enum { TEAM_REDUCE_SIZE = 512 };
typedef TeamPolicyInternal< Kokkos::Serial , Properties ...> Policy ;
typedef typename Policy::member_type Member ;
@ -706,21 +631,21 @@ private:
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec() const
exec( HostThreadTeamData & data ) const
{
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( Member(ileague,m_league,m_shared) );
m_functor( Member(data,ileague,m_league) );
}
}
template< class TagType >
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec() const
exec( HostThreadTeamData & data ) const
{
const TagType t{} ;
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( t , Member(ileague,m_league,m_shared) );
m_functor( t , Member(data,ileague,m_league) );
}
}
@ -729,15 +654,28 @@ public:
inline
void execute() const
{
Kokkos::Serial::scratch_memory_resize( 0 , m_shared );
this-> template exec< typename Policy::work_tag >();
const size_t pool_reduce_size = 0 ; // Never shrinks
const size_t team_reduce_size = TEAM_REDUCE_SIZE ;
const size_t team_shared_size = m_shared ;
const size_t thread_local_size = 0 ; // Never shrinks
serial_resize_thread_team_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
HostThreadTeamData & data = *serial_get_thread_team_data();
this->template exec< typename Policy::work_tag >( data );
}
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
, m_shared( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
{ }
};
@ -752,18 +690,22 @@ class ParallelReduce< FunctorType
{
private:
enum { TEAM_REDUCE_SIZE = 512 };
typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const int m_league ;
@ -774,33 +716,23 @@ private:
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
exec( HostThreadTeamData & data , reference_type update ) const
{
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( Member(ileague,m_league,m_shared) , update );
m_functor( Member(data,ileague,m_league) , update );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
template< class TagType >
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
exec( HostThreadTeamData & data , reference_type update ) const
{
const TagType t{} ;
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( t , Member(ileague,m_league,m_shared) , update );
m_functor( t , Member(data,ileague,m_league) , update );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
public:
@ -808,10 +740,31 @@ public:
inline
void execute() const
{
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , m_shared );
const size_t pool_reduce_size =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
const size_t team_reduce_size = TEAM_REDUCE_SIZE ;
const size_t team_shared_size = m_shared ;
const size_t thread_local_size = 0 ; // Never shrinks
serial_resize_thread_team_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
HostThreadTeamData & data = *serial_get_thread_team_data();
pointer_type ptr =
m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
reference_type update =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
this-> template exec< WorkTag >( data , update );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
template< class ViewType >
@ -825,8 +778,10 @@ public:
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
, m_result_ptr( arg_result.data() )
, m_shared( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
{
static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
@ -844,7 +799,9 @@ public:
, m_league( arg_policy.league_size() )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shared( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
@ -858,261 +815,6 @@ public:
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Nested parallel patterns for Kokkos::Serial with TeamPolicy */
namespace Kokkos {
namespace Impl {
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType,SerialTeamMember> {
typedef iType index_type;
const iType begin ;
const iType end ;
enum {increment = 1};
const SerialTeamMember& thread;
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_count)
: begin(0)
, end(arg_count)
, thread(arg_thread)
{}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_begin, const iType & arg_end )
: begin( arg_begin )
, end( arg_end)
, thread( arg_thread )
{}
};
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType,SerialTeamMember> {
typedef iType index_type;
enum {start = 0};
const iType end;
enum {increment = 1};
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const SerialTeamMember& thread, const iType& count):
end( count )
{}
};
} // namespace Impl
template< typename iType >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SerialTeamMember >( thread, count );
}
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::SerialTeamMember >
TeamThreadRange( const Impl::SerialTeamMember& thread, const iType1 & begin, const iType2 & end )
{
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SerialTeamMember >( thread, iType(begin), iType(end) );
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >
ThreadVectorRange(const Impl::SerialTeamMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::SerialTeamMember> PerTeam(const Impl::SerialTeamMember& thread) {
return Impl::ThreadSingleStruct<Impl::SerialTeamMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::SerialTeamMember> PerThread(const Impl::SerialTeamMember& thread) {
return Impl::VectorSingleStruct<Impl::SerialTeamMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
}
} //namespace Kokkos
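
A sketch of how the join overload above is called from inside a team functor; member, x, n, and the max-reduction are illustrative:

    double team_max = -1.0e300 ;  // input must be the neutral element for the join
    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n )
                           , [&] ( const int i , double & val )
                             { if ( x[i] > val ) val = x[i] ; }
                           , [] ( double & dst , const double & src )
                             { if ( src > dst ) dst = src ; }
                           , team_max );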
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = result;
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,scan_val,true);
}
}
} // namespace Kokkos
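
A sketch of the scan contract just described: the contribution of each i is accumulated on every invocation, while the exclusive prefix is written only when final is true; member, x, y, and n are illustrative:

    Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member , n )
                         , [&] ( const int i , double & val , const bool final )
                           {
                             if ( final ) { y[i] = val ; }  // exclusive prefix, final pass only
                             val += x[i] ;                  // contribution added on every pass
                           } );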
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
lambda(val);
}
}
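
These guards let exactly one vector lane (PerThread) or one thread (PerTeam) perform a side effect; a sketch in which member and compute_once() are illustrative, and the value overload broadcasts the result on back-ends with real teams:

    double val ;
    Kokkos::single( Kokkos::PerTeam( member )
                  , [&] ( double & v ) { v = compute_once() ; }
                  , val );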
//----------------------------------------------------------------------------
#include <impl/Kokkos_Serial_Task.hpp>

View File

@ -82,6 +82,15 @@ class Future ;
template< typename Space >
class TaskScheduler ;
template< typename Space >
void wait( TaskScheduler< Space > const & );
template< typename Space >
struct is_scheduler : public std::false_type {};
template< typename Space >
struct is_scheduler< TaskScheduler< Space > > : public std::true_type {};
} // namespace Kokkos
#include <impl/Kokkos_TaskQueue.hpp>
@ -109,9 +118,6 @@ namespace Impl {
template< typename Space , typename ResultType , typename FunctorType >
class TaskBase ;
template< typename Space >
class TaskExec ;
} // namespace Impl
} // namespace Kokkos
@ -312,6 +318,19 @@ public:
}
};
// Is a Future with the given execution space
template< typename , typename ExecSpace = void >
struct is_future : public std::false_type {};
template< typename Arg1 , typename Arg2 , typename ExecSpace >
struct is_future< Future<Arg1,Arg2> , ExecSpace >
: public std::integral_constant
< bool ,
( std::is_same< ExecSpace , void >::value ||
std::is_same< ExecSpace
, typename Future<Arg1,Arg2>::execution_space >::value )
> {};
} // namespace Kokkos
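
Together with is_scheduler above, this trait lets interfaces constrain their arguments at compile time, as the static_asserts in TaskTeam and TaskSingle below do; a sketch:

    static_assert( Kokkos::is_scheduler< Kokkos::TaskScheduler< Kokkos::Serial > >::value
                 , "a TaskScheduler is a scheduler" );
    static_assert( Kokkos::is_future< Kokkos::Future< double , Kokkos::Serial > >::value
                 , "a Future is a future" );
    static_assert( ! Kokkos::is_future< int >::value , "an int is not a future" );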
//----------------------------------------------------------------------------
@ -319,18 +338,59 @@ public:
namespace Kokkos {
enum TaskType { TaskTeam = Impl::TaskBase<void,void,void>::TaskTeam
, TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle };
enum TaskPriority { TaskHighPriority = 0
, TaskRegularPriority = 1
, TaskLowPriority = 2 };
template< typename Space >
void wait( TaskScheduler< Space > const & );
enum class TaskPriority : int { High = 0
, Regular = 1
, Low = 2 };
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
template< int TaskEnum , typename DepFutureType >
struct TaskPolicyData
{
using execution_space = typename DepFutureType::execution_space ;
using scheduler_type = TaskScheduler< execution_space > ;
enum : int { m_task_type = TaskEnum };
scheduler_type const * m_scheduler ;
DepFutureType const m_dependence ;
int m_priority ;
TaskPolicyData() = delete ;
TaskPolicyData( TaskPolicyData && ) = default ;
TaskPolicyData( TaskPolicyData const & ) = default ;
TaskPolicyData & operator = ( TaskPolicyData && ) = default ;
TaskPolicyData & operator = ( TaskPolicyData const & ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicyData( DepFutureType && arg_future
, Kokkos::TaskPriority const & arg_priority )
: m_scheduler( 0 )
, m_dependence( arg_future )
, m_priority( static_cast<int>( arg_priority ) )
{}
KOKKOS_INLINE_FUNCTION
TaskPolicyData( scheduler_type const & arg_scheduler
, Kokkos::TaskPriority const & arg_priority )
: m_scheduler( & arg_scheduler )
, m_dependence()
, m_priority( static_cast<int>( arg_priority ) )
{}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
@ -348,52 +408,13 @@ private:
queue_type * m_queue ;
//----------------------------------------
// Process optional arguments to spawn and respawn functions
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const ) {}
// TaskTeam or TaskSingle
template< typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, TaskType const & arg
, Options const & ... opts )
{
task->m_task_type = arg ;
assign( task , opts ... );
}
// TaskHighPriority or TaskRegularPriority or TaskLowPriority
template< typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, TaskPriority const & arg
, Options const & ... opts )
{
task->m_priority = arg ;
assign( task , opts ... );
}
// Future for a dependence
template< typename A1 , typename A2 , typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, Future< A1 , A2 > const & arg
, Options const & ... opts )
{
task->add_dependence( arg.m_task );
assign( task , opts ... );
}
//----------------------------------------
public:
using execution_policy = TaskScheduler ;
using execution_space = ExecSpace ;
using memory_space = typename queue_type::memory_space ;
using member_type = Kokkos::Impl::TaskExec< ExecSpace > ;
using member_type =
typename Kokkos::Impl::TaskQueueSpecialization< ExecSpace >::member_type ;
KOKKOS_INLINE_FUNCTION
TaskScheduler() : m_track(), m_queue(0) {}
@ -460,18 +481,13 @@ public:
//----------------------------------------
/**\brief A task spawns a task with options
*
* 1) High, Normal, or Low priority
* 2) With or without dependence
* 3) Team or Serial
*/
template< typename FunctorType , typename ... Options >
KOKKOS_FUNCTION
Future< typename FunctorType::value_type , ExecSpace >
task_spawn( FunctorType const & arg_functor
, Options const & ... arg_options
) const
template< int TaskEnum , typename DepFutureType , typename FunctorType >
KOKKOS_FUNCTION static
Kokkos::Future< typename FunctorType::value_type , execution_space >
spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
, typename task_base::function_type arg_function
, FunctorType && arg_functor
)
{
using value_type = typename FunctorType::value_type ;
using future_type = Future< value_type , execution_space > ;
@ -479,11 +495,21 @@ public:
, value_type
, FunctorType > ;
queue_type * const queue =
arg_policy.m_scheduler ? arg_policy.m_scheduler->m_queue : (
arg_policy.m_dependence.m_task
? arg_policy.m_dependence.m_task->m_queue
: (queue_type*) 0 );
if ( 0 == queue ) {
Kokkos::abort("Kokkos spawn given null Future" );
}
//----------------------------------------
// Give single-thread back-ends an opportunity to clear
// the queue of ready tasks before allocating a new task.
m_queue->iff_single_thread_recursive_execute();
queue->iff_single_thread_recursive_execute();
//----------------------------------------
@ -491,176 +517,129 @@ public:
// Allocate task from memory pool
f.m_task =
reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
reinterpret_cast< task_type * >(queue->allocate(sizeof(task_type)));
if ( f.m_task ) {
// Placement new construction
new ( f.m_task ) task_type( arg_functor );
// Reference count starts at two:
// +1 for the matching decrement when task is complete
// +1 for the future
new ( f.m_task )
task_type( arg_function
, queue
, arg_policy.m_dependence.m_task /* dependence */
, 2 /* reference count */
, int(sizeof(task_type)) /* allocation size */
, int(arg_policy.m_task_type)
, int(arg_policy.m_priority)
, std::move(arg_functor) );
// Reference count starts at two
// +1 for matching decrement when task is complete
// +1 for future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = sizeof(task_type);
// The dependence (if any) is processed immediately
// within the schedule function, as such the dependence's
// reference count does not need to be incremented for
// the assignment.
assign( f.m_task , arg_options... );
// Spawning from within the execution space so the
// apply function pointer is guaranteed to be valid
f.m_task->m_apply = task_type::apply ;
m_queue->schedule( f.m_task );
// this task may be updated or executed at any moment
queue->schedule_runnable( f.m_task );
// This task may be updated or executed at any moment,
// even during the call to 'schedule'.
}
return f ;
}
/**\brief The host process spawns a task with options
*
* 1) High, Normal, or Low priority
* 2) With or without dependence
* 3) Team or Serial
*/
template< typename FunctorType , typename ... Options >
inline
Future< typename FunctorType::value_type , ExecSpace >
host_spawn( FunctorType const & arg_functor
, Options const & ... arg_options
) const
template< typename FunctorType , typename A1 , typename A2 >
KOKKOS_FUNCTION static
void
respawn( FunctorType * arg_self
, Future<A1,A2> const & arg_dependence
, TaskPriority const & arg_priority
)
{
// Precondition: task is in Executing state
using value_type = typename FunctorType::value_type ;
using future_type = Future< value_type , execution_space > ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
if ( m_queue == 0 ) {
Kokkos::abort("Kokkos::TaskScheduler not initialized");
}
future_type f ;
// Allocate task from memory pool
f.m_task =
reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) );
if ( f.m_task ) {
// Placement new construction
new( f.m_task ) task_type( arg_functor );
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = sizeof(task_type);
assign( f.m_task , arg_options... );
// Potentially spawning outside execution space so the
// apply function pointer must be obtained from execution space.
// Required for Cuda execution space function pointer.
m_queue->template proc_set_apply< FunctorType >( & f.m_task->m_apply );
m_queue->schedule( f.m_task );
}
return f ;
task_type * const task = static_cast< task_type * >( arg_self );
task->m_priority = static_cast<int>(arg_priority);
task->add_dependence( arg_dependence.m_task );
// Postcondition: task is in Executing-Respawn state
}
//----------------------------------------
/**\brief Return a future that is complete
* when all input futures are complete.
*/
template< typename A1 , typename A2 >
KOKKOS_FUNCTION
Future< ExecSpace >
when_all( int narg , Future< A1 , A2 > const * const arg ) const
KOKKOS_FUNCTION static
Future< execution_space >
when_all( Future< A1 , A2 > const arg[] , int narg )
{
static_assert
( std::is_same< execution_space
, typename Future< A1 , A2 >::execution_space
>::value
, "Future must have same execution space" );
using future_type = Future< ExecSpace > ;
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
using future_type = Future< execution_space > ;
using task_base = Kokkos::Impl::TaskBase< execution_space , void , void > ;
future_type f ;
if ( narg ) {
queue_type * queue = 0 ;
for ( int i = 0 ; i < narg ; ++i ) {
task_base * const t = arg[i].m_task ;
if ( 0 != t ) {
// Increment reference count to track subsequent assignment.
Kokkos::atomic_increment( &(t->m_ref_count) );
if ( queue == 0 ) {
queue = t->m_queue ;
}
else if ( queue != t->m_queue ) {
Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
}
}
}
if ( queue != 0 ) {
size_t const size = sizeof(task_base) + narg * sizeof(task_base*);
f.m_task =
reinterpret_cast< task_base * >( m_queue->allocate( size ) );
reinterpret_cast< task_base * >( queue->allocate( size ) );
if ( f.m_task ) {
new( f.m_task ) task_base();
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = size ;
f.m_task->m_dep_count = narg ;
f.m_task->m_task_type = task_base::Aggregate ;
new( f.m_task ) task_base( queue
, 2 /* reference count */
, size /* allocation size */
, narg /* dependence count */
);
// Assign dependences, reference counts were already incremented
task_base ** const dep = f.m_task->aggregate_dependences();
// Assign dependences and increment their reference counts.
// The futures may be destroyed upon returning from this call,
// so the incremented count tracks each assignment.
for ( int i = 0 ; i < narg ; ++i ) { dep[i] = arg[i].m_task ; }
for ( int i = 0 ; i < narg ; ++i ) {
task_base * const t = dep[i] = arg[i].m_task ;
if ( 0 != t ) {
Kokkos::atomic_increment( &(t->m_ref_count) );
}
}
m_queue->schedule( f.m_task );
queue->schedule_aggregate( f.m_task );
// this when_all may be processed at any moment
}
}
}
return f ;
}
/**\brief An executing task respawns itself with options
*
* 1) High, Normal, or Low priority
* 2) With or without dependence
*/
template< class FunctorType , typename ... Options >
KOKKOS_FUNCTION
void respawn( FunctorType * task_self
, Options const & ... arg_options ) const
{
using value_type = typename FunctorType::value_type ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
task_type * const task = static_cast< task_type * >( task_self );
// Reschedule task with no dependences.
m_queue->reschedule( task );
// Dependences, if requested, are added here through parsing the arguments.
assign( task , arg_options... );
}
//----------------------------------------
template< typename S >
friend
void Kokkos::wait( Kokkos::TaskScheduler< S > const & );
//----------------------------------------
inline
KOKKOS_INLINE_FUNCTION
int allocation_capacity() const noexcept
{ return m_queue->m_memory.get_mem_size(); }
@ -676,12 +655,192 @@ public:
long allocated_task_count_accum() const noexcept
{ return m_queue->m_accum_alloc ; }
//----------------------------------------
template< typename S >
friend
void Kokkos::wait( Kokkos::TaskScheduler< S > const & );
};
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
//----------------------------------------------------------------------------
// Construct a TaskTeam execution policy
template< typename T >
Kokkos::Impl::TaskPolicyData
< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam
, typename std::conditional< Kokkos::is_future< T >::value , T ,
typename Kokkos::Future< typename T::execution_space > >::type
>
KOKKOS_INLINE_FUNCTION
TaskTeam( T const & arg
, TaskPriority const & arg_priority = TaskPriority::Regular
)
{
static_assert( Kokkos::is_future<T>::value ||
Kokkos::is_scheduler<T>::value
, "Kokkos TaskTeam argument must be Future or TaskScheduler" );
return
Kokkos::Impl::TaskPolicyData
< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam
, typename std::conditional< Kokkos::is_future< T >::value , T ,
typename Kokkos::Future< typename T::execution_space > >::type
>( arg , arg_priority );
}
// Construct a TaskSingle execution policy
template< typename T >
Kokkos::Impl::TaskPolicyData
< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle
, typename std::conditional< Kokkos::is_future< T >::value , T ,
typename Kokkos::Future< typename T::execution_space > >::type
>
KOKKOS_INLINE_FUNCTION
TaskSingle( T const & arg
, TaskPriority const & arg_priority = TaskPriority::Regular
)
{
static_assert( Kokkos::is_future<T>::value ||
Kokkos::is_scheduler<T>::value
, "Kokkos TaskSingle argument must be Future or TaskScheduler" );
return
Kokkos::Impl::TaskPolicyData
< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle
, typename std::conditional< Kokkos::is_future< T >::value , T ,
typename Kokkos::Future< typename T::execution_space > >::type
>( arg , arg_priority );
}
//----------------------------------------------------------------------------
/**\brief A host control thread spawns a task with options
*
* 1) Team or Serial
* 2) With scheduler or dependence
* 3) High, Normal, or Low priority
*/
template< int TaskEnum
, typename DepFutureType
, typename FunctorType >
Future< typename FunctorType::value_type
, typename DepFutureType::execution_space >
host_spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
, FunctorType && arg_functor
)
{
using exec_space = typename DepFutureType::execution_space ;
using scheduler = TaskScheduler< exec_space > ;
typedef Impl::TaskBase< exec_space
, typename FunctorType::value_type
, FunctorType
> task_type ;
static_assert( TaskEnum == task_type::TaskTeam ||
TaskEnum == task_type::TaskSingle
, "Kokkos host_spawn requires TaskTeam or TaskSingle" );
// May be spawning a Cuda task, must use the specialization
// to query on-device function pointer.
typename task_type::function_type const ptr =
Kokkos::Impl::TaskQueueSpecialization< exec_space >::
template get_function_pointer< task_type >();
return scheduler::spawn( arg_policy , ptr , std::move(arg_functor) );
}
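
Putting the new pieces together, a host-side spawn might look like the following sketch; MyTask is an illustrative functor providing value_type (here double) and operator()( member_type & , double & ), and scheduler construction is elided:

    Kokkos::TaskScheduler< Kokkos::Serial > sched /* = ... */ ;

    Kokkos::Future< double , Kokkos::Serial > f =
      Kokkos::host_spawn( Kokkos::TaskSingle( sched , Kokkos::TaskPriority::Regular )
                        , MyTask() );

    Kokkos::wait( sched );  // drain the queue; f is complete afterwards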
/**\brief A task spawns a task with options
*
* 1) Team or Serial
* 2) With scheduler or dependence
* 3) High, Normal, or Low priority
*/
template< int TaskEnum
, typename DepFutureType
, typename FunctorType >
Future< typename FunctorType::value_type
, typename DepFutureType::execution_space >
KOKKOS_INLINE_FUNCTION
task_spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
, FunctorType && arg_functor
)
{
using exec_space = typename DepFutureType::execution_space ;
using scheduler = TaskScheduler< exec_space > ;
typedef Impl::TaskBase< exec_space
, typename FunctorType::value_type
, FunctorType
> task_type ;
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) && \
defined( KOKKOS_ENABLE_CUDA )
static_assert( ! std::is_same< Kokkos::Cuda , exec_space >::value
, "Error calling Kokkos::task_spawn for Cuda space within Host code" );
#endif
static_assert( TaskEnum == task_type::TaskTeam ||
TaskEnum == task_type::TaskSingle
, "Kokkos host_spawn requires TaskTeam or TaskSingle" );
typename task_type::function_type const ptr = task_type::apply ;
return scheduler::spawn( arg_policy , ptr , std::move(arg_functor) );
}
/**\brief A task respawns itself with options
*
* 1) With scheduler or dependence
* 2) High, Normal, or Low priority
*/
template< typename FunctorType , typename T >
void
KOKKOS_INLINE_FUNCTION
respawn( FunctorType * arg_self
, T const & arg
, TaskPriority const & arg_priority = TaskPriority::Regular
)
{
static_assert( Kokkos::is_future<T>::value ||
Kokkos::is_scheduler<T>::value
, "Kokkos respawn argument must be Future or TaskScheduler" );
TaskScheduler< typename T::execution_space >::
respawn( arg_self , arg , arg_priority );
}
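
From inside a task's operator(), re-queueing behind a freshly spawned child then looks like this sketch; sched is assumed to be a scheduler captured in the functor, and ChildTask is illustrative:

    // Inside MyTask::operator()( member_type & member , double & result ):
    auto child = Kokkos::task_spawn( Kokkos::TaskSingle( sched ) , ChildTask() );
    Kokkos::respawn( this , child , Kokkos::TaskPriority::High );
    // This task re-enters the queue once 'child' completes.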
//----------------------------------------------------------------------------
template< typename A1 , typename A2 >
KOKKOS_INLINE_FUNCTION
Future< typename Future< A1 , A2 >::execution_space >
when_all( Future< A1 , A2 > const arg[]
, int narg
)
{
return TaskScheduler< typename Future<A1,A2>::execution_space >::
when_all( arg , narg );
}
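
Aggregation composes with spawn and respawn; a sketch in which f0 and f1 are assumed to be futures produced on the same scheduler (mixing schedulers aborts, as the member function above shows):

    Kokkos::Future< Kokkos::Serial > deps[2] = { f0 , f1 };
    Kokkos::Future< Kokkos::Serial > all = Kokkos::when_all( deps , 2 );
    // 'all' completes only after both inputs complete, and can serve as
    // the dependence of a subsequent spawn or respawn.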
//----------------------------------------------------------------------------
// Wait for all runnable tasks to complete
template< typename ExecSpace >
inline
void wait( TaskScheduler< ExecSpace > const & policy )
{ policy.m_queue->execute(); }
void wait( TaskScheduler< ExecSpace > const & scheduler )
{ scheduler.m_queue->execute(); }
} // namespace Kokkos

View File

@ -230,4 +230,3 @@ struct VerifyExecutionCanAccessMemorySpace
#endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) */
#endif /* #define KOKKOS_THREADS_HPP */

View File

@ -40,9 +40,9 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
CONDITIONAL_COPIES += copy-threads
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
KOKKOS_HEADERS_QTHREAD += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
CONDITIONAL_COPIES += copy-qthread
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
KOKKOS_HEADERS_QTHREADS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
CONDITIONAL_COPIES += copy-qthreads
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
@ -60,6 +60,12 @@ ifeq ($(KOKKOS_OS),Darwin)
COPY_FLAG =
endif
ifeq ($(KOKKOS_DEBUG),"no")
KOKKOS_DEBUG_CMAKE = OFF
else
KOKKOS_DEBUG_CMAKE = ON
endif
messages:
echo "Start Build"
@ -91,6 +97,7 @@ build-makefile-kokkos:
echo "" >> Makefile.kokkos
echo "#Internal settings which need to propagated for Kokkos examples" >> Makefile.kokkos
echo "KOKKOS_INTERNAL_USE_CUDA = ${KOKKOS_INTERNAL_USE_CUDA}" >> Makefile.kokkos
echo "KOKKOS_INTERNAL_USE_QTHREADS = ${KOKKOS_INTERNAL_USE_QTHREADS}" >> Makefile.kokkos
echo "KOKKOS_INTERNAL_USE_OPENMP = ${KOKKOS_INTERNAL_USE_OPENMP}" >> Makefile.kokkos
echo "KOKKOS_INTERNAL_USE_PTHREADS = ${KOKKOS_INTERNAL_USE_PTHREADS}" >> Makefile.kokkos
echo "" >> Makefile.kokkos
@ -107,7 +114,55 @@ build-makefile-kokkos:
> Makefile.kokkos.tmp
mv -f Makefile.kokkos.tmp Makefile.kokkos
build-lib: build-makefile-kokkos $(KOKKOS_LINK_DEPENDS)
build-cmake-kokkos:
rm -f kokkos.cmake
echo "#Global Settings used to generate this library" >> kokkos.cmake
echo "set(KOKKOS_PATH $(PREFIX) CACHE PATH \"Kokkos installation path\")" >> kokkos.cmake
echo "set(KOKKOS_DEVICES $(KOKKOS_DEVICES) CACHE STRING \"Kokkos devices list\")" >> kokkos.cmake
echo "set(KOKKOS_ARCH $(KOKKOS_ARCH) CACHE STRING \"Kokkos architecture flags\")" >> kokkos.cmake
echo "set(KOKKOS_DEBUG $(KOKKOS_DEBUG_CMAKE) CACHE BOOL \"Kokkos debug enabled ?)\")" >> kokkos.cmake
echo "set(KOKKOS_USE_TPLS $(KOKKOS_USE_TPLS) CACHE STRING \"Kokkos templates list\")" >> kokkos.cmake
echo "set(KOKKOS_CXX_STANDARD $(KOKKOS_CXX_STANDARD) CACHE STRING \"Kokkos C++ standard\")" >> kokkos.cmake
echo "set(KOKKOS_OPTIONS $(KOKKOS_OPTIONS) CACHE STRING \"Kokkos options\")" >> kokkos.cmake
echo "set(KOKKOS_CUDA_OPTIONS $(KOKKOS_CUDA_OPTIONS) CACHE STRING \"Kokkos Cuda options\")" >> kokkos.cmake
echo "if(NOT $ENV{CXX})" >> kokkos.cmake
echo ' message(WARNING "You are currently using compiler $${CMAKE_CXX_COMPILER} while Kokkos was built with $(CXX); make sure this is the behavior you intend.")' >> kokkos.cmake
echo "endif()" >> kokkos.cmake
echo "if(NOT DEFINED ENV{NVCC_WRAPPER})" >> kokkos.cmake
echo " set(NVCC_WRAPPER \"$(NVCC_WRAPPER)\" CACHE FILEPATH \"Path to command nvcc_wrapper\")" >> kokkos.cmake
echo "else()" >> kokkos.cmake
echo ' set(NVCC_WRAPPER $$ENV{NVCC_WRAPPER} CACHE FILEPATH "Path to command nvcc_wrapper")' >> kokkos.cmake
echo "endif()" >> kokkos.cmake
echo "" >> kokkos.cmake
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> kokkos.cmake
echo "set(KOKKOS_HEADERS \"$(KOKKOS_HEADERS)\" CACHE STRING \"Kokkos headers list\")" >> kokkos.cmake
echo "set(KOKKOS_SRC \"$(KOKKOS_SRC)\" CACHE STRING \"Kokkos source list\")" >> kokkos.cmake
echo "" >> kokkos.cmake
echo "#Variables used in application Makefiles" >> kokkos.cmake
echo "set(KOKKOS_CPP_DEPENDS \"$(KOKKOS_CPP_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_CXXFLAGS \"$(KOKKOS_CXXFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_CPPFLAGS \"$(KOKKOS_CPPFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_LINK_DEPENDS \"$(KOKKOS_LINK_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_LIBS \"$(KOKKOS_LIBS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_LDFLAGS \"$(KOKKOS_LDFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "" >> kokkos.cmake
echo "#Internal settings which need to propagated for Kokkos examples" >> kokkos.cmake
echo "set(KOKKOS_INTERNAL_USE_CUDA \"${KOKKOS_INTERNAL_USE_CUDA}\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_INTERNAL_USE_OPENMP \"${KOKKOS_INTERNAL_USE_OPENMP}\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_INTERNAL_USE_PTHREADS \"${KOKKOS_INTERNAL_USE_PTHREADS}\" CACHE STRING \"\")" >> kokkos.cmake
echo "mark_as_advanced(KOKKOS_HEADERS KOKKOS_SRC KOKKOS_INTERNAL_USE_CUDA KOKKOS_INTERNAL_USE_OPENMP KOKKOS_INTERNAL_USE_PTHREADS)" >> kokkos.cmake
echo "" >> kokkos.cmake
sed \
-e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \
-e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \
-e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \
-e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \
-e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \
-e 's|= KokkosCore_config.h|= $(PREFIX)/include/KokkosCore_config.h|g' kokkos.cmake \
> kokkos.cmake.tmp
mv -f kokkos.cmake.tmp kokkos.cmake
build-lib: build-makefile-kokkos build-cmake-kokkos $(KOKKOS_LINK_DEPENDS)
mkdir:
mkdir -p $(PREFIX)
@ -124,9 +179,9 @@ copy-threads: mkdir
mkdir -p $(PREFIX)/include/Threads
cp $(COPY_FLAG) $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads
copy-qthread: mkdir
mkdir -p $(PREFIX)/include/Qthread
cp $(COPY_FLAG) $(KOKKOS_HEADERS_QTHREAD) $(PREFIX)/include/Qthread
copy-qthreads: mkdir
mkdir -p $(PREFIX)/include/Qthreads
cp $(COPY_FLAG) $(KOKKOS_HEADERS_QTHREADS) $(PREFIX)/include/Qthreads
copy-openmp: mkdir
mkdir -p $(PREFIX)/include/OpenMP
@ -137,6 +192,7 @@ install: mkdir $(CONDITIONAL_COPIES) build-lib
cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
cp $(COPY_FLAG) Makefile.kokkos $(PREFIX)
cp $(COPY_FLAG) kokkos.cmake $(PREFIX)
cp $(COPY_FLAG) libkokkos.a $(PREFIX)/lib
cp $(COPY_FLAG) KokkosCore_config.h $(PREFIX)/include

View File

@ -46,7 +46,6 @@
#include <omp.h>
#include <iostream>
#include <Kokkos_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
@ -107,58 +106,41 @@ private:
public:
inline void execute() const {
this->template execute_schedule<typename Policy::schedule_type::type>();
}
template<class Schedule>
inline
typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
execute_schedule() const
inline void execute() const
{
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value };
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
ParallelFor::template exec_range< WorkTag >( m_functor , range.begin() , range.end() );
}
/* END #pragma omp parallel */
if ( is_dynamic ) {
// Make sure work partition is set before stealing
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
}
template<class Schedule>
inline
typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
execute_schedule() const
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
std::pair<int64_t,int64_t> range(0,0);
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
do {
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
exec.set_work_range(range.begin(),range.end(),m_policy.chunk_size());
exec.reset_steal_target();
#pragma omp barrier
ParallelFor::template
exec_range< WorkTag >( m_functor
, range.first + m_policy.begin()
, range.second + m_policy.begin() );
long work_index = exec.get_work_index();
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
ParallelFor::template exec_range< WorkTag >( m_functor , begin, end );
work_index = exec.get_work_index();
} while ( is_dynamic && 0 <= range.first );
}
}
/* END #pragma omp parallel */
// END #pragma omp parallel
}
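For orientation: the static and dynamic paths that were previously separate execute_schedule() overloads now share the single execute() above, keyed off the is_dynamic enum. On the caller side the schedule is selected through the policy's Schedule trait; a minimal sketch using the public API (loop bodies are placeholders, not from this commit):
#include <Kokkos_Core.hpp>
int main( int argc , char * argv[] ) {
  Kokkos::initialize( argc , argv );
  {
    // Static schedule (the default): each thread works one contiguous
    // partition, the get_work_partition() path above.
    Kokkos::parallel_for(
      Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , 1000 ),
      KOKKOS_LAMBDA( const int i ) { (void) i ; } );
    // Dynamic schedule: threads claim chunks and steal when exhausted,
    // the get_work_stealing_chunk() path above.
    Kokkos::parallel_for(
      Kokkos::RangePolicy< Kokkos::OpenMP , Kokkos::Schedule< Kokkos::Dynamic > >( 0 , 1000 ),
      KOKKOS_LAMBDA( const int i ) { (void) i ; } );
  }
  Kokkos::finalize();
}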
inline
@ -193,17 +175,18 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
// Static assert: WorkTag is void if ReducerType is not InvalidType
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -247,92 +230,70 @@ private:
public:
inline void execute() const {
this->template execute_schedule<typename Policy::schedule_type::type>();
}
template<class Schedule>
inline
typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
execute_schedule() const
inline void execute() const
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value };
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
const size_t pool_reduce_bytes =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
OpenMPexec::resize_thread_data( pool_reduce_bytes
, 0 // team_reduce_bytes
, 0 // team_shared_bytes
, 0 // thread_local_bytes
);
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
ParallelReduce::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) );
}
/* END #pragma omp parallel */
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
// Reduction:
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
if ( is_dynamic ) {
// Make sure work partition is set before stealing
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
reference_type update =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
, data.pool_reduce_local() );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
std::pair<int64_t,int64_t> range(0,0);
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
}
do {
template<class Schedule>
inline
typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
execute_schedule() const
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
exec.set_work_range(range.begin(),range.end(),m_policy.chunk_size());
exec.reset_steal_target();
#pragma omp barrier
long work_index = exec.get_work_index();
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() );
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
ParallelReduce::template exec_range< WorkTag >
( m_functor , begin,end
ParallelReduce::template
exec_range< WorkTag >( m_functor
, range.first + m_policy.begin()
, range.second + m_policy.begin()
, update );
work_index = exec.get_work_index();
} while ( is_dynamic && 0 <= range.first );
}
}
/* END #pragma omp parallel */
// END #pragma omp parallel
// Reduction:
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
, ptr
, OpenMPexec::get_thread_data(i)->pool_reduce_local() );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
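The join-then-final sequence above is what backs the user-facing reduction; a minimal caller-side sketch (standard Kokkos API of this era; the harmonic-sum body is illustrative):
#include <Kokkos_Core.hpp>
#include <cstdio>
int main( int argc , char * argv[] ) {
  Kokkos::initialize( argc , argv );
  {
    double sum = 0 ;
    // Each thread accumulates into its pool_reduce_local() slot; the
    // partial results are then joined into rank 0 and final() is applied.
    Kokkos::parallel_reduce(
      Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , 1000 ),
      KOKKOS_LAMBDA( const int i , double & update ) { update += 1.0 / ( i + 1 ) ; },
      sum );
    printf( "sum = %f\n" , sum );
  }
  Kokkos::finalize();
}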
@ -394,17 +355,18 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef FunctorAnalysis< FunctorPatternInterface::SCAN , Policy , FunctorType > Analysis ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType, WorkTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -452,53 +414,63 @@ public:
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
const int value_count = Analysis::value_count( m_functor );
const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );
OpenMPexec::resize_thread_data( pool_reduce_bytes
, 0 // team_reduce_bytes
, 0 // team_shared_bytes
, 0 // thread_local_bytes
);
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
const pointer_type ptr =
pointer_type( exec.scratch_reduce() ) +
ValueTraits::value_count( m_functor );
ParallelScan::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueInit::init( m_functor , ptr ) , false );
}
/* END #pragma omp parallel */
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
{
const unsigned thread_count = OpenMPexec::pool_size();
const unsigned value_count = ValueTraits::value_count( m_functor );
const WorkRange range( m_policy, data.pool_rank(), data.pool_size() );
reference_type update_sum =
ValueInit::init( m_functor , data.pool_reduce_local() );
ParallelScan::template exec_range< WorkTag >
( m_functor , range.begin() , range.end() , update_sum , false );
if ( data.pool_rendezvous() ) {
pointer_type ptr_prev = 0 ;
for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
const int n = data.pool_size();
pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
for ( int i = 0 ; i < n ; ++i ) {
if ( ptr_prev ) {
for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
ValueJoin::join( m_functor , ptr + value_count , ptr );
pointer_type ptr = (pointer_type)
data.pool_member(i)->pool_reduce_local();
if ( i ) {
for ( int j = 0 ; j < value_count ; ++j ) {
ptr[j+value_count] = ptr_prev[j+value_count] ;
}
ValueJoin::join( m_functor , ptr + value_count , ptr_prev );
}
else {
ValueInit::init( m_functor , ptr );
ValueInit::init( m_functor , ptr + value_count );
}
ptr_prev = ptr ;
}
data.pool_rendezvous_release();
}
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
const pointer_type ptr = pointer_type( exec.scratch_reduce() );
reference_type update_base =
ValueOps::reference
( ((pointer_type)data.pool_reduce_local()) + value_count );
ParallelScan::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueOps::reference( ptr ) , true );
( m_functor , range.begin() , range.end() , update_base , true );
}
/* END #pragma omp parallel */
}
//----------------------------------------
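The scan above makes two passes per thread, with an inter-thread exclusive scan at the pool rendezvous in between; the user-facing contract is the usual final-pass flag. A small sketch (illustrative body, not from this commit):
#include <Kokkos_Core.hpp>
#include <cstdio>
int main( int argc , char * argv[] ) {
  Kokkos::initialize( argc , argv );
  {
    // First pass (final_pass == false) accumulates per-thread totals;
    // second pass (final_pass == true) sees 'update' pre-seeded with the
    // exclusive prefix over all preceding iterations.
    Kokkos::parallel_scan(
      Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , 10 ),
      KOKKOS_LAMBDA( const int i , int & update , const bool final_pass ) {
        if ( final_pass ) { printf( "exclusive prefix at %d = %d\n" , i , update ) ; }
        update += i ;
      } );
  }
  Kokkos::finalize();
}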
@ -530,55 +502,59 @@ class ParallelFor< FunctorType
{
private:
enum { TEAM_REDUCE_SIZE = 512 };
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::schedule_type::type SchedTag ;
typedef typename Policy::member_type Member ;
const FunctorType m_functor ;
const Policy m_policy ;
const int m_shmem_size ;
template< class TagType, class Schedule >
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value && std::is_same<Schedule,Kokkos::Static>::value>::type
exec_team( const FunctorType & functor , Member member )
typename std::enable_if< ( std::is_same< TagType , void >::value ) >::type
exec_team( const FunctorType & functor
, HostThreadTeamData & data
, const int league_rank_begin
, const int league_rank_end
, const int league_size )
{
for ( ; member.valid_static() ; member.next_static() ) {
functor( member );
for ( int r = league_rank_begin ; r < league_rank_end ; ) {
functor( Member( data, r , league_size ) );
if ( ++r < league_rank_end ) {
// Don't allow team members to lap one another
// so that they don't overwrite shared memory.
if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
}
}
}
template< class TagType, class Schedule >
inline static
typename std::enable_if< (! std::is_same< TagType , void >::value) && std::is_same<Schedule,Kokkos::Static>::value >::type
exec_team( const FunctorType & functor , Member member )
{
const TagType t{} ;
for ( ; member.valid_static() ; member.next_static() ) {
functor( t , member );
}
}
template< class TagType, class Schedule >
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value && std::is_same<Schedule,Kokkos::Dynamic>::value>::type
exec_team( const FunctorType & functor , Member member )
typename std::enable_if< ( ! std::is_same< TagType , void >::value ) >::type
exec_team( const FunctorType & functor
, HostThreadTeamData & data
, const int league_rank_begin
, const int league_rank_end
, const int league_size )
{
#pragma omp barrier
for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
functor( member );
}
}
const TagType t{};
template< class TagType, class Schedule >
inline static
typename std::enable_if< (! std::is_same< TagType , void >::value) && std::is_same<Schedule,Kokkos::Dynamic>::value >::type
exec_team( const FunctorType & functor , Member member )
{
#pragma omp barrier
const TagType t{} ;
for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
functor( t , member );
for ( int r = league_rank_begin ; r < league_rank_end ; ) {
functor( t , Member( data, r , league_size ) );
if ( ++r < league_rank_end ) {
// Don't allow team members to lap one another
// so that they don't overwrite shared memory.
if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
}
}
}
@ -587,31 +563,75 @@ public:
inline
void execute() const
{
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t pool_reduce_size = 0 ; // Never shrinks
const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
const size_t thread_local_size = 0 ; // Never shrinks
OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1));
OpenMPexec::resize_thread_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
#pragma omp parallel
{
ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type>
( m_functor
, Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) );
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
const int active = data.organize_team( m_policy.team_size() );
if ( active ) {
data.set_work_partition( m_policy.league_size()
, ( 0 < m_policy.chunk_size()
? m_policy.chunk_size()
: m_policy.team_iter() ) );
}
/* END #pragma omp parallel */
if ( is_dynamic ) {
// Must synchronize to make sure each team has set its
// partition before beginning the work stealing loop.
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
}
if ( active ) {
std::pair<int64_t,int64_t> range(0,0);
do {
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
ParallelFor::template exec_team< WorkTag >
( m_functor , data
, range.first , range.second , m_policy.league_size() );
} while ( is_dynamic && 0 <= range.first );
}
data.disband_team();
}
// END #pragma omp parallel
}
inline
ParallelFor( const FunctorType & arg_functor ,
const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shmem_size( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >
::value( arg_functor , arg_policy.team_size() ) )
{}
};
//----------------------------------------------------------------------------
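Caller-side shape of the team path (standard TeamPolicy API; league size 8 is illustrative, and only team rank 0 contributes to the reduction so the result is independent of team size):
#include <Kokkos_Core.hpp>
#include <cstdio>
int main( int argc , char * argv[] ) {
  Kokkos::initialize( argc , argv );
  {
    typedef Kokkos::TeamPolicy< Kokkos::OpenMP > policy_type ;
    typedef policy_type::member_type member_type ;
    // Each league rank visited here corresponds to one
    // Member( data , r , league_size ) constructed in exec_team above.
    Kokkos::parallel_for( policy_type( 8 , Kokkos::AUTO ),
      KOKKOS_LAMBDA( const member_type & member ) {
        (void) member.league_rank();
      } );
    long sum = 0 ;
    Kokkos::parallel_reduce( policy_type( 8 , Kokkos::AUTO ),
      KOKKOS_LAMBDA( const member_type & member , long & update ) {
        if ( 0 == member.team_rank() ) update += member.league_rank();
      } ,
      sum );
    printf( "sum of league ranks = %ld\n" , sum ); // 0+1+...+7 = 28
  }
  Kokkos::finalize();
}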
template< class FunctorType , class ReducerType, class ... Properties >
class ParallelReduce< FunctorType
@ -622,20 +642,26 @@ class ParallelReduce< FunctorType
{
private:
enum { TEAM_REDUCE_SIZE = 512 };
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... > Policy ;
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::schedule_type::type SchedTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value
, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -645,22 +671,48 @@ private:
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member , reference_type update )
typename std::enable_if< ( std::is_same< TagType , void >::value ) >::type
exec_team( const FunctorType & functor
, HostThreadTeamData & data
, reference_type & update
, const int league_rank_begin
, const int league_rank_end
, const int league_size )
{
for ( ; member.valid_static() ; member.next_static() ) {
functor( member , update );
for ( int r = league_rank_begin ; r < league_rank_end ; ) {
functor( Member( data, r , league_size ) , update );
if ( ++r < league_rank_end ) {
// Don't allow team members to lap one another
// so that they don't overwrite shared memory.
if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
}
}
}
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member , reference_type update )
typename std::enable_if< ( ! std::is_same< TagType , void >::value ) >::type
exec_team( const FunctorType & functor
, HostThreadTeamData & data
, reference_type & update
, const int league_rank_begin
, const int league_rank_end
, const int league_size )
{
const TagType t{} ;
for ( ; member.valid_static() ; member.next_static() ) {
functor( t , member , update );
const TagType t{};
for ( int r = league_rank_begin ; r < league_rank_end ; ) {
functor( t , Member( data, r , league_size ) , update );
if ( ++r < league_rank_end ) {
// Don't allow team members to lap one another
// so that they don't overwrite shared memory.
if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
}
}
}
@ -669,43 +721,88 @@ public:
inline
void execute() const
{
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t pool_reduce_size =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size );
const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
const size_t thread_local_size = 0 ; // Never shrinks
OpenMPexec::resize_thread_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
const int active = data.organize_team( m_policy.team_size() );
if ( active ) {
data.set_work_partition( m_policy.league_size()
, ( 0 < m_policy.chunk_size()
? m_policy.chunk_size()
: m_policy.team_iter() ) );
}
if ( is_dynamic ) {
// Must synchronize to make sure each team has set its
// partition before beginning the work stealing loop.
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
}
if ( active ) {
reference_type update =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
, data.pool_reduce_local() );
std::pair<int64_t,int64_t> range(0,0);
do {
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
ParallelReduce::template exec_team< WorkTag >
( m_functor
, Member( exec , m_policy , m_shmem_size, 0 )
, ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) );
( m_functor , data , update
, range.first , range.second , m_policy.league_size() );
} while ( is_dynamic && 0 <= range.first );
} else {
ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
, data.pool_reduce_local() );
}
/* END #pragma omp parallel */
{
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
data.disband_team();
}
// END #pragma omp parallel
int max_active_threads = OpenMPexec::pool_size();
if( max_active_threads > m_policy.league_size()* m_policy.team_size() )
max_active_threads = m_policy.league_size()* m_policy.team_size();
// Reduction:
for ( int i = 1 ; i < max_active_threads ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
, ptr
, OpenMPexec::get_thread_data(i)->pool_reduce_local() );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
}
}
//----------------------------------------
template< class ViewType >
inline
@ -720,7 +817,10 @@ public:
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shmem_size( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >
::value( arg_functor , arg_policy.team_size() ) )
{}
inline
@ -731,7 +831,10 @@ public:
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shmem_size( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >
::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value

View File

@ -46,6 +46,7 @@
#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
#include <impl/Kokkos_TaskQueue_impl.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -55,105 +56,46 @@ namespace Impl {
template class TaskQueue< Kokkos::OpenMP > ;
//----------------------------------------------------------------------------
class HostThreadTeamDataSingleton : private HostThreadTeamData {
private:
TaskExec< Kokkos::OpenMP >::
TaskExec()
: m_self_exec( 0 )
, m_team_exec( 0 )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( 0 )
, m_team_rank( 0 )
, m_team_size( 1 )
{
}
HostThreadTeamDataSingleton() : HostThreadTeamData()
{
Kokkos::OpenMP::memory_space space ;
const size_t num_pool_reduce_bytes = 32 ;
const size_t num_team_reduce_bytes = 32 ;
const size_t num_team_shared_bytes = 1024 ;
const size_t num_thread_local_bytes = 1024 ;
const size_t alloc_bytes =
HostThreadTeamData::scratch_size( num_pool_reduce_bytes
, num_team_reduce_bytes
, num_team_shared_bytes
, num_thread_local_bytes );
TaskExec< Kokkos::OpenMP >::
TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size )
: m_self_exec( & arg_exec )
, m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
, m_team_rank( arg_exec.pool_rank_rev() % arg_team_size )
, m_team_size( arg_team_size )
{
// This team spans
// m_self_exec->pool_rev( team_size * group_rank )
// m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
sync[0] = int64_t(0) ;
sync[1] = int64_t(0) ;
for ( int i = 0 ; i < m_team_size ; ++i ) {
m_sync_value |= int64_t(1) << (8*i);
m_sync_mask |= int64_t(3) << (8*i);
HostThreadTeamData::scratch_assign
( space.allocate( alloc_bytes )
, alloc_bytes
, num_pool_reduce_bytes
, num_team_reduce_bytes
, num_team_shared_bytes
, num_thread_local_bytes );
}
Kokkos::memory_fence();
}
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const
{
if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small");
~HostThreadTeamDataSingleton()
{
Kokkos::OpenMP::memory_space space ;
space.deallocate( HostThreadTeamData::scratch_buffer()
, HostThreadTeamData::scratch_bytes() );
}
// Use team shared memory to synchronize.
// Alternate memory locations between barriers to avoid a sequence
// of barriers overtaking one another.
public:
int64_t volatile * const sync =
((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
// This team member sets one byte within the sync variable
int8_t volatile * const sync_self =
((int8_t *) sync) + m_team_rank ;
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
*sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
while ( m_sync_value != *sync ); // wait for team to arrive
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
++m_sync_step ;
if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
m_sync_value ^= m_sync_mask ;
if ( 1000 < m_sync_step ) m_sync_step = 0 ;
static HostThreadTeamData & singleton()
{
static HostThreadTeamDataSingleton s ;
return s ;
}
}
#endif
};
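HostThreadTeamDataSingleton above relies on a function-local static (a Meyers singleton): the scratch block is allocated on first use and deallocated at program exit, with C++11 guaranteeing thread-safe initialization. The bare pattern, as a minimal generic sketch (names are illustrative, not from this commit):
#include <cstdio>
class ScratchSingleton {
private:
  ScratchSingleton()  { printf( "allocate once, on first use\n" ) ; }
  ~ScratchSingleton() { printf( "deallocate at program exit\n" ) ; }
public:
  static ScratchSingleton & singleton() {
    static ScratchSingleton s ; // constructed on first call, thread-safe in C++11
    return s ;
  }
};
int main() {
  ScratchSingleton::singleton(); // first call constructs
  ScratchSingleton::singleton(); // later calls return the same object
}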
//----------------------------------------------------------------------------
@ -163,123 +105,165 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using PoolExec = Kokkos::Impl::OpenMPexec ;
using Member = TaskExec< execution_space > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
static task_root_type * const end =
(task_root_type *) task_root_type::EndTag ;
// Required: team_size <= 8
HostThreadTeamData & team_data_single =
HostThreadTeamDataSingleton::singleton();
const int team_size = PoolExec::pool_size(2); // Threads per core
// const int team_size = PoolExec::pool_size(1); // Threads per NUMA
const int team_size = Impl::OpenMPexec::pool_size(2); // Threads per core
// const int team_size = Impl::OpenMPexec::pool_size(1); // Threads per NUMA
#if 0
fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size );
fflush(stdout);
#endif
if ( 8 < team_size ) {
Kokkos::abort("TaskQueue<OpenMP> unsupported team size");
}
#pragma omp parallel
{
PoolExec & self = *PoolExec::get_thread_omp();
Impl::HostThreadTeamData & self = *Impl::OpenMPexec::get_thread_data();
Member single_exec ;
Member team_exec( self , team_size );
// Organizing threads into a team performs a barrier across the
// entire pool to insure proper initialization of the team
// rendezvous mechanism before a team rendezvous can be performed.
// Team shared memory
task_root_type * volatile * const task_shared =
(task_root_type **) team_exec.m_team_exec->scratch_thread();
if ( self.organize_team( team_size ) ) {
// Barrier across entire OpenMP thread pool to ensure initialization
#pragma omp barrier
Member single_exec( team_data_single );
Member team_exec( self );
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) running\n"
, self.pool_rank()
, self.pool_size()
, team_exec.team_rank()
, team_exec.team_size()
, team_exec.league_rank()
, team_exec.league_size()
);
fflush(stdout);
#endif
// Loop until all queues are empty and no tasks in flight
do {
task_root_type * task = 0 ;
do {
// Each team lead attempts to acquire either a thread team task
// or a single thread task for the team.
if ( 0 == team_exec.team_rank() ) {
bool leader_loop = false ;
do {
if ( 0 != task && end != task ) {
// team member #0 completes the previously executed task,
// completion may delete the task
queue->complete( task );
}
// If 0 == m_ready_count then set task = 0
task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Attempt to acquire a task
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
}
}
// Team lead broadcasts the acquired task to team members:
// If tasks are still executing
// and no task could be acquired,
// then continue this leader loop.
leader_loop = end == task ;
if ( 1 < team_exec.team_size() ) {
if ( ( ! leader_loop ) &&
( 0 != task ) &&
( task_root_type::TaskSingle == task->m_task_type ) ) {
if ( 0 == team_exec.team_rank() ) *task_shared = task ;
// Fence to be sure task_shared is stored before the barrier
Kokkos::memory_fence();
// Whole team waits for every team member to reach this statement
team_exec.team_barrier();
// Fence to be sure task_shared is stored
Kokkos::memory_fence();
task = *task_shared ;
}
// if a single thread task then execute now
#if 0
fprintf( stdout
, "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
, team_exec.m_group_rank
, team_exec.m_team_rank
, uintptr_t(task_shared)
, uintptr_t(task)
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) executing single task 0x%lx\n"
, self.pool_rank()
, self.pool_size()
, int64_t(task)
);
fflush(stdout);
#endif
if ( 0 == task ) break ; // 0 == m_ready_count
(*task->m_apply)( task , & single_exec );
if ( end == task ) {
// All team members wait for whole team to reach this statement.
// It is necessary to prevent task_shared from being updated
// before it is read by all threads.
team_exec.team_barrier();
leader_loop = true ;
}
else if ( task_root_type::TaskTeam == task->m_task_type ) {
// Thread Team Task
} while ( leader_loop );
}
// Team lead either found 0 == m_ready_count or a team task
// Team lead broadcasts the acquired task:
team_exec.team_broadcast( task , 0);
if ( 0 != task ) { // Thread Team Task
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team((%d of %d) league(%d of %d) executing team task 0x%lx\n"
, self.pool_rank()
, self.pool_size()
, team_exec.team_rank()
, team_exec.team_size()
, team_exec.league_rank()
, team_exec.league_size()
, int64_t(task)
);
fflush(stdout);
#endif
(*task->m_apply)( task , & team_exec );
// The m_apply function performs a barrier
if ( 0 == team_exec.team_rank() ) {
// team member #0 completes the task, which may delete the task
queue->complete( task );
}
}
else {
// Single Thread Task
} while( 0 != task );
if ( 0 == team_exec.team_rank() ) {
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) ending\n"
, self.pool_rank()
, self.pool_size()
, team_exec.team_rank()
, team_exec.team_size()
, team_exec.league_rank()
, team_exec.league_size()
);
fflush(stdout);
#endif
(*task->m_apply)( task , & single_exec );
queue->complete( task );
}
// All team members wait for whole team to reach this statement.
// Not necessary to complete the task.
// It is necessary to prevent task_shared from being updated
// before it is read by all threads.
team_exec.team_barrier();
}
} while(1);
self.disband_team();
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) disbanded\n"
, self.pool_rank()
, self.pool_size()
);
fflush(stdout);
#endif
}
// END #pragma omp parallel
#if 0
fprintf(stdout,"TaskQueue<OpenMP> execute %d end\n", team_size );
fflush(stdout);
#endif
}
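Stripped of the Kokkos machinery, the team-lead loop above has this shape (a schematic plain-C++ sketch; Task, Queue, and the counts are stand-ins, not the real types):
#include <cstdio>
struct Task { int id ; };
static Task End_tag ;                  // address used as the "end" sentinel:
static Task * const End = & End_tag ;  // tasks in flight, none acquired yet
struct Queue {
  int  ready_count ;                   // stand-in for queue->m_ready_count
  Task storage[3] ;
  int  next ;
  Task * try_pop()                     // stand-in for pop_ready_task()
  { return next < 3 ? & storage[ next++ ] : nullptr ; }
};
// Team-lead acquire loop: spin while tasks are in flight but none could
// be acquired; return nullptr once ready_count has reached zero.
Task * acquire( Queue & q ) {
  Task * task = End ;
  while ( End == task ) {
    task = ( 0 < q.ready_count ) ? End : nullptr ;
    if ( End == task ) {
      if ( Task * t = q.try_pop() ) task = t ;
    }
  }
  return task ;
}
int main() {
  Queue q = { 3 , { {0} , {1} , {2} } , 0 };
  for ( Task * t = acquire( q ) ; nullptr != t ; t = acquire( q ) ) {
    printf( "execute and complete task %d\n" , t->id );
    --q.ready_count ;                  // stand-in for queue->complete( task )
  }
}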
void TaskQueueSpecialization< Kokkos::OpenMP >::
@ -289,13 +273,16 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
if ( 1 == omp_get_num_threads() ) {
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec ;
HostThreadTeamData & team_data_single =
HostThreadTeamDataSingleton::singleton();
Member single_exec( team_data_single );
task_root_type * task = end ;
@ -306,7 +293,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
}
}

View File

@ -60,6 +60,7 @@ public:
using execution_space = Kokkos::OpenMP ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
// Must specify memory space
using memory_space = Kokkos::HostSpace ;
@ -70,296 +71,19 @@ public:
// Must provide task queue execution function
static void execute( queue_type * const );
// Must provide mechanism to set function pointer in
// execution space from the host process.
template< typename FunctorType >
template< typename TaskType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::OpenMP
, typename FunctorType::value_type
, FunctorType
> ;
*ptr = TaskType::apply ;
}
typename TaskType::function_type
get_function_pointer() { return TaskType::apply ; }
};
extern template class TaskQueue< Kokkos::OpenMP > ;
//----------------------------------------------------------------------------
template<>
class TaskExec< Kokkos::OpenMP >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
using PoolExec = Kokkos::Impl::OpenMPexec ;
friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ;
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
int64_t m_sync_mask ;
int64_t mutable m_sync_value ;
int mutable m_sync_step ;
int m_group_rank ; ///< Which "team" subset of thread pool
int m_team_rank ; ///< Which thread within a team
int m_team_size ;
TaskExec();
TaskExec( PoolExec & arg_exec , int arg_team_size );
void team_barrier_impl() const ;
public:
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void * team_shared() const
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
int team_shared_size() const
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
/**\brief Whole team enters this function call
* before any team member returns from
* this function call.
*/
void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
#else
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
#endif
KOKKOS_INLINE_FUNCTION
int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION
int team_size() const { return m_team_size ; }
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
TeamThreadRange
( Impl::TaskExec< Kokkos::OpenMP > & thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
}
template<typename iType1, typename iType2>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::TaskExec< Kokkos::OpenMP > >
TeamThreadRange
( Impl:: TaskExec< Kokkos::OpenMP > & thread, const iType1 & begin, const iType2 & end )
{
typedef typename std::common_type<iType1, iType2>::type iType;
return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::TaskExec< Kokkos::OpenMP > >(thread, begin, end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
ThreadVectorRange
( Impl::TaskExec< Kokkos::OpenMP > & thread
, const iType & count )
{
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
, const Lambda& lambda
)
{
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
}
}
template<typename iType, class Lambda, typename ValueType>
KOKKOS_INLINE_FUNCTION
void parallel_reduce
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
, const Lambda& lambda
, ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
if ( 1 < loop_boundaries.thread.team_size() ) {
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
loop_boundaries.thread.team_barrier();
shared[team_rank] = result;
loop_boundaries.thread.team_barrier();
// reduce across threads to thread 0
if (team_rank == 0) {
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
shared[0] += shared[i];
}
}
loop_boundaries.thread.team_barrier();
// broadcast result
initialized_result = shared[0];
}
else {
initialized_result = result ;
}
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
if ( 1 < loop_boundaries.thread.team_size() ) {
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
loop_boundaries.thread.team_barrier();
shared[team_rank] = result;
loop_boundaries.thread.team_barrier();
// reduce across threads to thread 0
if (team_rank == 0) {
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
join(shared[0], shared[i]);
}
}
loop_boundaries.thread.team_barrier();
// broadcast result
initialized_result = shared[0];
}
else {
initialized_result = result ;
}
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
}
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, local_total;
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
int team_size = loop_boundaries.thread.team_size();
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
// Intra-member scan
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
shared[team_rank] = accum;
loop_boundaries.thread.team_barrier();
// Member 0 does a scan of the accumulated totals
if (team_rank == 0) {
for( iType i = 1; i < team_size; i+=1) {
shared[i] += shared[i-1];
}
accum = 0; // Member 0 sets accum to 0 in preparation for the inter-member scan
}
loop_boundaries.thread.team_barrier();
// Inter-member scan adding in accumulated totals
if (team_rank != 0) { accum = shared[team_rank-1]; }
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
}
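To make the three phases concrete, an illustrative trace (a team of 2 threads scanning {1,2,3,4}, split as {1,2} for rank 0 and {3,4} for rank 1):
Phase 1 (intra-member): rank 0 accumulates 1+2 = 3, rank 1 accumulates 3+4 = 7 ; shared = {3,7}.
Phase 2 (member 0 only): inclusive scan of shared -> {3,10} ; rank 0 resets its accum to 0.
Phase 3 (inter-member): rank 1 starts from shared[0] = 3 ; the final exclusive prefixes are {0,1} for rank 0 and {3,6} for rank 1, i.e. the exclusive scan {0,1,3,6}.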
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda)
{
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */

View File

@ -86,7 +86,7 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
HostThreadTeamData * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
void OpenMPexec::verify_is_process( const char * const label )
{
@ -113,67 +113,110 @@ void OpenMPexec::verify_initialized( const char * const label )
}
void OpenMPexec::clear_scratch()
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void OpenMPexec::clear_thread_data()
{
const size_t member_bytes =
sizeof(int64_t) *
HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );
const int old_alloc_bytes =
m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ;
Kokkos::HostSpace space ;
#pragma omp parallel
{
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( m_pool[ rank_rev ] ) {
Record * const r = Record::get_record( m_pool[ rank_rev ] );
m_pool[ rank_rev ] = 0 ;
Record::decrement( r );
const int rank = m_map_rank[ omp_get_thread_num() ];
if ( 0 != m_pool[rank] ) {
m_pool[rank]->disband_pool();
space.deallocate( m_pool[rank] , old_alloc_bytes );
m_pool[rank] = 0 ;
}
}
/* END #pragma omp parallel */
}
void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
void OpenMPexec::resize_thread_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes )
{
enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
enum { ALLOC_EXEC = ( sizeof(OpenMPexec) + ALIGN_MASK ) & ~ALIGN_MASK };
const size_t member_bytes =
sizeof(int64_t) *
HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );
const size_t old_reduce_size = m_pool[0] ? m_pool[0]->m_scratch_reduce_end : 0 ;
const size_t old_thread_size = m_pool[0] ? m_pool[0]->m_scratch_thread_end - m_pool[0]->m_scratch_reduce_end : 0 ;
HostThreadTeamData * root = m_pool[0] ;
reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
const size_t old_pool_reduce = root ? root->pool_reduce_bytes() : 0 ;
const size_t old_team_reduce = root ? root->team_reduce_bytes() : 0 ;
const size_t old_team_shared = root ? root->team_shared_bytes() : 0 ;
const size_t old_thread_local = root ? root->thread_local_bytes() : 0 ;
const size_t old_alloc_bytes = root ? ( member_bytes + root->scratch_bytes() ) : 0 ;
// Requesting allocation and old allocation is too small:
// Allocate if any part of the old allocation is too small:
const bool allocate = ( old_reduce_size < reduce_size ) ||
( old_thread_size < thread_size );
if ( allocate ) {
if ( reduce_size < old_reduce_size ) { reduce_size = old_reduce_size ; }
if ( thread_size < old_thread_size ) { thread_size = old_thread_size ; }
}
const size_t alloc_size = allocate ? ALLOC_EXEC + reduce_size + thread_size : 0 ;
const int pool_size = m_pool_topo[0] ;
const bool allocate = ( old_pool_reduce < pool_reduce_bytes ) ||
( old_team_reduce < team_reduce_bytes ) ||
( old_team_shared < team_shared_bytes ) ||
( old_thread_local < thread_local_bytes );
if ( allocate ) {
clear_scratch();
if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
const size_t alloc_bytes =
member_bytes +
HostThreadTeamData::scratch_size( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
const int pool_size = omp_get_max_threads();
Kokkos::HostSpace space ;
#pragma omp parallel
{
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
const int rank = pool_size - ( rank_rev + 1 );
const int rank = m_map_rank[ omp_get_thread_num() ];
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( 0 != m_pool[rank] ) {
Record * const r = Record::allocate( Kokkos::HostSpace()
, "openmp_scratch"
, alloc_size );
m_pool[rank]->disband_pool();
Record::increment( r );
space.deallocate( m_pool[rank] , old_alloc_bytes );
}
m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
void * const ptr = space.allocate( alloc_bytes );
new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
m_pool[ rank ] = new( ptr ) HostThreadTeamData();
m_pool[ rank ]->
scratch_assign( ((char *)ptr) + member_bytes
, alloc_bytes
, pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
}
/* END #pragma omp parallel */
HostThreadTeamData::organize_pool( m_pool , pool_size );
}
}
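Note the grow-only semantics above: each requested size is clamped up to its previous value before reallocating, so scratch never shrinks across kernels. An illustrative sequence (made-up sizes):
call 1: pool_reduce_bytes = 256 , team_shared_bytes = 1024 -> allocate { 256 , 1024 }
call 2: pool_reduce_bytes = 64 , team_shared_bytes = 2048 -> pool_reduce clamps up to 256 ;
        reallocation happens only because team_shared grew to 2048 , giving { 256 , 2048 }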
@ -197,14 +240,14 @@ void OpenMP::initialize( unsigned thread_count ,
// Before any other call to OMP, query the maximum number of threads
// and save the value for re-initialization unit testing.
//Using omp_get_max_threads() is problematic in conjunction with
//hwloc on Intel: an initial call to the OpenMP runtime without a
//preceding parallel region will set a process mask for a single core.
//On entering the first parallel region the runtime will then bind
//threads to other cores and make the process mask the aggregate of
//the thread masks. The intent seems to be to make serial code run
//fast if you compile with OpenMP enabled but don't actually use
//parallel regions.
//static int omp_max_threads = omp_get_max_threads();
// Using omp_get_max_threads() is problematic in conjunction with
// hwloc on Intel: an initial call to the OpenMP runtime without a
// preceding parallel region will set a process mask for a single core.
// On entering the first parallel region the runtime will then bind
// threads to other cores and make the process mask the aggregate of
// the thread masks. The intent seems to be to make serial code run
// fast if you compile with OpenMP enabled but don't actually use
// parallel regions.
// static int omp_max_threads = omp_get_max_threads();
int nthreads = 0;
#pragma omp parallel
{
@ -268,8 +311,6 @@ void OpenMP::initialize( unsigned thread_count ,
// Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
// Call to 'new' may not be thread safe either.
// Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
const unsigned omp_rank = omp_get_thread_num();
const unsigned thread_r = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads()
? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
@ -286,7 +327,19 @@ void OpenMP::initialize( unsigned thread_count ,
Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
Impl::OpenMPexec::resize_scratch( 1024 , 1024 );
// New, unified host thread team data:
{
size_t pool_reduce_bytes = 32 * thread_count ;
size_t team_reduce_bytes = 32 * thread_count ;
size_t team_shared_bytes = 1024 * thread_count ;
size_t thread_local_bytes = 1024 ;
Impl::OpenMPexec::resize_thread_data( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes
);
}
}
}
@ -309,7 +362,7 @@ void OpenMP::initialize( unsigned thread_count ,
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
@ -321,7 +374,8 @@ void OpenMP::finalize()
Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
Impl::OpenMPexec::clear_scratch();
// New, unified host thread team data:
Impl::OpenMPexec::clear_thread_data();
Impl::OpenMPexec::m_pool_topo[0] = 0 ;
Impl::OpenMPexec::m_pool_topo[1] = 0 ;
@ -333,7 +387,7 @@ void OpenMP::finalize()
hwloc::unbind_this_thread();
}
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}

View File

@ -44,13 +44,22 @@
#ifndef KOKKOS_OPENMPEXEC_HPP
#define KOKKOS_OPENMPEXEC_HPP
#include <Kokkos_OpenMP.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <Kokkos_Atomic.hpp>
#include <iostream>
#include <sstream>
#include <fstream>
#include <omp.h>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
@ -60,41 +69,19 @@ namespace Impl {
class OpenMPexec {
public:
friend class Kokkos::OpenMP ;
enum { MAX_THREAD_COUNT = 4096 };
private:
static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
static int m_pool_topo[ 4 ];
static int m_map_rank[ MAX_THREAD_COUNT ];
friend class Kokkos::OpenMP ;
static HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ];
int const m_pool_rank ;
int const m_pool_rank_rev ;
int const m_scratch_exec_end ;
int const m_scratch_reduce_end ;
int const m_scratch_thread_end ;
int volatile m_barrier_state ;
// Members for dynamic scheduling
// Which thread am I stealing from currently
int m_current_steal_target;
// This thread's owned work_range
Kokkos::pair<long,long> m_work_range KOKKOS_ALIGN(16);
// Team Offset if one thread determines work_range for others
long m_team_work_index;
// Is this thread stealing (i.e. its owned work_range is exhausted)?
bool m_stealing;
OpenMPexec();
OpenMPexec( const OpenMPexec & );
OpenMPexec & operator = ( const OpenMPexec & );
static void clear_scratch();
static
void clear_thread_data();
public:
@ -108,44 +95,6 @@ public:
inline static
int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
inline static
OpenMPexec * pool_rev( int pool_rank_rev ) { return m_pool[ pool_rank_rev ]; }
inline int pool_rank() const { return m_pool_rank ; }
inline int pool_rank_rev() const { return m_pool_rank_rev ; }
inline long team_work_index() const { return m_team_work_index ; }
inline int scratch_reduce_size() const
{ return m_scratch_reduce_end - m_scratch_exec_end ; }
inline int scratch_thread_size() const
{ return m_scratch_thread_end - m_scratch_reduce_end ; }
inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
inline
void state_wait( int state )
{ Impl::spinwait( m_barrier_state , state ); }
inline
void state_set( int state ) { m_barrier_state = state ; }
~OpenMPexec() {}
OpenMPexec( const int arg_poolRank
, const int arg_scratch_exec_size
, const int arg_scratch_reduce_size
, const int arg_scratch_thread_size )
: m_pool_rank( arg_poolRank )
, m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) )
, m_scratch_exec_end( arg_scratch_exec_size )
, m_scratch_reduce_end( m_scratch_exec_end + arg_scratch_reduce_size )
, m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size )
, m_barrier_state(0)
{}
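For reference, the old per-thread allocation being removed here was a single contiguous block with running byte offsets, laid out as [ OpenMPexec | reduce | thread ]. A sketch with made-up sizes:
// Illustrative offsets only; real sizes are MEMORY_ALIGNMENT-rounded.
// arg_scratch_exec_size = 64 , reduce = 256 , thread = 1024 :
//   m_scratch_exec_end   =   64   // end of the OpenMPexec object
//   m_scratch_reduce_end =  320   // 64 + 256
//   m_scratch_thread_end = 1344   // 320 + 1024
//   scratch_reduce() == (char*) this +  64
//   scratch_thread() == (char*) this + 320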
static void finalize();
static void initialize( const unsigned team_count ,
@ -156,133 +105,20 @@ public:
static void verify_is_process( const char * const );
static void verify_initialized( const char * const );
static void resize_scratch( size_t reduce_size , size_t thread_size );
static
void resize_thread_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes );
inline static
OpenMPexec * get_thread_omp() { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
/* Dynamic Scheduling related functionality */
// Initialize the work range for this thread
inline void set_work_range(const long& begin, const long& end, const long& chunk_size) {
m_work_range.first = (begin+chunk_size-1)/chunk_size;
m_work_range.second = end>0?(end+chunk_size-1)/chunk_size:m_work_range.first;
}
// Claim an index from this thread's range from the beginning
inline long get_work_index_begin () {
Kokkos::pair<long,long> work_range_new = m_work_range;
Kokkos::pair<long,long> work_range_old = work_range_new;
if(work_range_old.first>=work_range_old.second)
return -1;
work_range_new.first+=1;
bool success = false;
while(!success) {
work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
success = ( (work_range_new == work_range_old) ||
(work_range_new.first>=work_range_new.second));
work_range_old = work_range_new;
work_range_new.first+=1;
}
if(work_range_old.first<work_range_old.second)
return work_range_old.first;
else
return -1;
}
// Claim an index from this thread's range from the end
inline long get_work_index_end () {
Kokkos::pair<long,long> work_range_new = m_work_range;
Kokkos::pair<long,long> work_range_old = work_range_new;
if(work_range_old.first>=work_range_old.second)
return -1;
work_range_new.second-=1;
bool success = false;
while(!success) {
work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
success = ( (work_range_new == work_range_old) ||
(work_range_new.first>=work_range_new.second) );
work_range_old = work_range_new;
work_range_new.second-=1;
}
if(work_range_old.first<work_range_old.second)
return work_range_old.second-1;
else
return -1;
}
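The two claim functions above are compare-and-swap races over the shared [first,second) chunk range. The same idea in standard C++ (a simplified sketch using std::atomic in place of Kokkos::atomic_compare_exchange; packing both ends into one word keeps each claim a single CAS):
#include <atomic>
#include <cstdint>
#include <cstdio>
// Pack the [first,second) chunk range into one 64-bit word so a single
// CAS can claim from either end without tearing.
static std::atomic< uint64_t > work_range( ( uint64_t(10) << 32 ) | 0 ); // [0,10)
inline uint64_t pack( uint32_t first , uint32_t second )
{ return ( uint64_t(second) << 32 ) | first ; }
// Claim one chunk index from the front; return -1 when exhausted.
long claim_front() {
  uint64_t old_val = work_range.load();
  for (;;) {
    const uint32_t first  = uint32_t( old_val );
    const uint32_t second = uint32_t( old_val >> 32 );
    if ( first >= second ) return -1 ;              // range exhausted
    if ( work_range.compare_exchange_weak( old_val , pack( first + 1 , second ) ) )
      return long( first );                         // claimed 'first'
    // CAS failed: old_val now holds the current value; retry.
  }
}
int main() {
  for ( long i = claim_front() ; -1 != i ; i = claim_front() )
    printf( "claimed chunk %ld\n" , i );
}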
// Reset the steal target
inline void reset_steal_target() {
m_current_steal_target = (m_pool_rank+1)%m_pool_topo[0];
m_stealing = false;
}
// Reset the steal target
inline void reset_steal_target(int team_size) {
m_current_steal_target = (m_pool_rank_rev+team_size);
if(m_current_steal_target>=m_pool_topo[0])
m_current_steal_target = 0;//m_pool_topo[0]-1;
m_stealing = false;
}
// Get a steal target: start with my rank + 1 and go round robin until arriving at this thread's rank
// Returns -1 if no active steal target is available
inline int get_steal_target() {
while(( m_pool[m_current_steal_target]->m_work_range.second <=
m_pool[m_current_steal_target]->m_work_range.first ) &&
(m_current_steal_target!=m_pool_rank) ) {
m_current_steal_target = (m_current_steal_target+1)%m_pool_topo[0];
}
if(m_current_steal_target == m_pool_rank)
return -1;
else
return m_current_steal_target;
}
inline int get_steal_target(int team_size) {
while(( m_pool[m_current_steal_target]->m_work_range.second <=
m_pool[m_current_steal_target]->m_work_range.first ) &&
(m_current_steal_target!=m_pool_rank_rev) ) {
if(m_current_steal_target + team_size < m_pool_topo[0])
m_current_steal_target = (m_current_steal_target+team_size);
else
m_current_steal_target = 0;
}
if(m_current_steal_target == m_pool_rank_rev)
return -1;
else
return m_current_steal_target;
}
inline long steal_work_index (int team_size = 0) {
long index = -1;
int steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
while ( (steal_target != -1) && (index == -1)) {
index = m_pool[steal_target]->get_work_index_end();
if(index == -1)
steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
}
return index;
}
// Get a work index. Claim from the owned range until it is exhausted, then steal from other threads
inline long get_work_index (int team_size = 0) {
long work_index = -1;
if(!m_stealing) work_index = get_work_index_begin();
if( work_index == -1) {
memory_fence();
m_stealing = true;
work_index = steal_work_index(team_size);
}
m_team_work_index = work_index;
memory_fence();
return work_index;
}
HostThreadTeamData * get_thread_data() noexcept
{ return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
inline static
HostThreadTeamData * get_thread_data( int i ) noexcept
{ return m_pool[i]; }
};
} // namespace Impl
@ -294,356 +130,6 @@ public:
namespace Kokkos {
namespace Impl {
class OpenMPexecTeamMember {
public:
enum { TEAM_REDUCE_SIZE = 512 };
/** \brief Thread states for team synchronization */
enum { Active = 0 , Rendezvous = 1 };
typedef Kokkos::OpenMP execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
Impl::OpenMPexec & m_exec ;
scratch_memory_space m_team_shared ;
int m_team_scratch_size[2] ;
int m_team_base_rev ;
int m_team_rank_rev ;
int m_team_rank ;
int m_team_size ;
int m_league_rank ;
int m_league_end ;
int m_league_size ;
int m_chunk_size;
int m_league_chunk_end;
Impl::OpenMPexec & m_team_lead_exec ;
int m_invalid_thread;
int m_team_alloc;
// Fan-in team threads, root of the fan-in which does not block returns true
inline
bool team_fan_in() const
{
memory_fence();
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
}
if ( m_team_rank_rev ) {
m_exec.state_set( Rendezvous );
memory_fence();
m_exec.state_wait( Rendezvous );
}
return 0 == m_team_rank_rev ;
}
inline
void team_fan_out() const
{
memory_fence();
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
memory_fence();
}
}
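// Illustrative fan pattern for team_size = 8, indexed by rev_rank:
//   rev_rank 0 waits on 1, 2, 4; rev_rank 2 waits on 3; rev_rank 4 waits on
//   5 and 6; rev_rank 6 waits on 7; odd rev_ranks wait on nobody and go
//   straight to the rendezvous, so each barrier touches O(log2(team_size))
//   peers per thread.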
public:
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& team_shmem() const
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& team_scratch(int) const
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& thread_scratch(int) const
{ return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{}
#else
{
if ( 1 < m_team_size && !m_invalid_thread) {
team_fan_in();
team_fan_out();
}
}
#endif
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast(ValueType& value, const int& thread_id) const
{
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ }
#else
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
, ValueType , void >::type type ;
type volatile * const shared_value =
((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
if ( team_rank() == thread_id ) *shared_value = value;
memory_fence();
team_barrier(); // Wait for 'thread_id' to write
value = *shared_value ;
team_barrier(); // Wait for team members to read
#endif
}
template< class ValueType, class JoinOp >
KOKKOS_INLINE_FUNCTION ValueType
team_reduce( const ValueType & value
, const JoinOp & op_in ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return ValueType(); }
#else
{
memory_fence();
typedef ValueType value_type;
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
#endif
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
, value_type , void >::type type ;
type * const local_value = ((type*) m_exec.scratch_thread());
// Set this thread's contribution
*local_value = value ;
// Fence to make sure the base team member has access:
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
type * const team_value = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
// Join to the team value:
for ( int i = 1 ; i < m_team_size ; ++i ) {
op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) );
}
memory_fence();
// The base team member may "lap" the other team members,
// copy to their local value before proceeding.
for ( int i = 1 ; i < m_team_size ; ++i ) {
*((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) = *team_value ;
}
// Fence to make sure all team members have access
memory_fence();
}
team_fan_out();
return *((type volatile const *)local_value);
}
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename ArgType >
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return ArgType(); }
#else
{
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
volatile type * const work_value = ((type*) m_exec.scratch_thread());
*work_value = value ;
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
// m_team_base[0] == highest ranking team member
// m_team_base[ m_team_size - 1 ] == lowest ranking team member
//
// 1) copy from lower to higher rank, initialize lowest rank to zero
// 2) prefix sum from lowest to highest rank, skipping lowest rank
type accum = 0 ;
if ( global_accum ) {
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
accum += val ;
}
accum = atomic_fetch_add( global_accum , accum );
}
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
const type offset = accum ;
accum += val ;
val = offset ;
}
memory_fence();
}
team_fan_out();
return *work_value ;
}
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
{ return this-> template team_scan<Type>( value , 0 ); }
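// Worked example (illustrative): with team_size = 3 and per-rank values
// { rank0: 3, rank1: 1, rank2: 2 }, team_scan returns { 0, 3, 4 } and the
// highest rank recovers the reduction total as 4 + 2 = 6.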
//----------------------------------------
// Private for the driver
private:
typedef execution_space::scratch_memory_space space ;
public:
template< class ... Properties >
inline
OpenMPexecTeamMember( Impl::OpenMPexec & exec
, const TeamPolicyInternal< OpenMP, Properties ...> & team
, const int shmem_size_L1
, const int shmem_size_L2
)
: m_exec( exec )
, m_team_shared(0,0)
, m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
, m_team_base_rev(0)
, m_team_rank_rev(0)
, m_team_rank(0)
, m_team_size( team.team_size() )
, m_league_rank(0)
, m_league_end(0)
, m_league_size( team.league_size() )
, m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() )
, m_league_chunk_end(0)
, m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) ))
, m_team_alloc( team.team_alloc())
{
const int pool_rank_rev = m_exec.pool_rank_rev();
const int pool_team_rank_rev = pool_rank_rev % team.team_alloc();
const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
const int pool_num_teams = OpenMP::thread_pool_size(0)/team.team_alloc();
const int chunks_per_team = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams);
int league_iter_end = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size;
int league_iter_begin = league_iter_end - chunks_per_team * m_chunk_size;
if (league_iter_begin < 0) league_iter_begin = 0;
if (league_iter_end>team.league_size()) league_iter_end = team.league_size();
if ((team.team_alloc()>m_team_size)?
(pool_team_rank_rev >= m_team_size):
(m_exec.pool_size() - pool_num_teams*m_team_size > m_exec.pool_rank())
)
m_invalid_thread = 1;
else
m_invalid_thread = 0;
m_team_rank_rev = pool_team_rank_rev ;
if ( pool_team_rank_rev < m_team_size && !m_invalid_thread ) {
m_team_base_rev = team.team_alloc() * pool_league_rank_rev ;
m_team_rank_rev = pool_team_rank_rev ;
m_team_rank = m_team_size - ( m_team_rank_rev + 1 );
m_league_end = league_iter_end ;
m_league_rank = league_iter_begin ;
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
0 );
}
if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
m_exec.set_work_range(m_league_rank,m_league_end,m_chunk_size);
m_exec.reset_steal_target(m_team_size);
}
}
bool valid_static() const
{
return m_league_rank < m_league_end ;
}
void next_static()
{
if ( m_league_rank < m_league_end ) {
team_barrier();
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
0);
}
m_league_rank++;
}
bool valid_dynamic() {
if(m_invalid_thread)
return false;
if ((m_league_rank < m_league_chunk_end) && (m_league_rank < m_league_size)) {
return true;
}
if ( m_team_rank_rev == 0 ) {
m_team_lead_exec.get_work_index(m_team_alloc);
}
team_barrier();
long work_index = m_team_lead_exec.team_work_index();
m_league_rank = work_index * m_chunk_size;
m_league_chunk_end = (work_index +1 ) * m_chunk_size;
if(m_league_chunk_end > m_league_size) m_league_chunk_end = m_league_size;
if(m_league_rank>=0)
return true;
return false;
}
void next_dynamic() {
if(m_invalid_thread)
return;
if ( m_league_rank < m_league_chunk_end ) {
team_barrier();
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
0);
}
m_league_rank++;
}
static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
};
template< class ... Properties >
class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
{
@ -671,8 +157,11 @@ public:
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return traits::execution_space::thread_pool_size(1); }
int team_size_max( const FunctorType & ) {
int pool_size = traits::execution_space::thread_pool_size(1);
int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
return pool_size<max_host_team_size?pool_size:max_host_team_size;
}
template< class FunctorType >
inline static
@ -702,7 +191,8 @@ private:
, const int team_size_request )
{
const int pool_size = traits::execution_space::thread_pool_size(0);
const int team_max = traits::execution_space::thread_pool_size(1);
const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size;
const int team_grain = traits::execution_space::thread_pool_size(2);
m_league_size = league_size_request ;
@ -823,7 +313,7 @@ private:
}
public:
typedef Impl::OpenMPexecTeamMember member_type ;
typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ;
};
} // namespace Impl
@ -850,216 +340,6 @@ int OpenMP::thread_pool_rank()
#endif
}
template< typename iType >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >
TeamThreadRange( const Impl::OpenMPexecTeamMember& thread, const iType& count ) {
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >( thread, count );
}
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::OpenMPexecTeamMember >
TeamThreadRange( const Impl::OpenMPexecTeamMember& thread, const iType1& begin, const iType2& end ) {
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >( thread, iType(begin), iType(end) );
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >
ThreadVectorRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember> PerTeam(const Impl::OpenMPexecTeamMember& thread) {
return Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember> PerThread(const Impl::OpenMPexecTeamMember& thread) {
return Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
/** \brief Inter-thread parallel_reduce with a user-supplied join. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as the initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. 0 for addition
* or 1 for multiplication). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = loop_boundaries.thread.team_reduce(result,join);
}
} //namespace Kokkos
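// Illustrative only -- a minimal usage sketch of the team-level constructs
// above; the view 'A' and extents 'nleague'/'nwork' are hypothetical.
#if 0
Kokkos::View<double**> A( "A" , nleague , nwork );
double total = 0 ;
Kokkos::parallel_reduce( Kokkos::TeamPolicy<>( nleague , Kokkos::AUTO ) ,
  KOKKOS_LAMBDA( const Kokkos::TeamPolicy<>::member_type & team , double & update ) {
    const int i = team.league_rank();
    double row_sum = 0 ;
    // Threads of the team cooperatively sum row i.
    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team , nwork ) ,
      [&]( const int j , double & val ) { val += A( i , j ); } , row_sum );
    // Exactly one thread per team contributes the row result.
    Kokkos::single( Kokkos::PerTeam( team ) , [&]() { update += row_sum ; } );
  } , total );
#endif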
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as the initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. 0 for addition
* or 1 for multiplication). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = result;
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,scan_val,true);
}
}
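// Illustrative only -- a caller-side sketch of the final-flag protocol for
// the scan above; 'out', 'v', and 'n' are hypothetical.
#if 0
Kokkos::parallel_scan( Kokkos::ThreadVectorRange( team , n ) ,
  [&]( const int i , double & partial , const bool final ) {
    if ( final ) out( i ) = partial ; // Exclusive prefix sum at position i.
    partial += v( i ) ;               // Contribute whether final or not.
  } );
#endif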
} // namespace Kokkos
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
if(single_struct.team_member.team_rank()==0) lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
if(single_struct.team_member.team_rank()==0) {
lambda(val);
}
single_struct.team_member.team_broadcast(val,0);
}
}
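// Illustrative only -- computing a value once per team and broadcasting it,
// using the overloads above; 'expensive_setup' is hypothetical.
#if 0
double scale = 0 ;
Kokkos::single( Kokkos::PerTeam( team ) , [&]( double & val ) {
  val = expensive_setup(); // Runs on team rank 0 only...
} , scale );               // ...then team_broadcast makes 'scale' visible
                           // to every thread of the team.
#endif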
#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
@ -1,511 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ENABLE_QTHREAD )
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <utility>
#include <Kokkos_Qthread.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Error.hpp>
// Defines to enable experimental Qthread functionality
#define QTHREAD_LOCAL_PRIORITY
#define CLONED_TASKS
#include <qthread/qthread.h>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
enum { MAXIMUM_QTHREAD_WORKERS = 1024 };
/** s_exec is indexed by the reverse rank of the workers
* for faster fan-in / fan-out lookups
* [ n - 1 , n - 2 , ... , 0 ]
*/
QthreadExec * s_exec[ MAXIMUM_QTHREAD_WORKERS ];
int s_number_shepherds = 0 ;
int s_number_workers_per_shepherd = 0 ;
int s_number_workers = 0 ;
inline
QthreadExec ** worker_exec()
{
return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL) + 1 );
}
const int s_base_size = QthreadExec::align_alloc( sizeof(QthreadExec) );
int s_worker_reduce_end = 0 ; /* End of worker reduction memory */
int s_worker_shared_end = 0 ; /* Total of worker scratch memory */
int s_worker_shared_begin = 0 ; /* Beginning of worker shared memory */
QthreadExecFunctionPointer volatile s_active_function = 0 ;
const void * volatile s_active_function_arg = 0 ;
} /* namespace */
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
int Qthread::is_initialized()
{
return Impl::s_number_workers != 0 ;
}
int Qthread::concurrency()
{
return Impl::s_number_workers_per_shepherd ;
}
int Qthread::in_parallel()
{
return Impl::s_active_function != 0 ;
}
void Qthread::initialize( int thread_count )
{
// Environment variable: QTHREAD_NUM_SHEPHERDS
// Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
// Environment variable: QTHREAD_HWPAR
{
char buffer[256];
snprintf(buffer,sizeof(buffer),"QTHREAD_HWPAR=%d",thread_count);
putenv(buffer);
}
const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
( thread_count == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) &&
( thread_count == qthread_num_workers() );
bool ok_symmetry = true ;
if ( ok_init ) {
Impl::s_number_shepherds = qthread_num_shepherds();
Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
Impl::s_number_workers = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ;
for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) {
ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) );
}
}
if ( ! ok_init || ! ok_symmetry ) {
std::ostringstream msg ;
msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD);
msg << " : qthread_num_workers = " << qthread_num_workers();
if ( ! ok_symmetry ) {
msg << " : qthread_num_workers_local = {" ;
for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) {
msg << " " << qthread_num_workers_local(i) ;
}
msg << " }" ;
}
Impl::s_number_workers = 0 ;
Impl::s_number_shepherds = 0 ;
Impl::s_number_workers_per_shepherd = 0 ;
if ( ok_init ) { qthread_finalize(); }
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
Impl::QthreadExec::resize_worker_scratch( 256 , 256 );
// Initialize the lock array used for arbitrarily-sized atomics
Impl::init_lock_array_host_space();
}
void Qthread::finalize()
{
Impl::QthreadExec::clear_workers();
if ( Impl::s_number_workers ) {
qthread_finalize();
}
Impl::s_number_workers = 0 ;
Impl::s_number_shepherds = 0 ;
Impl::s_number_workers_per_shepherd = 0 ;
}
void Qthread::print_configuration( std::ostream & s , const bool detail )
{
s << "Kokkos::Qthread {"
<< " num_shepherds(" << Impl::s_number_shepherds << ")"
<< " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
<< " }" << std::endl ;
}
Qthread & Qthread::instance( int )
{
static Qthread q ;
return q ;
}
void Qthread::fence()
{
}
int Qthread::shepherd_size() const { return Impl::s_number_shepherds ; }
int Qthread::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd ; }
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
aligned_t driver_exec_all( void * arg )
{
QthreadExec & exec = **worker_exec();
(*s_active_function)( exec , s_active_function_arg );
/*
fprintf( stdout
, "QthreadExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
, exec.worker_rank()
, exec.worker_size()
, exec.shepherd_rank()
, exec.shepherd_size()
, exec.shepherd_worker_rank()
, exec.shepherd_worker_size()
);
fflush(stdout);
*/
return 0 ;
}
aligned_t driver_resize_worker_scratch( void * arg )
{
static volatile int lock_begin = 0 ;
static volatile int lock_end = 0 ;
QthreadExec ** const exec = worker_exec();
//----------------------------------------
// Serialize allocation for thread safety
while ( ! atomic_compare_exchange_strong( & lock_begin , 0 , 1 ) ); // Spin wait to claim lock
const bool ok = 0 == *exec ;
if ( ok ) { *exec = (QthreadExec *) malloc( s_base_size + s_worker_shared_end ); }
lock_begin = 0 ; // release lock
if ( ok ) { new( *exec ) QthreadExec(); }
//----------------------------------------
// Wait for all calls to complete to ensure that each worker has executed.
if ( s_number_workers == 1 + atomic_fetch_add( & lock_end , 1 ) ) { lock_end = 0 ; }
while ( lock_end );
/*
fprintf( stdout
, "QthreadExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
, (**exec).worker_rank()
, (**exec).worker_size()
, (**exec).shepherd_rank()
, (**exec).shepherd_size()
, (**exec).shepherd_worker_rank()
, (**exec).shepherd_worker_size()
);
fflush(stdout);
*/
//----------------------------------------
if ( ! ok ) {
fprintf( stderr , "Kokkos::QthreadExec resize failed\n" );
fflush( stderr );
}
return 0 ;
}
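// Illustrative only -- the serialize-then-rendezvous idiom above reduced to
// a standalone std::atomic sketch; all names are hypothetical.
#if 0
#include <atomic>

std::atomic<int> lock_begin( 0 ) , lock_end( 0 ) ;

void serialized_section( const int num_workers )
{
  int expected = 0 ;
  // Spin until this worker claims the lock; one worker at a time proceeds.
  while ( ! lock_begin.compare_exchange_strong( expected , 1 ) ) expected = 0 ;
  /* ... serialized work, e.g. the per-worker allocation above ... */
  lock_begin.store( 0 ); // Release the lock.

  // Counter barrier: the last arrival resets the counter, everyone else
  // spins until that reset is observed.
  if ( num_workers == 1 + lock_end.fetch_add( 1 ) ) lock_end.store( 0 );
  while ( lock_end.load() != 0 ) { /* spin */ }
}
#endif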
void verify_is_process( const char * const label , bool not_active = false )
{
const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local(NULL);
const bool is_active = not_active && ( s_active_function || s_active_function_arg );
if ( not_process || is_active ) {
std::string msg( label );
msg.append( " : FAILED" );
if ( not_process ) msg.append(" : not called by main process");
if ( is_active ) msg.append(" : parallel execution in progress");
Kokkos::Impl::throw_runtime_exception( msg );
}
}
}
int QthreadExec::worker_per_shepherd()
{
return s_number_workers_per_shepherd ;
}
QthreadExec::QthreadExec()
{
const int shepherd_rank = qthread_shep();
const int shepherd_worker_rank = qthread_worker_local(NULL);
const int worker_rank = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank ;
m_worker_base = s_exec ;
m_shepherd_base = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
m_scratch_alloc = ( (unsigned char *) this ) + s_base_size ;
m_reduce_end = s_worker_reduce_end ;
m_shepherd_rank = shepherd_rank ;
m_shepherd_size = s_number_shepherds ;
m_shepherd_worker_rank = shepherd_worker_rank ;
m_shepherd_worker_size = s_number_workers_per_shepherd ;
m_worker_rank = worker_rank ;
m_worker_size = s_number_workers ;
m_worker_state = QthreadExec::Active ;
}
void QthreadExec::clear_workers()
{
for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
QthreadExec * const exec = s_exec[iwork] ;
s_exec[iwork] = 0 ;
free( exec );
}
}
void QthreadExec::shared_reset( Qthread::scratch_memory_space & space )
{
new( & space )
Qthread::scratch_memory_space(
((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin ,
s_worker_shared_end - s_worker_shared_begin
);
}
void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
{
const int exec_all_reduce_alloc = align_alloc( reduce_size );
const int shepherd_scan_alloc = align_alloc( 8 );
const int shepherd_shared_end = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
if ( s_worker_reduce_end < exec_all_reduce_alloc ||
s_worker_shared_end < shepherd_shared_end ) {
/*
fprintf( stdout , "QthreadExec::resize\n");
fflush(stdout);
*/
// Clear current worker memory before allocating new worker memory
clear_workers();
// Increase the buffers to an aligned allocation
s_worker_reduce_end = exec_all_reduce_alloc ;
s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
s_worker_shared_end = shepherd_shared_end ;
// Need to query which shepherd this main 'process' is running on.
const int main_shep = qthread_shep();
// Have each worker resize its memory for proper first-touch
#if 0
for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
}}
#else
// If this function is used before the 'qthread.task_policy' unit test runs,
// that unit test fails with a seg-fault within libqthread.so.
for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
if ( num_clone ) {
const int ret = qthread_fork_clones_to_local_priority
( driver_resize_worker_scratch /* function */
, NULL /* function data block */
, NULL /* pointer to return value feb */
, jshep /* shepherd number */
, num_clone - 1 /* number of instances - 1 */
);
assert(ret == QTHREAD_SUCCESS);
}
}
#endif
driver_resize_worker_scratch( NULL );
// Verify all workers allocated
bool ok = true ;
for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }
if ( ! ok ) {
std::ostringstream msg ;
msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
}
msg << " }" ;
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
}
}
void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
{
verify_is_process("QthreadExec::exec_all(...)",true);
/*
fprintf( stdout , "QthreadExec::exec_all\n");
fflush(stdout);
*/
s_active_function = func ;
s_active_function_arg = arg ;
// Need to query which shepherd this main 'process' is running on.
const int main_shep = qthread_shep();
#if 0
for ( int jshep = 0 , iwork = 0 ; jshep < s_number_shepherds ; ++jshep ) {
for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i , ++iwork ) {
qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
}}
#else
// If this function is used before the 'qthread.task_policy' unit test runs,
// that unit test fails with a seg-fault within libqthread.so.
for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
if ( num_clone ) {
const int ret = qthread_fork_clones_to_local_priority
( driver_exec_all /* function */
, NULL /* function data block */
, NULL /* pointer to return value feb */
, jshep /* shepherd number */
, num_clone - 1 /* number of instances - 1 */
);
assert(ret == QTHREAD_SUCCESS);
}
}
#endif
driver_exec_all( NULL );
s_active_function = 0 ;
s_active_function_arg = 0 ;
}
void * QthreadExec::exec_all_reduce_result()
{
return s_exec[0]->m_scratch_alloc ;
}
} /* namespace Impl */
} /* namespace Kokkos */
namespace Kokkos {
namespace Impl {
QthreadTeamPolicyMember::QthreadTeamPolicyMember()
: m_exec( **worker_exec() )
, m_team_shared(0,0)
, m_team_size( 1 )
, m_team_rank( 0 )
, m_league_size(1)
, m_league_end(1)
, m_league_rank(0)
{
m_exec.shared_reset( m_team_shared );
}
QthreadTeamPolicyMember::QthreadTeamPolicyMember( const QthreadTeamPolicyMember::TaskTeam & )
: m_exec( **worker_exec() )
, m_team_shared(0,0)
, m_team_size( s_number_workers_per_shepherd )
, m_team_rank( m_exec.shepherd_worker_rank() )
, m_league_size(1)
, m_league_end(1)
, m_league_rank(0)
{
m_exec.shared_reset( m_team_shared );
}
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_QTHREAD ) */
@ -1,620 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_QTHREADEXEC_HPP
#define KOKKOS_QTHREADEXEC_HPP
#include <impl/Kokkos_spinwait.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
class QthreadExec ;
typedef void (*QthreadExecFunctionPointer)( QthreadExec & , const void * );
class QthreadExec {
private:
enum { Inactive = 0 , Active = 1 };
const QthreadExec * const * m_worker_base ;
const QthreadExec * const * m_shepherd_base ;
void * m_scratch_alloc ; ///< Scratch memory [ reduce , team , shared ]
int m_reduce_end ; ///< End of scratch reduction memory
int m_shepherd_rank ;
int m_shepherd_size ;
int m_shepherd_worker_rank ;
int m_shepherd_worker_size ;
/*
* m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
* m_worker_size = m_shepherd_size * m_shepherd_worker_size
*/
int m_worker_rank ;
int m_worker_size ;
int mutable volatile m_worker_state ;
friend class Kokkos::Qthread ;
~QthreadExec();
QthreadExec( const QthreadExec & );
QthreadExec & operator = ( const QthreadExec & );
public:
QthreadExec();
/** Execute the input function on all available Qthread workers */
static void exec_all( Qthread & , QthreadExecFunctionPointer , const void * );
//----------------------------------------
/** Barrier across all workers participating in the 'exec_all' */
void exec_all_barrier() const
{
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
}
/** Barrier across workers within the shepherd with rank < team_rank */
void shepherd_barrier( const int team_size ) const
{
if ( m_shepherd_worker_rank < team_size ) {
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
}
}
//----------------------------------------
/** Reduce across all workers participating in the 'exec_all' */
template< class FunctorType , class ReducerType , class ArgTag >
inline
void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
{
typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
const QthreadExec & fan = *m_worker_base[j];
Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
ValueJoin::join( ReducerConditional::select(func , reduce) , m_scratch_alloc , fan.m_scratch_alloc );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
}
//----------------------------------------
/** Scan across all workers participating in the 'exec_all' */
template< class FunctorType , class ArgTag >
inline
void exec_all_scan( const FunctorType & func ) const
{
typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , ArgTag > ValueOps ;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
// Root thread scans across values before releasing threads
// Worker data is in reverse order, so m_worker_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
for ( int i = 1 ; i < m_worker_size ; ++i ) {
ValueOps::copy( func
, m_worker_base[i-1]->m_scratch_alloc
, m_worker_base[i]->m_scratch_alloc
);
}
ValueInit::init( func , m_worker_base[m_worker_size-1]->m_scratch_alloc );
// Join from lower ranking to higher ranking worker.
// Value at m_worker_base[m_worker_size-1] is zero, so skip adding it to m_worker_base[m_worker_size-2].
for ( int i = m_worker_size - 1 ; --i > 0 ; ) {
ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
}
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
}
//----------------------------------------
template< class Type>
inline
volatile Type * shepherd_team_scratch_value() const
{ return (volatile Type*)(((unsigned char *) m_scratch_alloc) + m_reduce_end); }
template< class Type >
inline
void shepherd_broadcast( Type & value , const int team_size , const int team_rank ) const
{
if ( m_shepherd_base ) {
Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value ; }
memory_fence();
shepherd_barrier( team_size );
value = *shared_value ;
}
}
template< class Type >
inline
Type shepherd_reduce( const int team_size , const Type & value ) const
{
*shepherd_team_scratch_value<Type>() = value ;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1 ; i < team_size ; ++i ) {
accum += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
}
for ( int i = 1 ; i < team_size ; ++i ) {
* m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
}
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
return *shepherd_team_scratch_value<Type>();
}
template< class JoinOp >
inline
typename JoinOp::value_type
shepherd_reduce( const int team_size
, const typename JoinOp::value_type & value
, const JoinOp & op ) const
{
typedef typename JoinOp::value_type Type ;
*shepherd_team_scratch_value<Type>() = value ;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1 ; i < team_size ; ++i ) {
op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
}
for ( int i = 1 ; i < team_size ; ++i ) {
* m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
}
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
return *shepherd_team_scratch_value<Type>();
}
template< class Type >
inline
Type shepherd_scan( const int team_size
, const Type & value
, Type * const global_value = 0 ) const
{
*shepherd_team_scratch_value<Type>() = value ;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
// Root thread scans across values before releasing threads
// Worker data is in reverse order, so m_shepherd_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1 ; i < team_size ; ++i ) {
const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
accum += tmp ;
* m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ;
}
* m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
global_value ? atomic_fetch_add( global_value , accum ) : 0 ;
// Join from lower ranking to higher ranking worker.
for ( int i = team_size ; --i ; ) {
* m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
}
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
return *shepherd_team_scratch_value<Type>();
}
//----------------------------------------
static inline
int align_alloc( int size )
{
enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64 bytes */ };
enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK ;
}
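// Worked example (illustrative): align_alloc(100) = (100 + 63) & ~63 = 128,
// while align_alloc(64) stays 64 -- sizes round up to the 64-byte grain.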
void shared_reset( Qthread::scratch_memory_space & );
void * exec_all_reduce_value() const { return m_scratch_alloc ; }
static void * exec_all_reduce_result();
static void resize_worker_scratch( const int reduce_size , const int shared_size );
static void clear_workers();
//----------------------------------------
inline int worker_rank() const { return m_worker_rank ; }
inline int worker_size() const { return m_worker_size ; }
inline int shepherd_worker_rank() const { return m_shepherd_worker_rank ; }
inline int shepherd_worker_size() const { return m_shepherd_worker_size ; }
inline int shepherd_rank() const { return m_shepherd_rank ; }
inline int shepherd_size() const { return m_shepherd_size ; }
static int worker_per_shepherd();
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class QthreadTeamPolicyMember {
private:
typedef Kokkos::Qthread execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
Impl::QthreadExec & m_exec ;
scratch_memory_space m_team_shared ;
const int m_team_size ;
const int m_team_rank ;
const int m_league_size ;
const int m_league_end ;
int m_league_rank ;
public:
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_shmem() const { return m_team_shared ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{}
#else
{ m_exec.shepherd_barrier( m_team_size ); }
#endif
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value , int rank ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_broadcast<Type>( value , m_team_size , rank ); }
#endif
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_reduce<Type>( m_team_size , value ); }
#endif
template< typename JoinOp >
KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return typename JoinOp::value_type(); }
#else
{ return m_exec.template shepherd_reduce<JoinOp>( m_team_size , value , op ); }
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_scan<Type>( m_team_size , value ); }
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_scan<Type>( m_team_size , value , global_accum ); }
#endif
//----------------------------------------
// Private driver for task-team parallel
struct TaskTeam {};
QthreadTeamPolicyMember();
explicit QthreadTeamPolicyMember( const TaskTeam & );
//----------------------------------------
// Private for the driver: ( for ( member_type i(exec,team); i ; i.next_team() ) { ... } )
// Initialize
template< class ... Properties >
QthreadTeamPolicyMember( Impl::QthreadExec & exec
, const Kokkos::Impl::TeamPolicyInternal<Qthread,Properties...> & team )
: m_exec( exec )
, m_team_shared(0,0)
, m_team_size( team.m_team_size )
, m_team_rank( exec.shepherd_worker_rank() )
, m_league_size( team.m_league_size )
, m_league_end( team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
, m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
{
m_exec.shared_reset( m_team_shared );
}
// Continue
operator bool () const { return m_league_rank < m_league_end ; }
// iterate
void next_team() { ++m_league_rank ; m_exec.shared_reset( m_team_shared ); }
};
template< class ... Properties >
class TeamPolicyInternal< Kokkos::Qthread , Properties ... >
: public PolicyTraits< Properties... >
{
private:
const int m_league_size ;
const int m_team_size ;
const int m_shepherd_iter ;
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicyInternal execution_policy ;
typedef Qthread execution_space ;
typedef PolicyTraits< Properties ... > traits ;
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return Qthread::instance().shepherd_worker_size(); }
template< class FunctorType >
static int team_size_recommended( const FunctorType & f )
{ return team_size_max( f ); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType & f , const int& )
{ return team_size_max( f ); }
//----------------------------------------
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
// One active team per shepherd
TeamPolicyInternal( Kokkos::Qthread & q
, const int league_size
, const int team_size
, const int /* vector_length */ = 0
)
: m_league_size( league_size )
, m_team_size( team_size < q.shepherd_worker_size()
? team_size : q.shepherd_worker_size() )
, m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
{
}
// One active team per shepherd
TeamPolicyInternal( const int league_size
, const int team_size
, const int /* vector_length */ = 0
)
: m_league_size( league_size )
, m_team_size( team_size < Qthread::instance().shepherd_worker_size()
? team_size : Qthread::instance().shepherd_worker_size() )
, m_shepherd_iter( ( league_size + Qthread::instance().shepherd_size() - 1 ) / Qthread::instance().shepherd_size() )
{
}
typedef Impl::QthreadTeamPolicyMember member_type ;
friend class Impl::QthreadTeamPolicyMember ;
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_QTHREADEXEC_HPP */
@ -0,0 +1,519 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <utility>
#include <Kokkos_Qthreads.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Error.hpp>
// Defines to enable experimental Qthreads functionality.
//#define QTHREAD_LOCAL_PRIORITY
//#define CLONED_TASKS
//#include <qthread.h>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
enum { MAXIMUM_QTHREADS_WORKERS = 1024 };
/** s_exec is indexed by the reverse rank of the workers
* for faster fan-in / fan-out lookups
* [ n - 1, n - 2, ..., 0 ]
*/
QthreadsExec * s_exec[ MAXIMUM_QTHREADS_WORKERS ];
int s_number_shepherds = 0;
int s_number_workers_per_shepherd = 0;
int s_number_workers = 0;
inline
QthreadsExec ** worker_exec()
{
return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local( NULL ) + 1 );
}
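// Worked example (not part of the original source): with 2 shepherds of
// 2 workers each, s_number_workers == 4 and worker_exec() maps
//   ( shepherd 0, local 0 ) -> s_exec + 3
//   ( shepherd 0, local 1 ) -> s_exec + 2
//   ( shepherd 1, local 0 ) -> s_exec + 1
//   ( shepherd 1, local 1 ) -> s_exec + 0
// i.e. offset = s_number_workers - ( shepherd * workers_per_shepherd + local + 1 ).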
const int s_base_size = QthreadsExec::align_alloc( sizeof(QthreadsExec) );
int s_worker_reduce_end = 0; // End of worker reduction memory.
int s_worker_shared_end = 0; // Total of worker scratch memory.
int s_worker_shared_begin = 0; // Beginning of worker shared memory.
QthreadsExecFunctionPointer volatile s_active_function = 0;
const void * volatile s_active_function_arg = 0;
} // namespace
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
int Qthreads::is_initialized()
{
return Impl::s_number_workers != 0;
}
int Qthreads::concurrency()
{
return Impl::s_number_workers_per_shepherd;
}
int Qthreads::in_parallel()
{
return Impl::s_active_function != 0;
}
void Qthreads::initialize( int thread_count )
{
// Environment variable: QTHREAD_NUM_SHEPHERDS
// Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
// Environment variable: QTHREAD_HWPAR
{
static char buffer[256]; // Static storage: putenv() keeps a pointer into its argument.
snprintf( buffer, sizeof(buffer), "QTHREAD_HWPAR=%d", thread_count );
putenv( buffer );
}
const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
( thread_count == qthread_num_shepherds() * qthread_num_workers_local( NO_SHEPHERD ) ) &&
( thread_count == qthread_num_workers() );
bool ok_symmetry = true;
if ( ok_init ) {
Impl::s_number_shepherds = qthread_num_shepherds();
Impl::s_number_workers_per_shepherd = qthread_num_workers_local( NO_SHEPHERD );
Impl::s_number_workers = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd;
for ( int i = 0; ok_symmetry && i < Impl::s_number_shepherds; ++i ) {
ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local( i ) );
}
}
if ( ! ok_init || ! ok_symmetry ) {
std::ostringstream msg;
msg << "Kokkos::Qthreads::initialize(" << thread_count << ") FAILED";
msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local( NO_SHEPHERD );
msg << " : qthread_num_workers = " << qthread_num_workers();
if ( ! ok_symmetry ) {
msg << " : qthread_num_workers_local = {";
for ( int i = 0; i < Impl::s_number_shepherds; ++i ) {
msg << " " << qthread_num_workers_local( i );
}
msg << " }";
}
Impl::s_number_workers = 0;
Impl::s_number_shepherds = 0;
Impl::s_number_workers_per_shepherd = 0;
if ( ok_init ) { qthread_finalize(); }
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
Impl::QthreadsExec::resize_worker_scratch( 256, 256 );
// Initialize the lock array used for arbitrarily sized atomics.
Impl::init_lock_array_host_space();
}
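// Minimal usage sketch (illustrative, not part of the original source),
// assuming QTHREAD_NUM_SHEPHERDS * QTHREAD_NUM_WORKERS_PER_SHEP == 16:
//
//   Kokkos::Qthreads::initialize( 16 );  // exports QTHREAD_HWPAR=16
//   // ... dispatch work to the Kokkos::Qthreads execution space ...
//   Kokkos::Qthreads::finalize();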
void Qthreads::finalize()
{
Impl::QthreadsExec::clear_workers();
if ( Impl::s_number_workers ) {
qthread_finalize();
}
Impl::s_number_workers = 0;
Impl::s_number_shepherds = 0;
Impl::s_number_workers_per_shepherd = 0;
}
void Qthreads::print_configuration( std::ostream & s, const bool detail )
{
s << "Kokkos::Qthreads {"
<< " num_shepherds(" << Impl::s_number_shepherds << ")"
<< " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
<< " }" << std::endl;
}
Qthreads & Qthreads::instance( int )
{
static Qthreads q;
return q;
}
void Qthreads::fence()
{
}
int Qthreads::shepherd_size() const { return Impl::s_number_shepherds; }
int Qthreads::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd; }
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
aligned_t driver_exec_all( void * arg )
{
QthreadsExec & exec = **worker_exec();
(*s_active_function)( exec, s_active_function_arg );
/*
fprintf( stdout
, "QthreadsExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
, exec.worker_rank()
, exec.worker_size()
, exec.shepherd_rank()
, exec.shepherd_size()
, exec.shepherd_worker_rank()
, exec.shepherd_worker_size()
);
fflush(stdout);
*/
return 0;
}
aligned_t driver_resize_worker_scratch( void * arg )
{
static volatile int lock_begin = 0;
static volatile int lock_end = 0;
QthreadsExec ** const exec = worker_exec();
//----------------------------------------
// Serialize allocation for thread safety.
while ( ! atomic_compare_exchange_strong( & lock_begin, 0, 1 ) ); // Spin wait to claim lock.
const bool ok = 0 == *exec;
if ( ok ) { *exec = (QthreadsExec *) malloc( s_base_size + s_worker_shared_end ); }
lock_begin = 0; // Release lock.
if ( ok ) { new( *exec ) QthreadsExec(); }
//----------------------------------------
// Wait for all calls to complete to ensure that each worker has executed.
if ( s_number_workers == 1 + atomic_fetch_add( & lock_end, 1 ) ) { lock_end = 0; }
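// Counting barrier (explanatory note, not in the original source): each worker
// increments lock_end; the last to arrive resets it to zero, which releases
// every worker spinning in the while loop below.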
while ( lock_end );
/*
fprintf( stdout
, "QthreadsExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
, (**exec).worker_rank()
, (**exec).worker_size()
, (**exec).shepherd_rank()
, (**exec).shepherd_size()
, (**exec).shepherd_worker_rank()
, (**exec).shepherd_worker_size()
);
fflush(stdout);
*/
//----------------------------------------
if ( ! ok ) {
fprintf( stderr, "Kokkos::QthreadsExec resize failed\n" );
fflush( stderr );
}
return 0;
}
void verify_is_process( const char * const label, bool not_active = false )
{
const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local( NULL );
const bool is_active = not_active && ( s_active_function || s_active_function_arg );
if ( not_process || is_active ) {
std::string msg( label );
msg.append( " : FAILED" );
if ( not_process ) msg.append(" : not called by main process");
if ( is_active ) msg.append(" : parallel execution in progress");
Kokkos::Impl::throw_runtime_exception( msg );
}
}
} // namespace
int QthreadsExec::worker_per_shepherd()
{
return s_number_workers_per_shepherd;
}
QthreadsExec::QthreadsExec()
{
const int shepherd_rank = qthread_shep();
const int shepherd_worker_rank = qthread_worker_local( NULL );
const int worker_rank = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank;
m_worker_base = s_exec;
m_shepherd_base = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
m_scratch_alloc = ( (unsigned char *) this ) + s_base_size;
m_reduce_end = s_worker_reduce_end;
m_shepherd_rank = shepherd_rank;
m_shepherd_size = s_number_shepherds;
m_shepherd_worker_rank = shepherd_worker_rank;
m_shepherd_worker_size = s_number_workers_per_shepherd;
m_worker_rank = worker_rank;
m_worker_size = s_number_workers;
m_worker_state = QthreadsExec::Active;
}
void QthreadsExec::clear_workers()
{
for ( int iwork = 0; iwork < s_number_workers; ++iwork ) {
QthreadsExec * const exec = s_exec[iwork];
s_exec[iwork] = 0;
free( exec );
}
}
void QthreadsExec::shared_reset( Qthreads::scratch_memory_space & space )
{
new( & space )
Qthreads::scratch_memory_space(
((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin,
s_worker_shared_end - s_worker_shared_begin
);
}
void QthreadsExec::resize_worker_scratch( const int reduce_size, const int shared_size )
{
const int exec_all_reduce_alloc = align_alloc( reduce_size );
const int shepherd_scan_alloc = align_alloc( 8 );
const int shepherd_shared_end = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
if ( s_worker_reduce_end < exec_all_reduce_alloc ||
s_worker_shared_end < shepherd_shared_end ) {
/*
fprintf( stdout, "QthreadsExec::resize\n");
fflush(stdout);
*/
// Clear current worker memory before allocating new worker memory.
clear_workers();
// Increase the buffers to an aligned allocation.
s_worker_reduce_end = exec_all_reduce_alloc;
s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc;
s_worker_shared_end = shepherd_shared_end;
// Need to query which shepherd this main 'process' is running on.
const int main_shep = qthread_shep();
// Have each worker resize its memory for proper first-touch.
#if 0
for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
for ( int i = jshep != main_shep ? 0 : 1; i < s_number_workers_per_shepherd; ++i ) {
qthread_fork_to( driver_resize_worker_scratch, NULL, NULL, jshep );
}
}
#else
// If this function is used before the 'qthreads.task_policy' unit test,
// the 'qthreads.task_policy' unit test fails with a seg-fault within libqthread.so.
for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1;
if ( num_clone ) {
const int ret = qthread_fork_clones_to_local_priority
( driver_resize_worker_scratch // Function
, NULL // Function data block
, NULL // Pointer to return value feb
, jshep // Shepherd number
, num_clone - 1 // Number of instances - 1
);
assert( ret == QTHREAD_SUCCESS );
}
}
#endif
driver_resize_worker_scratch( NULL );
// Verify all workers allocated.
bool ok = true;
for ( int iwork = 0; ok && iwork < s_number_workers; ++iwork ) { ok = 0 != s_exec[iwork]; }
if ( ! ok ) {
std::ostringstream msg;
msg << "Kokkos::Impl::QthreadsExec::resize : FAILED for workers {";
for ( int iwork = 0; iwork < s_number_workers; ++iwork ) {
if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
}
msg << " }";
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
}
}
void QthreadsExec::exec_all( Qthreads &, QthreadsExecFunctionPointer func, const void * arg )
{
verify_is_process("QthreadsExec::exec_all(...)",true);
/*
fprintf( stdout, "QthreadsExec::exec_all\n");
fflush(stdout);
*/
s_active_function = func;
s_active_function_arg = arg;
// Need to query which shepherd this main 'process' is running on.
const int main_shep = qthread_shep();
#if 0
for ( int jshep = 0, iwork = 0; jshep < s_number_shepherds; ++jshep ) {
for ( int i = jshep != main_shep ? 0 : 1; i < s_number_workers_per_shepherd; ++i, ++iwork ) {
qthread_fork_to( driver_exec_all, NULL, NULL, jshep );
}
}
#else
// If this function is used before the 'qthreads.task_policy' unit test,
// the 'qthreads.task_policy' unit test fails with a seg-fault within libqthread.so.
for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1;
if ( num_clone ) {
const int ret = qthread_fork_clones_to_local_priority
( driver_exec_all // Function
, NULL // Function data block
, NULL // Pointer to return value feb
, jshep // Shepherd number
, num_clone - 1 // Number of instances - 1
);
assert(ret == QTHREAD_SUCCESS);
}
}
#endif
driver_exec_all( NULL );
s_active_function = 0;
s_active_function_arg = 0;
}
void * QthreadsExec::exec_all_reduce_result()
{
return s_exec[0]->m_scratch_alloc;
}
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
QthreadsTeamPolicyMember::QthreadsTeamPolicyMember()
: m_exec( **worker_exec() )
, m_team_shared( 0, 0 )
, m_team_size( 1 )
, m_team_rank( 0 )
, m_league_size( 1 )
, m_league_end( 1 )
, m_league_rank( 0 )
{
m_exec.shared_reset( m_team_shared );
}
QthreadsTeamPolicyMember::QthreadsTeamPolicyMember( const QthreadsTeamPolicyMember::TaskTeam & )
: m_exec( **worker_exec() )
, m_team_shared( 0, 0 )
, m_team_size( s_number_workers_per_shepherd )
, m_team_rank( m_exec.shepherd_worker_rank() )
, m_league_size( 1 )
, m_league_end( 1 )
, m_league_rank( 0 )
{
m_exec.shared_reset( m_team_shared );
}
} // namespace Impl
} // namespace Kokkos
#endif // #if defined( KOKKOS_ENABLE_QTHREADS )

View File

@ -0,0 +1,640 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_QTHREADSEXEC_HPP
#define KOKKOS_QTHREADSEXEC_HPP
#include <impl/Kokkos_spinwait.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class QthreadsExec;
typedef void (*QthreadsExecFunctionPointer)( QthreadsExec &, const void * );
class QthreadsExec {
private:
enum { Inactive = 0, Active = 1 };
const QthreadsExec * const * m_worker_base;
const QthreadsExec * const * m_shepherd_base;
void * m_scratch_alloc; ///< Scratch memory [ reduce, team, shared ]
int m_reduce_end; ///< End of scratch reduction memory
int m_shepherd_rank;
int m_shepherd_size;
int m_shepherd_worker_rank;
int m_shepherd_worker_size;
/*
* m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
* m_worker_size = m_shepherd_size * m_shepherd_worker_size
*/
int m_worker_rank;
int m_worker_size;
int mutable volatile m_worker_state;
friend class Kokkos::Qthreads;
~QthreadsExec();
QthreadsExec( const QthreadsExec & );
QthreadsExec & operator = ( const QthreadsExec & );
public:
QthreadsExec();
/** Execute the input function on all available Qthreads workers. */
static void exec_all( Qthreads &, QthreadsExecFunctionPointer, const void * );
/** Barrier across all workers participating in the 'exec_all'. */
void exec_all_barrier() const
{
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
Impl::spinwait_while_equal( m_worker_base[j]->m_worker_state, QthreadsExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadsExec::Active;
}
}
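/* Illustrative trace (not part of the original source): with 4 workers,
 * rev_rank 1 and 3 have bit 0 set and immediately mark themselves Inactive;
 * rev_rank 2 first waits for rev_rank 3; rev_rank 0 (the root) waits for
 * rev_rank 1, then rev_rank 2, and finally re-activates its partners in
 * reverse order -- a binary fan-in / fan-out tree.
 */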
/** Barrier across workers within the shepherd with rank < team_rank. */
void shepherd_barrier( const int team_size ) const
{
if ( m_shepherd_worker_rank < team_size ) {
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
}
}
}
/** Reduce across all workers participating in the 'exec_all'. */
template< class FunctorType, class ReducerType, class ArgTag >
inline
void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
{
typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
const QthreadsExec & fan = *m_worker_base[j];
Impl::spinwait_while_equal( fan.m_worker_state, QthreadsExec::Active );
ValueJoin::join( ReducerConditional::select( func, reduce ), m_scratch_alloc, fan.m_scratch_alloc );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadsExec::Active;
}
}
/** Scan across all workers participating in the 'exec_all'. */
template< class FunctorType, class ArgTag >
inline
void exec_all_scan( const FunctorType & func ) const
{
typedef Kokkos::Impl::FunctorValueInit< FunctorType, ArgTag > ValueInit;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, ArgTag > ValueJoin;
typedef Kokkos::Impl::FunctorValueOps< FunctorType, ArgTag > ValueOps;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
Impl::spinwait_while_equal( m_worker_base[j]->m_worker_state, QthreadsExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
else {
// Root thread scans across values before releasing threads.
// Worker data is in reverse order, so m_worker_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
for ( int i = 1; i < m_worker_size; ++i ) {
ValueOps::copy( func
, m_worker_base[i-1]->m_scratch_alloc
, m_worker_base[i]->m_scratch_alloc
);
}
ValueInit::init( func, m_worker_base[m_worker_size-1]->m_scratch_alloc );
// Join from lower ranking to higher ranking worker.
// Value at m_worker_base[m_worker_size-1] is zero, so skip adding it to m_worker_base[m_worker_size-2].
for ( int i = m_worker_size - 1; --i > 0; ) {
ValueJoin::join( func, m_worker_base[i-1]->m_scratch_alloc, m_worker_base[i]->m_scratch_alloc );
}
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadsExec::Active;
}
}
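/* Illustrative trace of the root-thread scan above (not part of the original
 * source): with 3 workers holding values v0 < v1 < v2 by rank (stored in
 * reverse, so m_worker_base[0] holds v2), the copy step yields [ v1, v0, - ],
 * init zeroes the last slot to [ v1, v0, 0 ], and the join step produces
 * [ v0+v1, v0, 0 ] -- an exclusive prefix sum by rank.
 */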
//----------------------------------------
template< class Type >
inline
volatile Type * shepherd_team_scratch_value() const
{ return (volatile Type*)( ( (unsigned char *) m_scratch_alloc ) + m_reduce_end ); }
template< class Type >
inline
void shepherd_broadcast( Type & value, const int team_size, const int team_rank ) const
{
if ( m_shepherd_base ) {
Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value; }
memory_fence();
shepherd_barrier( team_size );
value = *shared_value;
}
}
template< class Type >
inline
Type shepherd_reduce( const int team_size, const Type & value ) const
{
volatile Type * const shared_value = shepherd_team_scratch_value<Type>();
*shared_value = value;
// *shepherd_team_scratch_value<Type>() = value;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
else {
Type & accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1; i < n; ++i ) {
accum += *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
}
for ( int i = 1; i < n; ++i ) {
*m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum;
}
memory_fence();
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
}
return *shepherd_team_scratch_value<Type>();
}
template< class JoinOp >
inline
typename JoinOp::value_type
shepherd_reduce( const int team_size
, const typename JoinOp::value_type & value
, const JoinOp & op ) const
{
typedef typename JoinOp::value_type Type;
volatile Type * const shared_value = shepherd_team_scratch_value<Type>();
*shared_value = value;
// *shepherd_team_scratch_value<Type>() = value;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
else {
volatile Type & accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1; i < team_size; ++i ) {
op.join( accum, *m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
}
for ( int i = 1; i < team_size; ++i ) {
*m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum;
}
memory_fence();
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
}
return *shepherd_team_scratch_value<Type>();
}
template< class Type >
inline
Type shepherd_scan( const int team_size
, const Type & value
, Type * const global_value = 0 ) const
{
*shepherd_team_scratch_value<Type>() = value;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
else {
// Root thread scans across values before releasing threads.
// Worker data is in reverse order, so m_shepherd_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
Type accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1; i < team_size; ++i ) {
const Type tmp = *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
accum += tmp;
*m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp;
}
*m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
global_value ? atomic_fetch_add( global_value, accum ) : 0;
// Join from lower ranking to higher ranking worker.
for ( int i = team_size; --i; ) {
*m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
}
memory_fence();
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
}
return *shepherd_team_scratch_value<Type>();
}
//----------------------------------------
static inline
int align_alloc( int size )
{
enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64 bytes */ };
enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK;
}
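// Examples (illustrative, not part of the original source):
//   align_alloc( 100 ) == 128 and align_alloc( 64 ) == 64,
// i.e. sizes are rounded up to the next multiple of the 64-byte grain.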
void shared_reset( Qthreads::scratch_memory_space & );
void * exec_all_reduce_value() const { return m_scratch_alloc; }
static void * exec_all_reduce_result();
static void resize_worker_scratch( const int reduce_size, const int shared_size );
static void clear_workers();
//----------------------------------------
inline int worker_rank() const { return m_worker_rank; }
inline int worker_size() const { return m_worker_size; }
inline int shepherd_worker_rank() const { return m_shepherd_worker_rank; }
inline int shepherd_worker_size() const { return m_shepherd_worker_size; }
inline int shepherd_rank() const { return m_shepherd_rank; }
inline int shepherd_size() const { return m_shepherd_size; }
static int worker_per_shepherd();
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class QthreadsTeamPolicyMember {
private:
typedef Kokkos::Qthreads execution_space;
typedef execution_space::scratch_memory_space scratch_memory_space;
Impl::QthreadsExec & m_exec;
scratch_memory_space m_team_shared;
const int m_team_size;
const int m_team_rank;
const int m_league_size;
const int m_league_end;
int m_league_rank;
public:
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_shmem() const { return m_team_shared; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
KOKKOS_INLINE_FUNCTION void team_barrier() const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{}
#else
{ m_exec.shepherd_barrier( m_team_size ); }
#endif
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value, int rank ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_broadcast<Type>( value, m_team_size, rank ); }
#endif
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_reduce<Type>( m_team_size, value ); }
#endif
template< typename JoinOp >
KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return typename JoinOp::value_type(); }
#else
{ return m_exec.template shepherd_reduce<JoinOp>( m_team_size, value, op ); }
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_scan<Type>( m_team_size, value ); }
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the league's
* parallel execution, be the scan's total. Parallel execution ordering of
* the league's teams is non-deterministic. As such the base value for each
* team's scan operation is similarly non-deterministic.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value, Type * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_scan<Type>( m_team_size, value, global_accum ); }
#endif
//----------------------------------------
// Private driver for task-team parallel.
struct TaskTeam {};
QthreadsTeamPolicyMember();
explicit QthreadsTeamPolicyMember( const TaskTeam & );
//----------------------------------------
// Private for the driver: for ( member_type i( exec, team ); i; i.next_team() ) { ... }
// Initialize.
template< class ... Properties >
QthreadsTeamPolicyMember( Impl::QthreadsExec & exec
, const Kokkos::Impl::TeamPolicyInternal< Qthreads, Properties... > & team )
: m_exec( exec )
, m_team_shared( 0, 0 )
, m_team_size( team.m_team_size )
, m_team_rank( exec.shepherd_worker_rank() )
, m_league_size( team.m_league_size )
, m_league_end( team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
, m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
{
m_exec.shared_reset( m_team_shared );
}
// Continue.
operator bool () const { return m_league_rank < m_league_end; }
// Iterate.
void next_team() { ++m_league_rank; m_exec.shared_reset( m_team_shared ); }
};
template< class ... Properties >
class TeamPolicyInternal< Kokkos::Qthreads, Properties ... >
: public PolicyTraits< Properties... >
{
private:
const int m_league_size;
const int m_team_size;
const int m_shepherd_iter;
public:
//! Tag this class as a kokkos execution policy.
typedef TeamPolicyInternal execution_policy;
typedef Qthreads execution_space;
typedef PolicyTraits< Properties ... > traits;
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return Qthreads::instance().shepherd_worker_size(); }
template< class FunctorType >
static int team_size_recommended( const FunctorType & f )
{ return team_size_max( f ); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType & f, const int& )
{ return team_size_max( f ); }
//----------------------------------------
inline int team_size() const { return m_team_size; }
inline int league_size() const { return m_league_size; }
// One active team per shepherd.
TeamPolicyInternal( Kokkos::Qthreads & q
, const int league_size
, const int team_size
, const int /* vector_length */ = 0
)
: m_league_size( league_size )
, m_team_size( team_size < q.shepherd_worker_size()
? team_size : q.shepherd_worker_size() )
, m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
{}
// TODO: Make sure this is correct.
// One active team per shepherd.
TeamPolicyInternal( Kokkos::Qthreads & q
, const int league_size
, const Kokkos::AUTO_t & /* team_size_request */
, const int /* vector_length */ = 0
)
: m_league_size( league_size )
, m_team_size( q.shepherd_worker_size() )
, m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
{}
// One active team per shepherd.
TeamPolicyInternal( const int league_size
, const int team_size
, const int /* vector_length */ = 0
)
: m_league_size( league_size )
, m_team_size( team_size < Qthreads::instance().shepherd_worker_size()
? team_size : Qthreads::instance().shepherd_worker_size() )
, m_shepherd_iter( ( league_size + Qthreads::instance().shepherd_size() - 1 ) / Qthreads::instance().shepherd_size() )
{}
// TODO: Make sure this is correct.
// One active team per shepherd.
TeamPolicyInternal( const int league_size
, const Kokkos::AUTO_t & /* team_size_request */
, const int /* vector_length */ = 0
)
: m_league_size( league_size )
, m_team_size( Qthreads::instance().shepherd_worker_size() )
, m_shepherd_iter( ( league_size + Qthreads::instance().shepherd_size() - 1 ) / Qthreads::instance().shepherd_size() )
{}
// TODO: Doesn't do anything yet. Fix this.
/** \brief Set chunk_size to a discrete value. */
inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
TeamPolicyInternal p = *this;
// p.m_chunk_size = chunk_size_;
return p;
}
typedef Impl::QthreadsTeamPolicyMember member_type;
friend class Impl::QthreadsTeamPolicyMember;
};
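// Typical construction (illustrative sketch, not part of the original source):
//
//   Kokkos::TeamPolicy< Kokkos::Qthreads > policy( league_size, Kokkos::AUTO );
//
// With Kokkos::AUTO the constructors above pick one full shepherd's worth of
// workers per team; an explicit team_size is clamped to shepherd_worker_size().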
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
#endif // #define KOKKOS_QTHREADSEXEC_HPP

View File

@ -41,8 +41,8 @@
//@HEADER
*/
#ifndef KOKKOS_QTHREAD_PARALLEL_HPP
#define KOKKOS_QTHREAD_PARALLEL_HPP
#ifndef KOKKOS_QTHREADS_PARALLEL_HPP
#define KOKKOS_QTHREADS_PARALLEL_HPP
#include <vector>
@ -51,7 +51,7 @@
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <Qthread/Kokkos_QthreadExec.hpp>
#include <Qthreads/Kokkos_QthreadsExec.hpp>
//----------------------------------------------------------------------------
@ -63,7 +63,7 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::RangePolicy< Traits ... >
, Kokkos::Qthread
, Kokkos::Qthreads
>
{
private:
@ -99,7 +99,7 @@ private:
}
// Function is called once by every concurrent thread.
static void exec( QthreadExec & exec , const void * arg )
static void exec( QthreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
@ -116,7 +116,7 @@ public:
inline
void execute() const
{
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelFor::exec , this );
}
@ -134,7 +134,7 @@ template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Qthread
, Kokkos::Qthreads
>
{
private:
@ -186,7 +186,7 @@ private:
}
}
static void exec( QthreadExec & exec , const void * arg )
static void exec( QthreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
@ -205,10 +205,10 @@ public:
inline
void execute() const
{
QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
QthreadsExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data );
@ -246,11 +246,11 @@ public:
template< class FunctorType , class ... Properties >
class ParallelFor< FunctorType
, TeamPolicy< Properties ... >
, Kokkos::Qthread >
, Kokkos::Qthreads >
{
private:
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthread , Properties ... > Policy ;
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthreads , Properties ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
@ -282,7 +282,7 @@ private:
}
}
static void exec( QthreadExec & exec , const void * arg )
static void exec( QthreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
@ -297,10 +297,10 @@ public:
inline
void execute() const
{
QthreadExec::resize_worker_scratch
QthreadsExec::resize_worker_scratch
( /* reduction memory */ 0
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelFor::exec , this );
}
ParallelFor( const FunctorType & arg_functor ,
@ -316,12 +316,12 @@ template< class FunctorType , class ReducerType , class ... Properties >
class ParallelReduce< FunctorType
, TeamPolicy< Properties... >
, ReducerType
, Kokkos::Qthread
, Kokkos::Qthreads
>
{
private:
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthread , Properties ... > Policy ;
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthreads , Properties ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
@ -365,7 +365,7 @@ private:
}
}
static void exec( QthreadExec & exec , const void * arg )
static void exec( QthreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
@ -383,13 +383,13 @@ public:
inline
void execute() const
{
QthreadExec::resize_worker_scratch
QthreadsExec::resize_worker_scratch
( /* reduction memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data );
@ -429,7 +429,7 @@ public:
template< class FunctorType , class ... Traits >
class ParallelScan< FunctorType
, Kokkos::RangePolicy< Traits ... >
, Kokkos::Qthread
, Kokkos::Qthreads
>
{
private:
@ -474,7 +474,7 @@ private:
}
}
static void exec( QthreadExec & exec , const void * arg )
static void exec( QthreadsExec & exec , const void * arg )
{
const ParallelScan & self = * ((const ParallelScan *) arg );
@ -497,8 +497,8 @@ public:
inline
void execute() const
{
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::exec , this );
QthreadsExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelScan::exec , this );
}
ParallelScan( const FunctorType & arg_functor
@ -521,37 +521,37 @@ namespace Kokkos {
template< typename iType >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >
TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread, const iType& count )
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >
TeamThreadRange( const Impl::QthreadsTeamPolicyMember& thread, const iType& count )
{
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >( thread, count );
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >( thread, count );
}
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::QthreadTeamPolicyMember >
TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread, const iType1 & begin, const iType2 & end )
Impl::QthreadsTeamPolicyMember >
TeamThreadRange( const Impl::QthreadsTeamPolicyMember& thread, const iType1 & begin, const iType2 & end )
{
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >( thread, iType(begin), iType(end) );
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >( thread, iType(begin), iType(end) );
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >
ThreadVectorRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >(thread,count);
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >
ThreadVectorRange(const Impl::QthreadsTeamPolicyMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember> PerTeam(const Impl::QthreadTeamPolicyMember& thread) {
return Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember> PerTeam(const Impl::QthreadsTeamPolicyMember& thread) {
return Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::QthreadTeamPolicyMember& thread) {
return Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember> PerThread(const Impl::QthreadsTeamPolicyMember& thread) {
return Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>(thread);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
@ -560,7 +560,7 @@ Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::Qt
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
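// Usage sketch (illustrative, not part of the original source): inside a
// team-policy functor where 'member' is a QthreadsTeamPolicyMember,
//
//   parallel_for( TeamThreadRange( member, N ), [&]( const int i ) {
//     // work item i, distributed across the team's threads
//   });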
@ -571,7 +571,7 @@ void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qthrea
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
@ -595,7 +595,7 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qth
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
@ -615,7 +615,7 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qth
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
@ -630,7 +630,7 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Qthr
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
@ -652,7 +652,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Q
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
@ -679,7 +679,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Q
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
@ -697,25 +697,25 @@ void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Qth
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
void single(const Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda) {
if(single_struct.team_member.team_rank()==0) lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
void single(const Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
if(single_struct.team_member.team_rank()==0) {
lambda(val);
}
@ -724,4 +724,4 @@ void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& singl
} // namespace Kokkos
#endif /* #define KOKKOS_QTHREAD_PARALLEL_HPP */
#endif /* #define KOKKOS_QTHREADS_PARALLEL_HPP */

View File

@ -0,0 +1,320 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::Qthreads > ;
//----------------------------------------------------------------------------
TaskExec< Kokkos::Qthreads >::TaskExec()
: m_self_exec( 0 ),
m_team_exec( 0 ),
m_sync_mask( 0 ),
m_sync_value( 0 ),
m_sync_step( 0 ),
m_group_rank( 0 ),
m_team_rank( 0 ),
m_team_size( 1 )
{}
TaskExec< Kokkos::Qthreads >::
TaskExec( Kokkos::Impl::QthreadsExec & arg_exec, int const arg_team_size )
: m_self_exec( & arg_exec ),
m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) ),
m_sync_mask( 0 ),
m_sync_value( 0 ),
m_sync_step( 0 ),
m_group_rank( arg_exec.pool_rank_rev() / arg_team_size ),
m_team_rank( arg_exec.pool_rank_rev() % arg_team_size ),
m_team_size( arg_team_size )
{
// This team spans
// m_self_exec->pool_rev( team_size * group_rank )
// m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
sync[0] = int64_t(0) ;
sync[1] = int64_t(0) ;
for ( int i = 0 ; i < m_team_size ; ++i ) {
m_sync_value |= int64_t(1) << (8*i);
m_sync_mask |= int64_t(3) << (8*i);
}
Kokkos::memory_fence();
}
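/* Illustrative layout (not part of the original source): for m_team_size == 4
 * the loop above produces m_sync_value == 0x01010101 and
 * m_sync_mask == 0x03030303, i.e. one byte of the 64-bit sync word per
 * team member (hence the team_size <= 8 requirement in execute()).
 */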
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void TaskExec< Kokkos::Qthreads >::team_barrier() const
{
if ( 1 < m_team_size ) {
if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
Kokkos::abort("TaskQueue<Qthreads> scratch_reduce memory too small");
}
// Use team shared memory to synchronize.
// Alternate memory locations between barriers to avoid a sequence
// of barriers overtaking one another.
int64_t volatile * const sync =
((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
// This team member sets one byte within the sync variable
int8_t volatile * const sync_self =
((int8_t *) sync) + m_team_rank ;
#if 0
fprintf( stdout,
"barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n",
m_group_rank,
m_team_rank,
m_sync_step,
m_sync_value,
*sync
);
fflush(stdout);
#endif
*sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
while ( m_sync_value != *sync ); // wait for team to arrive
#if 0
fprintf( stdout,
"barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n",
m_group_rank,
m_team_rank,
m_sync_step,
m_sync_value,
*sync
);
fflush(stdout);
#endif
++m_sync_step ;
if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
m_sync_value ^= m_sync_mask ;
if ( 1000 < m_sync_step ) m_sync_step = 0 ;
}
}
}
#endif
//----------------------------------------------------------------------------
void TaskQueueSpecialization< Kokkos::Qthreads >::execute
( TaskQueue< Kokkos::Qthreads > * const queue )
{
using execution_space = Kokkos::Qthreads ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space, void, void > ;
using PoolExec = Kokkos::Impl::QthreadsExec ;
using Member = TaskExec< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
// Required: team_size <= 8
const int team_size = PoolExec::pool_size(2); // Threads per core
// const int team_size = PoolExec::pool_size(1); // Threads per NUMA
if ( 8 < team_size ) {
Kokkos::abort("TaskQueue<Qthreads> unsupported team size");
}
#pragma omp parallel
{
PoolExec & self = *PoolExec::get_thread_omp();
Member single_exec ;
Member team_exec( self, team_size );
// Team shared memory
task_root_type * volatile * const task_shared =
(task_root_type **) team_exec.m_team_exec->scratch_thread();
// Barrier across entire Qthreads thread pool to ensure initialization.
#pragma omp barrier
// Loop until all queues are empty and no tasks in flight
do {
// Each team lead attempts to acquire either a thread team task
// or collection of single thread tasks for the team.
if ( 0 == team_exec.team_rank() ) {
task_root_type * tmp =
0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == tmp ; ++i ) {
for ( int j = 0 ; j < 2 && end == tmp ; ++j ) {
tmp = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
*task_shared = tmp ;
// Fence to be sure shared_task_array is stored
Kokkos::memory_fence();
}
// Whole team waits for every team member to reach this statement
team_exec.team_barrier();
Kokkos::memory_fence();
task_root_type * const task = *task_shared ;
#if 0
fprintf( stdout,
"\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n",
team_exec.m_group_rank,
team_exec.m_team_rank,
uintptr_t(task_shared),
uintptr_t(task)
);
fflush(stdout);
#endif
if ( 0 == task ) break ; // 0 == m_ready_count
if ( end == task ) {
team_exec.team_barrier();
}
else if ( task_root_type::TaskTeam == task->m_task_type ) {
// Thread Team Task
(*task->m_apply)( task, & team_exec );
// The m_apply function performs a barrier
if ( 0 == team_exec.team_rank() ) {
// team member #0 completes the task, which may delete the task
queue->complete( task );
}
}
else {
// Single Thread Task
if ( 0 == team_exec.team_rank() ) {
(*task->m_apply)( task, & single_exec );
queue->complete( task );
}
// All team members wait for whole team to reach this statement.
// Not necessary to complete the task.
// Is necessary to prevent task_shared from being updated
// before it is read by all threads.
team_exec.team_barrier();
}
} while(1);
}
// END #pragma omp parallel
}
void TaskQueueSpecialization< Kokkos::Qthreads >::
iff_single_thread_recursive_execute
( TaskQueue< Kokkos::Qthreads > * const queue )
{
using execution_space = Kokkos::Qthreads ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space, void, void > ;
using Member = TaskExec< execution_space > ;
if ( 1 == omp_get_num_threads() ) {
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec ;
task_root_type * task = end ;
do {
task = end ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
if ( end == task ) break ;
(*task->m_apply)( task, & single_exec );
queue->complete( task );
} while(1);
}
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -0,0 +1,156 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP
#define KOKKOS_IMPL_QTHREADS_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class TaskQueueSpecialization< Kokkos::Qthreads >
{
public:
using execution_space = Kokkos::Qthreads ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space, void, void > ;
// Must specify memory space
using memory_space = Kokkos::HostSpace ;
static
void iff_single_thread_recursive_execute( queue_type * const );
// Must provide task queue execution function
static void execute( queue_type * const );
// Must provide mechanism to set function pointer in
// execution space from the host process.
template< typename FunctorType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< execution_space,
typename FunctorType::value_type,
FunctorType
> ;
*ptr = TaskType::apply ;
}
};
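// A minimal usage sketch of proc_set_apply (illustrative only; 'MyFunctor'
// is a hypothetical functor type, not part of this commit):
//
//   using QSpec = TaskQueueSpecialization< Kokkos::Qthreads > ;
//   QSpec::task_base_type::function_type fn = 0 ;
//   QSpec::proc_set_apply< MyFunctor >( & fn );
//   // fn now points at TaskBase< Qthreads, MyFunctor::value_type, MyFunctor >::apply.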
extern template class TaskQueue< Kokkos::Qthreads > ;
//----------------------------------------------------------------------------
template<>
class TaskExec< Kokkos::Qthreads >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
using PoolExec = Kokkos::Impl::QthreadsExec ;
friend class Kokkos::Impl::TaskQueue< Kokkos::Qthreads > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Qthreads > ;
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
int64_t m_sync_mask ;
int64_t mutable m_sync_value ;
int mutable m_sync_step ;
int m_group_rank ; ///< Which "team" subset of thread pool
int m_team_rank ; ///< Which thread within a team
int m_team_size ;
TaskExec();
TaskExec( PoolExec & arg_exec, int arg_team_size );
public:
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void * team_shared() const
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
int team_shared_size() const
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
/**\brief Whole team enters this function call
* before any team member returns from
* this function call.
*/
void team_barrier() const ;
#else
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
#endif
KOKKOS_INLINE_FUNCTION
int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION
int team_size() const { return m_team_size ; }
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP */

View File

@ -41,11 +41,11 @@
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
// Experimental unified task-data parallel manycore LDRD.
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ENABLE_QTHREAD )
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <stdio.h>
@ -56,17 +56,15 @@
#include <string>
#include <Kokkos_Atomic.hpp>
#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
#include <Qthreads/Kokkos_Qthreads_TaskPolicy.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
typedef TaskMember< Kokkos::Qthread , void , void > Task ;
typedef TaskMember< Kokkos::Qthreads , void , void > Task ;
namespace {
@ -173,16 +171,16 @@ Task::TaskMember( const function_dealloc_type arg_dealloc
void Task::throw_error_add_dependence() const
{
std::cerr << "TaskMember< Qthread >::add_dependence ERROR"
std::cerr << "TaskMember< Qthreads >::add_dependence ERROR"
<< " state(" << m_state << ")"
<< " dep_size(" << m_dep_size << ")"
<< std::endl ;
throw std::runtime_error("TaskMember< Qthread >::add_dependence ERROR");
throw std::runtime_error("TaskMember< Qthreads >::add_dependence ERROR");
}
void Task::throw_error_verify_type()
{
throw std::runtime_error("TaskMember< Qthread >::verify_type ERROR");
throw std::runtime_error("TaskMember< Qthreads >::verify_type ERROR");
}
//----------------------------------------------------------------------------
@ -190,7 +188,7 @@ void Task::throw_error_verify_type()
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
{
static const char msg_error_header[] = "Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR" ;
static const char msg_error_header[] = "Kokkos::Impl::TaskManager<Kokkos::Qthreads>::assign ERROR" ;
static const char msg_error_count[] = ": negative reference count" ;
static const char msg_error_complete[] = ": destroy task that is not complete" ;
static const char msg_error_dependences[] = ": destroy task that has dependences" ;
@ -294,7 +292,7 @@ fflush(stdout);
assign( & m_dep[i] , 0 );
}
// Set qthread FEB to full so that dependent tasks are allowed to execute.
// Set Qthreads FEB to full so that dependent tasks are allowed to execute.
// This 'task' may be deleted immediately following this function call.
qthread_fill( & m_qfeb );
@ -319,10 +317,10 @@ aligned_t Task::qthread_func( void * arg )
);
if ( task->m_apply_team && ! task->m_apply_single ) {
Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
Kokkos::Impl::QthreadsTeamPolicyMember::TaskTeam task_team_tag ;
// Initialize team size and rank with shepherd info
Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );
Kokkos::Impl::QthreadsTeamPolicyMember member( task_team_tag );
(*task->m_apply_team)( task , member );
@ -344,7 +342,7 @@ fflush(stdout);
}
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
// Team hard-wired to one, no cloning
Kokkos::Impl::QthreadTeamPolicyMember member ;
Kokkos::Impl::QthreadsTeamPolicyMember member ;
(*task->m_apply_team)( task , member );
task->closeout();
}
@ -384,8 +382,8 @@ void Task::schedule()
// Increment active task count before spawning.
Kokkos::atomic_increment( m_active_count );
// spawn in qthread. must malloc the precondition array and give to qthread.
// qthread will eventually free this allocation so memory will not be leaked.
// spawn in Qthreads. must malloc the precondition array and give to Qthreads.
// Qthreads will eventually free this allocation so memory will not be leaked.
// concern with thread safety of malloc, does this need to be guarded?
aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );
@ -393,7 +391,7 @@ void Task::schedule()
qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
for ( int i = 0 ; i < m_dep_size ; ++i ) {
qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthreads precondition flag
}
if ( m_apply_team && ! m_apply_single ) {
@ -446,7 +444,7 @@ fflush(stdout);
namespace Kokkos {
namespace Experimental {
TaskPolicy< Kokkos::Qthread >::
TaskPolicy< Kokkos::Qthreads >::
TaskPolicy
( const unsigned /* arg_task_max_count */
, const unsigned /* arg_task_max_size */
@ -462,7 +460,7 @@ TaskPolicy
if ( m_team_size != 1 && m_team_size != num_worker_per_shepherd ) {
std::ostringstream msg ;
msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Qthread >( "
msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads >( "
<< "default_depedence = " << arg_task_default_dependence_capacity
<< " , team_size = " << arg_task_team_size
<< " ) ERROR, valid team_size arguments are { (omitted) , 1 , " << num_worker_per_shepherd << " }" ;
@ -470,14 +468,14 @@ TaskPolicy
}
}
TaskPolicy< Kokkos::Qthread >::member_type &
TaskPolicy< Kokkos::Qthread >::member_single()
TaskPolicy< Kokkos::Qthreads >::member_type &
TaskPolicy< Kokkos::Qthreads >::member_single()
{
static member_type s ;
return s ;
}
void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads > & policy )
{
volatile int * const active_task_count = & policy.m_active_count ;
while ( *active_task_count ) qthread_yield();
@ -486,6 +484,5 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
} // namespace Experimental
} // namespace Kokkos
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #if defined( KOKKOS_ENABLE_QTHREAD ) */
#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
#endif // #if defined( KOKKOS_ENABLE_QTHREADS )

View File

@ -43,15 +43,15 @@
// Experimental unified task-data parallel manycore LDRD
#ifndef KOKKOS_QTHREAD_TASKSCHEDULER_HPP
#define KOKKOS_QTHREAD_TASKSCHEDULER_HPP
#ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP
#define KOKKOS_QTHREADS_TASKSCHEDULER_HPP
#include <string>
#include <typeinfo>
#include <stdexcept>
//----------------------------------------------------------------------------
// Defines to enable experimental Qthread functionality
// Defines to enable experimental Qthreads functionality
#define QTHREAD_LOCAL_PRIORITY
#define CLONED_TASKS
@ -63,7 +63,7 @@
//----------------------------------------------------------------------------
#include <Kokkos_Qthread.hpp>
#include <Kokkos_Qthreads.hpp>
#include <Kokkos_TaskScheduler.hpp>
#include <Kokkos_View.hpp>
@ -78,13 +78,13 @@ namespace Experimental {
namespace Impl {
template<>
class TaskMember< Kokkos::Qthread , void , void >
class TaskMember< Kokkos::Qthreads , void , void >
{
public:
typedef TaskMember * (* function_verify_type) ( TaskMember * );
typedef void (* function_single_type) ( TaskMember * );
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::QthreadsTeamPolicyMember & );
typedef void (* function_dealloc_type)( TaskMember * );
private:
@ -94,7 +94,7 @@ private:
const function_single_type m_apply_single ; ///< Apply function
const function_team_type m_apply_team ; ///< Apply function
int volatile * const m_active_count ; ///< Count of active tasks on this policy
aligned_t m_qfeb ; ///< Qthread full/empty bit
aligned_t m_qfeb ; ///< Qthreads full/empty bit
TaskMember ** const m_dep ; ///< Dependences
const int m_dep_capacity ; ///< Capacity of dependences
int m_dep_size ; ///< Actual count of dependences
@ -129,7 +129,7 @@ protected :
~TaskMember();
// Used by TaskMember< Qthread , ResultType , void >
// Used by TaskMember< Qthreads , ResultType , void >
TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
@ -139,7 +139,7 @@ protected :
, const unsigned arg_dependence_capacity
);
// Used for TaskMember< Qthread , void , void >
// Used for TaskMember< Qthreads , void , void >
TaskMember( const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
@ -175,15 +175,15 @@ public:
/* Inheritance Requirements on task types:
* typedef FunctorType::value_type value_type ;
* class DerivedTaskType
* : public TaskMember< Qthread , value_type , FunctorType >
* : public TaskMember< Qthreads , value_type , FunctorType >
* { ... };
* class TaskMember< Qthread , value_type , FunctorType >
* : public TaskMember< Qthread , value_type , void >
* class TaskMember< Qthreads , value_type , FunctorType >
* : public TaskMember< Qthreads , value_type , void >
* , public Functor
* { ... };
* If value_type != void
* class TaskMember< Qthread , value_type , void >
* : public TaskMember< Qthread , void , void >
* class TaskMember< Qthreads , value_type , void >
* : public TaskMember< Qthreads , void , void >
*
* Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
*
@ -300,10 +300,10 @@ public:
KOKKOS_INLINE_FUNCTION static
void apply_single( typename std::enable_if< ! std::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
// TaskMember< Kokkos::Qthread , ResultType , FunctorType >
// : public TaskMember< Kokkos::Qthread , ResultType , void >
// TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
// : public TaskMember< Kokkos::Qthreads , ResultType , void >
// , public FunctorType
// { ... };
@ -316,10 +316,10 @@ public:
KOKKOS_INLINE_FUNCTION static
void apply_single( typename std::enable_if< std::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
// TaskMember< Kokkos::Qthread , ResultType , FunctorType >
// : public TaskMember< Kokkos::Qthread , ResultType , void >
// TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
// : public TaskMember< Kokkos::Qthreads , ResultType , void >
// , public FunctorType
// { ... };
@ -333,9 +333,9 @@ public:
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_team( typename std::enable_if< ! std::is_same< ResultType , void >::value , TaskMember * >::type t
, Kokkos::Impl::QthreadTeamPolicyMember & member )
, Kokkos::Impl::QthreadsTeamPolicyMember & member )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
derived_type & m = * static_cast< derived_type * >( t );
@ -345,9 +345,9 @@ public:
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_team( typename std::enable_if< std::is_same< ResultType , void >::value , TaskMember * >::type t
, Kokkos::Impl::QthreadTeamPolicyMember & member )
, Kokkos::Impl::QthreadsTeamPolicyMember & member )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
derived_type & m = * static_cast< derived_type * >( t );
@ -356,7 +356,7 @@ public:
};
//----------------------------------------------------------------------------
/** \brief Base class for tasks with a result value in the Qthread execution space.
/** \brief Base class for tasks with a result value in the Qthreads execution space.
*
* The FunctorType must be void because this class is accessed by the
* Future class for the task and result value.
@ -365,8 +365,8 @@ public:
* can correctly static_cast from the 'root class' to this class.
*/
template < class ResultType >
class TaskMember< Kokkos::Qthread , ResultType , void >
: public TaskMember< Kokkos::Qthread , void , void >
class TaskMember< Kokkos::Qthreads , ResultType , void >
: public TaskMember< Kokkos::Qthreads , void , void >
{
public:
@ -379,7 +379,7 @@ public:
protected:
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef TaskMember< Kokkos::Qthreads , void , void > task_root_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_single_type function_single_type ;
typedef task_root_type::function_team_type function_team_type ;
@ -404,16 +404,16 @@ protected:
};
template< class ResultType , class FunctorType >
class TaskMember< Kokkos::Qthread , ResultType , FunctorType >
: public TaskMember< Kokkos::Qthread , ResultType , void >
class TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
: public TaskMember< Kokkos::Qthreads , ResultType , void >
, public FunctorType
{
public:
typedef FunctorType functor_type ;
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ;
typedef TaskMember< Kokkos::Qthreads , void , void > task_root_type ;
typedef TaskMember< Kokkos::Qthreads , ResultType , void > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_single_type function_single_type ;
typedef task_root_type::function_team_type function_team_type ;
@ -447,16 +447,16 @@ public:
namespace Kokkos {
namespace Experimental {
void wait( TaskPolicy< Kokkos::Qthread > & );
void wait( TaskPolicy< Kokkos::Qthreads > & );
template<>
class TaskPolicy< Kokkos::Qthread >
class TaskPolicy< Kokkos::Qthreads >
{
public:
typedef Kokkos::Qthread execution_space ;
typedef Kokkos::Qthreads execution_space ;
typedef TaskPolicy execution_policy ;
typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ;
typedef Kokkos::Impl::QthreadsTeamPolicyMember member_type ;
private:
@ -650,7 +650,7 @@ public:
static member_type & member_single();
friend void wait( TaskPolicy< Kokkos::Qthread > & );
friend void wait( TaskPolicy< Kokkos::Qthreads > & );
};
} /* namespace Experimental */
@ -660,5 +660,5 @@ public:
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #define KOKKOS_QTHREAD_TASK_HPP */
#endif /* #ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP */

View File

@ -0,0 +1,319 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief Manage task allocation, deallocation, and scheduling.
*
* Task execution is handled here directly for the Qthreads implementation.
*/
template<>
class TaskQueue< Kokkos::Qthreads > {
private:
using execution_space = Kokkos::Qthreads ;
using memory_space = Kokkos::HostSpace ;
using device_type = Kokkos::Device< execution_space, memory_space > ;
using memory_pool = Kokkos::Experimental::MemoryPool< device_type > ;
using task_root_type = Kokkos::Impl::TaskBase< execution_space, void, void > ;
using specialization = Kokkos::Impl::TaskQueueSpecialization< execution_space > ;
friend class Kokkos::TaskScheduler< execution_space > ;
struct Destroy {
TaskQueue * m_queue ;
void destroy_shared_allocation();
};
//----------------------------------------
enum : int { TASK_STATE_NULL = 0, ///< Does not exist
TASK_STATE_CONSTRUCTING = 1, ///< Is under construction
TASK_STATE_WAITING = 2, ///< Is waiting for execution
TASK_STATE_EXECUTING = 4, ///< Is executing
TASK_STATE_RESPAWN = 8, ///< Requested respawn
TASK_STATE_COMPLETE = 16 ///< Execution is complete
};
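// Sketch of the task life cycle implied by schedule(), reschedule(), and
// complete() below (inferred from their pre/postcondition comments, not a
// normative diagram):
//
//   TASK_STATE_CONSTRUCTING -> TASK_STATE_WAITING -> TASK_STATE_EXECUTING
//     -> TASK_STATE_COMPLETE, or, if respawn was requested while executing,
//     -> TASK_STATE_RESPAWN -> TASK_STATE_WAITING (via schedule()).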
// Queue is organized as [ priority ][ type ]
memory_pool m_memory ;
unsigned m_team_size ; // Number of threads in a team
long m_accum_alloc ; // Accumulated number of allocations
int m_count_alloc ; // Current number of allocations
int m_max_alloc ; // Maximum number of allocations
int m_ready_count ; // Number of ready or executing
//----------------------------------------
~TaskQueue();
TaskQueue() = delete ;
TaskQueue( TaskQueue && ) = delete ;
TaskQueue( TaskQueue const & ) = delete ;
TaskQueue & operator = ( TaskQueue && ) = delete ;
TaskQueue & operator = ( TaskQueue const & ) = delete ;
TaskQueue
( const memory_space & arg_space,
unsigned const arg_memory_pool_capacity,
unsigned const arg_memory_pool_superblock_capacity_log2
);
// Schedule a task
// Precondition:
// task is not executing
// task->m_next is the dependence or zero
// Postcondition:
// task->m_next is linked list membership
KOKKOS_FUNCTION
void schedule( task_root_type * const );
// Reschedule a task
// Precondition:
// task is in Executing state
// task->m_next == LockTag
// Postcondition:
// task is in Executing-Respawn state
// task->m_next == 0 (no dependence)
KOKKOS_FUNCTION
void reschedule( task_root_type * );
// Complete a task
// Precondition:
// task is not executing
// task->m_next == LockTag => task is complete
// task->m_next != LockTag => task is respawn
// Postcondition:
// task->m_wait == LockTag => task is complete
// task->m_wait != LockTag => task is waiting
KOKKOS_FUNCTION
void complete( task_root_type * );
public:
// If and only if the execution space is a single thread
// then execute ready tasks.
KOKKOS_INLINE_FUNCTION
void iff_single_thread_recursive_execute()
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
specialization::iff_single_thread_recursive_execute( this );
#endif
}
void execute() { specialization::execute( this ); }
template< typename FunctorType >
void proc_set_apply( typename task_root_type::function_type * ptr )
{
specialization::template proc_set_apply< FunctorType >( ptr );
}
// Assign task pointer with reference counting of assigned tasks
template< typename LV, typename RV >
KOKKOS_FUNCTION static
void assign( TaskBase< execution_space, LV, void > ** const lhs,
TaskBase< execution_space, RV, void > * const rhs )
{
using task_lhs = TaskBase< execution_space, LV, void > ;
#if 0
{
printf( "assign( 0x%lx { 0x%lx %d %d }, 0x%lx { 0x%lx %d %d } )\n",
uintptr_t( lhs ? *lhs : 0 ),
uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 ),
int( lhs && *lhs ? (*lhs)->m_task_type : 0 ),
int( lhs && *lhs ? (*lhs)->m_ref_count : 0 ),
uintptr_t(rhs),
uintptr_t( rhs ? rhs->m_next : 0 ),
int( rhs ? rhs->m_task_type : 0 ),
int( rhs ? rhs->m_ref_count : 0 )
);
fflush( stdout );
}
#endif
if ( *lhs )
{
const int count = Kokkos::atomic_fetch_add( &((*lhs)->m_ref_count), -1 );
if ( ( 1 == count ) && ( (*lhs)->m_state == TASK_STATE_COMPLETE ) ) {
// Reference count is zero and task is complete, deallocate.
(*lhs)->m_queue->deallocate( *lhs, (*lhs)->m_alloc_size );
}
else if ( count <= 1 ) {
Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" );
}
// GEM: Should I check that there are no dependences here? Can the state
// be set to complete while there are still dependences?
}
if ( rhs ) { Kokkos::atomic_fetch_add( &(rhs->m_ref_count), 1 ); }
// Force write of *lhs
*static_cast< task_lhs * volatile * >(lhs) = rhs ;
Kokkos::memory_fence();
}
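// Usage sketch for assign() (hypothetical pointers, for illustration):
//
//   task_root_type * a = /* obtained from a queue allocation */ ;
//   task_root_type * b = 0 ;
//   assign( & b, a );  // b shares ownership; a->m_ref_count incremented
//   assign( & b, 0 );  // reference dropped; if the count reaches zero and
//                      // a->m_state == TASK_STATE_COMPLETE, a is deallocated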
KOKKOS_FUNCTION
size_t allocate_block_size( size_t n ); ///< Actual block size allocated
KOKKOS_FUNCTION
void * allocate( size_t n ); ///< Allocate from the memory pool
KOKKOS_FUNCTION
void deallocate( void * p, size_t n ); ///< Deallocate to the memory pool
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class TaskBase< Kokkos::Qthreads, void, void >
{
public:
enum : int16_t { TaskTeam = TaskBase< void, void, void >::TaskTeam,
TaskSingle = TaskBase< void, void, void >::TaskSingle,
Aggregate = TaskBase< void, void, void >::Aggregate };
enum : uintptr_t { LockTag = TaskBase< void, void, void >::LockTag,
EndTag = TaskBase< void, void, void >::EndTag };
using execution_space = Kokkos::Qthreads ;
using queue_type = TaskQueue< execution_space > ;
template< typename > friend class Kokkos::TaskScheduler ;
typedef void (* function_type) ( TaskBase *, void * );
// sizeof(TaskBase) == 48 for the common fields below; m_qfeb and m_state add Qthreads-specific state.
function_type m_apply ; ///< Apply function pointer
queue_type * m_queue ; ///< Queue in which this task resides
TaskBase * m_dep ; ///< Dependence
int32_t m_ref_count ; ///< Reference count
int32_t m_alloc_size ; ///< Allocation size
int32_t m_dep_count ; ///< Aggregate's number of dependences
int16_t m_task_type ; ///< Type of task
int16_t m_priority ; ///< Priority of runnable task
aligned_t m_qfeb ; ///< Qthread full/empty bit
int m_state ; ///< State of the task
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
TaskBase & operator = ( const TaskBase & ) = delete ;
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
KOKKOS_INLINE_FUNCTION
TaskBase() noexcept
: m_apply(0),
m_queue(0),
m_dep(0),
m_ref_count(0),
m_alloc_size(0),
m_dep_count(0),
m_task_type( TaskSingle ),
m_priority( 1 /* TaskRegularPriority */ ),
m_qfeb(0),
m_state( queue_type::TASK_STATE_CONSTRUCTING )
{
qthread_empty( & m_qfeb ); // Set to full when complete
}
//----------------------------------------
static aligned_t qthread_func( void * arg );
KOKKOS_INLINE_FUNCTION
TaskBase ** aggregate_dependences()
{ return reinterpret_cast<TaskBase**>( this + 1 ); }
KOKKOS_INLINE_FUNCTION
bool requested_respawn()
{ return m_state == queue_type::TASK_STATE_RESPAWN; }
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskBase* dep )
{
// Assign dependence to m_dep. It will be processed in the subsequent
// call to schedule. Error if the dependence is reset.
if ( 0 != Kokkos::atomic_exchange( & m_dep, dep ) ) {
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
}
if ( 0 != dep ) {
// The future may be destroyed upon returning from this call
// so increment reference count to track this assignment.
Kokkos::atomic_fetch_add( &(dep->m_ref_count), 1 );
}
}
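// Intended calling sequence (sketch; 'task', 'dep', and 'queue' are
// hypothetical pointers):
//
//   task->add_dependence( dep ); // store dep in m_dep, bump dep's ref count
//   queue->schedule( task );     // schedule() consumes m_dep as precondition
//
// A second add_dependence() without an intervening schedule() aborts,
// because the atomic_exchange above would observe a non-zero m_dep.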
using get_return_type = void ;
KOKKOS_INLINE_FUNCTION
get_return_type get() const {}
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -0,0 +1,436 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ENABLE_TASKPOLICY )
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
template< typename ExecSpace >
void TaskQueue< ExecSpace >::Destroy::destroy_shared_allocation()
{
m_queue->~TaskQueue();
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
TaskQueue< ExecSpace >::TaskQueue
( const typename TaskQueue< ExecSpace >::memory_space & arg_space,
unsigned const arg_memory_pool_capacity,
unsigned const arg_memory_pool_superblock_capacity_log2 )
: m_memory( arg_space,
arg_memory_pool_capacity,
arg_memory_pool_superblock_capacity_log2 ),
m_team_size( unsigned( qthread_num_workers_local(NO_SHEPHERD) ) ),
m_accum_alloc(0),
m_count_alloc(0),
m_max_alloc(0),
m_ready_count(0)
{}
//----------------------------------------------------------------------------
template< typename ExecSpace >
TaskQueue< ExecSpace >::~TaskQueue()
{
// Verify that ready count is zero.
if ( 0 != m_ready_count ) {
Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready or executing tasks");
}
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
size_t TaskQueue< ExecSpace >::allocate_block_size( size_t n )
{
return m_memory.allocate_block_size( n );
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void * TaskQueue< ExecSpace >::allocate( size_t n )
{
void * const p = m_memory.allocate(n);
if ( p ) {
Kokkos::atomic_increment( & m_accum_alloc );
Kokkos::atomic_increment( & m_count_alloc );
if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
}
return p ;
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::deallocate( void * p, size_t n )
{
m_memory.deallocate( p, n );
Kokkos::atomic_decrement( & m_count_alloc );
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::schedule
( typename TaskQueue< ExecSpace >::task_root_type * const task )
{
#if 0
printf( "schedule( 0x%lx { %d %d %d }\n",
uintptr_t(task),
task->m_task_type,
task->m_priority,
task->m_ref_count );
#endif
// The task has been constructed and is waiting to be executed.
task->m_state = TASK_STATE_WAITING ;
if ( task->m_task_type != task_root_type::Aggregate ) {
// Scheduling a single or team task.
// Increment active task count before spawning.
Kokkos::atomic_increment( & m_ready_count );
if ( task->m_dep == 0 ) {
// Schedule a task with no dependences.
if ( task_root_type::TaskTeam == task->m_task_type && m_team_size > 1 ) {
// If there is more than one shepherd, spawn on a shepherd other than this one.
const int num_shepherd = qthread_num_shepherds();
const int this_shepherd = qthread_shep();
int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;
#if 0
fprintf( stdout,
"worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n",
qthread_shep(),
qthread_worker_local(NULL),
reinterpret_cast<unsigned long>(this),
spawn_shepherd,
m_team_size - 1
);
fflush(stdout);
#endif
qthread_spawn_cloneable(
& task_root_type::qthread_func,
task,
0,
NULL,
0, // no dependences
0, // dependences array
spawn_shepherd,
unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ),
m_team_size - 1
);
}
else {
qthread_spawn(
& task_root_type::qthread_func,
task,
0,
NULL,
0, // no dependences
0, // dependences array
NO_SHEPHERD,
QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
);
}
}
else if ( task->m_dep->m_task_type != task_root_type::Aggregate ) {
// Schedule a task with a single, non-aggregate dependence.
// Malloc the precondition array to pass to qthread_spawn(): the
// dependence count followed by the single precondition flag. Qthreads
// will eventually free this allocation so memory will not be leaked.
// Is malloc thread-safe? Should this call be guarded? The memory can't
// be allocated from the pool allocator because Qthreads frees it using
// free().
aligned_t ** qprecon = (aligned_t **) malloc( 2 * sizeof(aligned_t *) );
qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(1) );
qprecon[1] = & task->m_dep->m_qfeb ; // Qthreads precondition flag
if ( task->m_task_type == task_root_type::TaskTeam && m_team_size > 1) {
// If there is more than one shepherd, spawn on a shepherd other than this one.
const int num_shepherd = qthread_num_shepherds();
const int this_shepherd = qthread_shep();
int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;
#if 0
fprintf( stdout,
"worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n",
qthread_shep(),
qthread_worker_local(NULL),
reinterpret_cast<unsigned long>(this),
spawn_shepherd,
m_team_size - 1
);
fflush(stdout);
#endif
qthread_spawn_cloneable(
& task_root_type::qthread_func,
task,
0,
NULL,
1, /* dependence count */
qprecon, /* dependences */
spawn_shepherd,
unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ),
m_team_size - 1
);
}
else {
qthread_spawn(
& task_root_type::qthread_func, /* function */
task, /* function argument */
0,
NULL,
1, /* dependence count */
qprecon, /* dependences */
NO_SHEPHERD,
QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
);
}
}
}
else {
// GEM: How do I handle an aggregate (when_all) task?
}
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::reschedule( task_root_type * task )
{
// Precondition:
// task is in Executing state
// task->m_next == LockTag
//
// Postcondition:
// task is in Executing-Respawn state
// task->m_next == 0 (no dependence)
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
Kokkos::abort("TaskScheduler::respawn ERROR: already respawned");
}
}
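// Respawn sketch: while executing, a task's apply function may request a
// respawn; complete() below then observes m_next != LockTag and calls
// schedule() again instead of transitioning the task to complete.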
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::complete
( typename TaskQueue< ExecSpace >::task_root_type * task )
{
// Complete a runnable task that has finished executing
// or a when_all task when all of its dependences are complete.
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
#if 0
printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n",
uintptr_t(task),
uintptr_t(task->m_wait),
uintptr_t(task->m_next),
task->m_task_type,
task->m_priority,
task->m_ref_count
);
fflush( stdout );
#endif
const bool runnable = task_root_type::Aggregate != task->m_task_type ;
//----------------------------------------
if ( runnable && lock != task->m_next ) {
// A runnable task has finished executing and requested respawn.
// Schedule the task for subsequent execution.
schedule( task );
}
//----------------------------------------
else {
// Either an aggregate or a runnable task that executed
// and did not respawn. Transition this task to complete.
// If 'task' is an aggregate then any of the runnable tasks that
// it depends upon may be attempting to complete this 'task'.
// A task must be transitioned to complete status exactly once.
// This is controlled by atomically locking the wait queue.
// Stop other tasks from adding themselves to this task's wait queue
// by locking the head of this task's wait queue.
task_root_type * x = Kokkos::atomic_exchange( & task->m_wait, lock );
if ( x != (task_root_type *) lock ) {
// This thread has transitioned this 'task' to complete.
// 'task' is no longer in a queue and is not executing
// so decrement the reference count from 'task's creation.
// If no other references to this 'task' then it will be deleted.
TaskQueue::assign( & task, zero );
// This thread has exclusive access to the wait list so
// the concurrency-safe pop_task function is not needed.
// Schedule the tasks that have been waiting on the input 'task',
// which may have been deleted.
while ( x != end ) {
// Set x->m_next = zero <= no dependence
task_root_type * const next =
(task_root_type *) Kokkos::atomic_exchange( & x->m_next, zero );
schedule( x );
x = next ;
}
}
}
if ( runnable ) {
// A runnable task was popped from a ready queue and executed.
// If respawned into a ready queue then the ready count was incremented
// so decrement whether respawned or not.
Kokkos::atomic_decrement( & m_ready_count );
}
}
//----------------------------------------------------------------------------
template<>
aligned_t
TaskBase< Kokkos::Qthreads, void, void >::qthread_func( void * arg )
{
using execution_space = Kokkos::Qthreads ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = Kokkos::Impl::QthreadsTeamPolicyMember;
task_root_type * const task = reinterpret_cast< task_root_type * >( arg );
// The first member of the team changes the state to executing.
// Use compare-exchange to avoid race condition with a respawn.
Kokkos::atomic_compare_exchange_strong( & task->m_state,
queue_type::TASK_STATE_WAITING,
queue_type::TASK_STATE_EXECUTING
);
if ( task_root_type::TaskTeam == task->m_task_type )
{
if ( 1 < task->m_queue->m_team_size ) {
// Team task with team size of more than 1.
Member::TaskTeam task_team_tag ;
// Initialize team size and rank with shepherd info
Member member( task_team_tag );
(*task->m_apply)( task , & member );
#if 0
fprintf( stdout,
"worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n",
qthread_shep(),
qthread_worker_local(NULL),
reinterpret_cast<unsigned long>(task),
member.team_rank(),
member.team_size()
);
fflush(stdout);
#endif
member.team_barrier();
if ( member.team_rank() == 0 ) task->closeout();
member.team_barrier();
}
else {
// Team task with team size of 1.
Member member ;
(*task->m_apply)( task , & member );
task->closeout();
}
}
else {
(*task->m_apply)( task );
task->closeout();
}
#if 0
fprintf( stdout
, "worker(%d.%d) task 0x%.12lx return\n"
, qthread_shep()
, qthread_worker_local(NULL)
, reinterpret_cast<unsigned long>(task)
);
fflush(stdout);
#endif
return 0 ;
}
} /* namespace Impl */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -22,4 +22,3 @@ sh autogen.sh
# install
make install

View File

@ -264,7 +264,7 @@ void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
for ( int i = 0 ; i < n ; ++i ) {
Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
exec.m_pool_state = ThreadsExec::Inactive ;
@ -308,7 +308,7 @@ void ThreadsExec::fence()
{
if ( s_thread_pool_size[0] ) {
// Wait for the root thread to complete:
Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
}
s_current_function = 0 ;
@ -724,7 +724,7 @@ void ThreadsExec::initialize( unsigned thread_count ,
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
@ -777,7 +777,7 @@ void ThreadsExec::finalize()
s_threads_process.m_pool_fan_size = 0 ;
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}

View File

@ -187,13 +187,13 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
@ -229,13 +229,13 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
@ -264,7 +264,7 @@ public:
ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , reduce_memory() , fan.reduce_memory() );
}
@ -280,7 +280,7 @@ public:
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
}
}
@ -312,7 +312,7 @@ public:
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: Active -> ReductionAvailable (or ScanAvailable)
Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , work_value , fan.reduce_memory() );
}
@ -330,8 +330,8 @@ public:
// Wait: Active -> ReductionAvailable
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable );
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::ReductionAvailable );
Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
}
@ -342,7 +342,7 @@ public:
// Wait for all threads to complete inclusive scan
// Wait: ScanAvailable -> Rendezvous
Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable );
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanAvailable );
}
//--------------------------------
@ -350,7 +350,7 @@ public:
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable );
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::ReductionAvailable );
// Set: ScanAvailable -> Rendezvous
fan.m_pool_state = ThreadsExec::Rendezvous ;
}
@ -377,13 +377,13 @@ public:
// Wait for all threads to copy previous thread's inclusive scan value
// Wait for all threads: Rendezvous -> ScanCompleted
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
}
if ( rev_rank ) {
// Set: ScanAvailable -> ScanCompleted
m_pool_state = ThreadsExec::ScanCompleted ;
// Wait: ScanCompleted -> Active
Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted );
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanCompleted );
}
// Set: ScanCompleted -> Active
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
@ -410,7 +410,7 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
@ -418,7 +418,7 @@ public:
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the thread-scan before releasing threads

View File

@ -49,6 +49,7 @@
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <Kokkos_Atomic.hpp>
@ -103,13 +104,13 @@ public:
// Wait for fan-in threads
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
Impl::spinwait( m_team_base[j]->state() , ThreadsExec::Active );
Impl::spinwait_while_equal( m_team_base[j]->state() , ThreadsExec::Active );
}
// If not root then wait for release
if ( m_team_rank_rev ) {
m_exec->state() = ThreadsExec::Rendezvous ;
Impl::spinwait( m_exec->state() , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal( m_exec->state() , ThreadsExec::Rendezvous );
}
return ! m_team_rank_rev ;
@ -350,6 +351,10 @@ public:
const int team_rank_rev = pool_rank_rev % team.team_alloc();
const size_t pool_league_size = m_exec->pool_size() / team.team_alloc() ;
const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ;
if(pool_league_rank_rev >= pool_league_size) {
m_invalid_thread = 1;
return;
}
const size_t pool_league_rank = pool_league_size - ( pool_league_rank_rev + 1 );
const int pool_num_teams = m_exec->pool_size()/team.team_alloc();
@ -505,7 +510,8 @@ private:
, const int team_size_request )
{
const int pool_size = traits::execution_space::thread_pool_size(0);
const int team_max = traits::execution_space::thread_pool_size(1);
const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size;
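// The requested team size is capped at both the thread pool size and
// HostThreadTeamData's limit on team members.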
const int team_grain = traits::execution_space::thread_pool_size(2);
m_league_size = league_size_request ;
@ -552,8 +558,12 @@ public:
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return traits::execution_space::thread_pool_size(1); }
int team_size_max( const FunctorType & ) {
int pool_size = traits::execution_space::thread_pool_size(1);
int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
return pool_size<max_host_team_size?pool_size:max_host_team_size;
}
template< class FunctorType >
static int team_size_recommended( const FunctorType & )
@ -819,9 +829,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::T
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
lambda(i,result);
}
}
@ -835,18 +843,14 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::T
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& result ) {
ValueType result = init_result;
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
lambda(i,result);
}
init_result = result;
}
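// Usage sketch for the join-based overload above ('team', 'a', and 'n'
// are hypothetical):
//
//   double max_val = 0 ;
//   parallel_reduce( ThreadVectorRange( team, n ),
//                    [&]( const int i, double & val )
//                      { if ( a(i) > val ) val = a(i) ; },
//                    []( double & dst, const double & src )
//                      { if ( src > dst ) dst = src ; },
//                    max_val );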
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)

File diff suppressed because it is too large

View File

@ -56,12 +56,13 @@ int bit_scan_forward( unsigned i )
{
#if defined( __CUDA_ARCH__ )
return __ffs(i) - 1;
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#elif defined( __INTEL_COMPILER )
#elif defined( KOKKOS_COMPILER_INTEL )
return _bit_scan_forward(i);
#elif defined( KOKKOS_COMPILER_IBM )
return __cnttz4(i);
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#else
unsigned t = 1u;
int r = 0;
while ( i && ( ( i & t ) == 0 ) )
@ -79,10 +80,12 @@ int bit_scan_reverse( unsigned i )
enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
#if defined( __CUDA_ARCH__ )
return shift - __clz(i);
#elif defined( KOKKOS_COMPILER_INTEL )
return _bit_scan_reverse(i);
#elif defined( KOKKOS_COMPILER_IBM )
return shift - __cntlz4(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return shift - __builtin_clz(i);
#elif defined( __INTEL_COMPILER )
return _bit_scan_reverse(i);
#else
unsigned t = 1u << shift;
int r = 0;
@ -101,10 +104,12 @@ int bit_count( unsigned i )
{
#if defined( __CUDA_ARCH__ )
return __popc(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#elif defined ( __INTEL_COMPILER )
return _popcnt32(i);
#elif defined( KOKKOS_COMPILER_IBM )
return __popcnt4(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#else
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
i = i - ( ( i >> 1 ) & ~0u / 3u ); // temp

View File

@ -147,7 +147,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
}
#endif
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
@ -155,7 +155,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
void finalize_internal( const bool all_spaces = false )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
@ -449,5 +449,323 @@ void fence()
Impl::fence_internal();
}
void print_configuration( std::ostream & out , const bool detail )
{
std::ostringstream msg;
msg << "Compiler:" << std::endl;
#ifdef KOKKOS_COMPILER_APPLECC
msg << " KOKKOS_COMPILER_APPLECC: " << KOKKOS_COMPILER_APPLECC << std::endl;
#endif
#ifdef KOKKOS_COMPILER_CLANG
msg << " KOKKOS_COMPILER_CLANG: " << KOKKOS_COMPILER_CLANG << std::endl;
#endif
#ifdef KOKKOS_COMPILER_CRAYC
msg << " KOKKOS_COMPILER_CRAYC: " << KOKKOS_COMPILER_CRAYC << std::endl;
#endif
#ifdef KOKKOS_COMPILER_GNU
msg << " KOKKOS_COMPILER_GNU: " << KOKKOS_COMPILER_GNU << std::endl;
#endif
#ifdef KOKKOS_COMPILER_IBM
msg << " KOKKOS_COMPILER_IBM: " << KOKKOS_COMPILER_IBM << std::endl;
#endif
#ifdef KOKKOS_COMPILER_INTEL
msg << " KOKKOS_COMPILER_INTEL: " << KOKKOS_COMPILER_INTEL << std::endl;
#endif
#ifdef KOKKOS_COMPILER_NVCC
msg << " KOKKOS_COMPILER_NVCC: " << KOKKOS_COMPILER_NVCC << std::endl;
#endif
#ifdef KOKKOS_COMPILER_PGI
msg << " KOKKOS_COMPILER_PGI: " << KOKKOS_COMPILER_PGI << std::endl;
#endif
msg << "Architecture:" << std::endl;
#ifdef KOKKOS_ENABLE_ISA_KNC
msg << " KOKKOS_ENABLE_ISA_KNC: yes" << std::endl;
#else
msg << " KOKKOS_ENABLE_ISA_KNC: no" << std::endl;
#endif
#ifdef KOKKOS_ENABLE_ISA_POWERPCLE
msg << " KOKKOS_ENABLE_ISA_POWERPCLE: yes" << std::endl;
#else
msg << " KOKKOS_ENABLE_ISA_POWERPCLE: no" << std::endl;
#endif
#ifdef KOKKOS_ENABLE_ISA_X86_64
msg << " KOKKOS_ENABLE_ISA_X86_64: yes" << std::endl;
#else
msg << " KOKKOS_ENABLE_ISA_X86_64: no" << std::endl;
#endif
msg << "Devices:" << std::endl;
msg << " KOKKOS_ENABLE_CUDA: ";
#ifdef KOKKOS_ENABLE_CUDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_OPENMP: ";
#ifdef KOKKOS_ENABLE_OPENMP
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PTHREAD: ";
#ifdef KOKKOS_ENABLE_PTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_STDTHREAD: ";
#ifdef KOKKOS_ENABLE_STDTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_WINTHREAD: ";
#ifdef KOKKOS_ENABLE_WINTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_QTHREADS: ";
#ifdef KOKKOS_ENABLE_QTHREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_SERIAL: ";
#ifdef KOKKOS_ENABLE_SERIAL
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Default Device:" << std::endl;
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Atomics:" << std::endl;
msg << " KOKKOS_ENABLE_CUDA_ATOMICS: ";
#ifdef KOKKOS_ENABLE_CUDA_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_GNU_ATOMICS: ";
#ifdef KOKKOS_ENABLE_GNU_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_INTEL_ATOMICS: ";
#ifdef KOKKOS_ENABLE_INTEL_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_OPENMP_ATOMICS: ";
#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_WINDOWS_ATOMICS: ";
#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Vectorization:" << std::endl;
msg << " KOKKOS_ENABLE_PRAGMA_IVDEP: ";
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_LOOPCOUNT: ";
#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_SIMD: ";
#ifdef KOKKOS_ENABLE_PRAGMA_SIMD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_UNROLL: ";
#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_VECTOR: ";
#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Memory:" << std::endl;
msg << " KOKKOS_ENABLE_HBWSPACE: ";
#ifdef KOKKOS_ENABLE_HBWSPACE
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_INTEL_MM_ALLOC: ";
#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_POSIX_MEMALIGN: ";
#ifdef KOKKOS_ENABLE_POSIX_MEMALIGN
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Options:" << std::endl;
msg << " KOKKOS_ENABLE_ASM: ";
#ifdef KOKKOS_ENABLE_ASM
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CXX1Z: ";
#ifdef KOKKOS_ENABLE_CXX1Z
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK: ";
#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_HWLOC: ";
#ifdef KOKKOS_ENABLE_HWLOC
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_LIBRT: ";
#ifdef KOKKOS_ENABLE_LIBRT
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_MPI: ";
#ifdef KOKKOS_ENABLE_MPI
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PROFILING: ";
#ifdef KOKKOS_ENABLE_PROFILING
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
#ifdef KOKKOS_ENABLE_CUDA
msg << "Cuda Options:" << std::endl;
msg << " KOKKOS_ENABLE_CUDA_LAMBDA: ";
#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: ";
#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: ";
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUDA_UVM: ";
#ifdef KOKKOS_ENABLE_CUDA_UVM
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUSPARSE: ";
#ifdef KOKKOS_ENABLE_CUSPARSE
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
#endif
msg << "\nRuntime Configuration:" << std::endl;
#ifdef KOKKOS_ENABLE_CUDA
Cuda::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_OPENMP
OpenMP::print_configuration(msg, detail);
#endif
#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD )
Threads::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_QTHREADS
Qthreads::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_SERIAL
Serial::print_configuration(msg, detail);
#endif
out << msg.str() << std::endl;
}
} // namespace Kokkos
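
For reference, a minimal sketch (not part of this diff) of how an application triggers the configuration dump above; Kokkos::print_configuration( std::ostream & , bool detail ) is the public entry point whose body ends here.

#include <Kokkos_Core.hpp>
#include <iostream>

int main( int argc , char * argv[] )
{
  Kokkos::initialize( argc , argv );
  // 'true' also requests the "Runtime Configuration" sections printed
  // by each enabled execution space at the end of the routine above.
  Kokkos::print_configuration( std::cout , true );
  Kokkos::finalize();
  return 0 ;
}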

View File

@ -0,0 +1,653 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_FUNCTORANALYSIS_HPP
#define KOKKOS_FUNCTORANALYSIS_HPP
#include <cstddef>
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Reducer.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
struct FunctorPatternInterface {
struct FOR {};
struct REDUCE {};
struct SCAN {};
};
/** \brief Query Functor and execution policy argument tag for value type.
*
* If 'value_type' is not explicitly declared in the functor
 * then attempt to deduce the type from the FunctorType::operator()
 * interface used by the pattern and policy.
*
* For the REDUCE pattern generate a Reducer and finalization function
* derived from what is available within the functor.
*/
template< typename PatternInterface , class Policy , class Functor >
struct FunctorAnalysis {
private:
using FOR = FunctorPatternInterface::FOR ;
using REDUCE = FunctorPatternInterface::REDUCE ;
using SCAN = FunctorPatternInterface::SCAN ;
//----------------------------------------
struct VOID {};
template< typename P = Policy , typename = std::false_type >
struct has_work_tag
{
using type = void ;
using wtag = VOID ;
};
template< typename P >
struct has_work_tag
< P , typename std::is_same< typename P::work_tag , void >::type >
{
using type = typename P::work_tag ;
using wtag = typename P::work_tag ;
};
using Tag = typename has_work_tag<>::type ;
using WTag = typename has_work_tag<>::wtag ;
//----------------------------------------
// Check for Functor::value_type, which is either a simple type T or T[]
template< typename F , typename = std::false_type >
struct has_value_type { using type = void ; };
template< typename F >
struct has_value_type
< F , typename std::is_same< typename F::value_type , void >::type >
{
using type = typename F::value_type ;
static_assert( ! std::is_reference< type >::value &&
std::rank< type >::value <= 1 &&
std::extent< type >::value == 0
, "Kokkos Functor::value_type is T or T[]" );
};
//----------------------------------------
// If Functor::value_type does not exist then evaluate operator(),
// depending upon the pattern and whether the policy has a work tag,
// to determine the reduction or scan value_type.
template< typename F
, typename P = PatternInterface
, typename V = typename has_value_type<F>::type
, bool T = std::is_same< Tag , void >::value
>
struct deduce_value_type { using type = V ; };
template< typename F >
struct deduce_value_type< F , REDUCE , void , true > {
template< typename M , typename A >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( M , A & ) const );
using type = decltype( deduce( & F::operator() ) );
};
template< typename F >
struct deduce_value_type< F , REDUCE , void , false > {
template< typename M , typename A >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag , M , A & ) const );
template< typename M , typename A >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag const & , M , A & ) const );
using type = decltype( deduce( & F::operator() ) );
};
template< typename F >
struct deduce_value_type< F , SCAN , void , true > {
template< typename M , typename A , typename I >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( M , A & , I ) const );
using type = decltype( deduce( & F::operator() ) );
};
template< typename F >
struct deduce_value_type< F , SCAN , void , false > {
template< typename M , typename A , typename I >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag , M , A & , I ) const );
template< typename M , typename A , typename I >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag const & , M , A & , I ) const );
using type = decltype( deduce( & F::operator() ) );
};
//----------------------------------------
using candidate_type = typename deduce_value_type< Functor >::type ;
enum { candidate_is_void = std::is_same< candidate_type , void >::value
, candidate_is_array = std::rank< candidate_type >::value == 1 };
//----------------------------------------
public:
using value_type = typename std::remove_extent< candidate_type >::type ;
static_assert( ! std::is_const< value_type >::value
, "Kokkos functor operator reduce argument cannot be const" );
private:
// Stub to avoid defining a type 'void &'
using ValueType = typename
std::conditional< candidate_is_void , VOID , value_type >::type ;
public:
using pointer_type = typename
std::conditional< candidate_is_void , void , ValueType * >::type ;
using reference_type = typename
std::conditional< candidate_is_array , ValueType * , typename
std::conditional< ! candidate_is_void , ValueType & , void >
::type >::type ;
private:
template< bool IsArray , class FF >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< IsArray , unsigned >::type
get_length( FF const & f ) { return f.value_count ; }
template< bool IsArray , class FF >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< ! IsArray , unsigned >::type
get_length( FF const & ) { return 1 ; }
public:
enum { StaticValueSize = ! candidate_is_void &&
! candidate_is_array
? sizeof(ValueType) : 0 };
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_count( const Functor & f )
{ return FunctorAnalysis::template get_length< candidate_is_array >(f); }
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_size( const Functor & f )
{ return FunctorAnalysis::template get_length< candidate_is_array >(f) * sizeof(ValueType); }
//----------------------------------------
template< class Unknown >
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_count( const Unknown & )
{ return 1 ; }
template< class Unknown >
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_size( const Unknown & )
{ return sizeof(ValueType); }
private:
enum INTERFACE : int
{ DISABLE = 0
, NO_TAG_NOT_ARRAY = 1
, NO_TAG_IS_ARRAY = 2
, HAS_TAG_NOT_ARRAY = 3
, HAS_TAG_IS_ARRAY = 4
, DEDUCED =
! std::is_same< PatternInterface , REDUCE >::value ? DISABLE : (
std::is_same<Tag,void>::value
? (candidate_is_array ? NO_TAG_IS_ARRAY : NO_TAG_NOT_ARRAY)
: (candidate_is_array ? HAS_TAG_IS_ARRAY : HAS_TAG_NOT_ARRAY) )
};
//----------------------------------------
// parallel_reduce join operator
template< class F , INTERFACE >
struct has_join_function ;
template< class F >
struct has_join_function< F , NO_TAG_NOT_ARRAY >
{
typedef volatile ValueType & vref_type ;
typedef volatile const ValueType & cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( *dst , *src ); }
};
template< class F >
struct has_join_function< F , NO_TAG_IS_ARRAY >
{
typedef volatile ValueType * vref_type ;
typedef volatile const ValueType * cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( dst , src ); }
};
template< class F >
struct has_join_function< F , HAS_TAG_NOT_ARRAY >
{
typedef volatile ValueType & vref_type ;
typedef volatile const ValueType & cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( WTag() , *dst , *src ); }
};
template< class F >
struct has_join_function< F , HAS_TAG_IS_ARRAY >
{
typedef volatile ValueType * vref_type ;
typedef volatile const ValueType * cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( WTag() , dst , src ); }
};
template< class F = Functor
, INTERFACE = DEDUCED
, typename = void >
struct DeduceJoin
{
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{
const int n = FunctorAnalysis::value_count( f );
for ( int i = 0 ; i < n ; ++i ) dst[i] += src[i];
}
};
template< class F >
struct DeduceJoin< F , DISABLE , void >
{
KOKKOS_INLINE_FUNCTION static
void join( F const &
, ValueType volatile *
, ValueType volatile const * ) {}
};
template< class F , INTERFACE I >
struct DeduceJoin< F , I ,
decltype( has_join_function<F,I>::enable_if( & F::join ) ) >
: public has_join_function<F,I> {};
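// (Editor's illustration, not original source.) The detection idiom
// above works because 'decltype( has_join_function<F,I>::enable_if(
// & F::join ) )' is a valid type (void) exactly when F declares a
// 'join' matching one of the enable_if overloads; the partial
// specialization is then viable and forwards to the functor's join.
// Otherwise substitution of '& F::join' fails, SFINAE silently
// discards the specialization, and the primary template's default
// element-wise '+=' join is used instead.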
//----------------------------------------
template< class , INTERFACE >
struct has_init_function ;
template< class F >
struct has_init_function< F , NO_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( *dst ); }
};
template< class F >
struct has_init_function< F , NO_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( dst ); }
};
template< class F >
struct has_init_function< F , HAS_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( WTag(), *dst ); }
};
template< class F >
struct has_init_function< F , HAS_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( WTag(), dst ); }
};
template< class F = Functor
, INTERFACE = DEDUCED
, typename = void >
struct DeduceInit
{
KOKKOS_INLINE_FUNCTION static
void init( F const & , ValueType * dst ) { new(dst) ValueType(); }
};
template< class F >
struct DeduceInit< F , DISABLE , void >
{
KOKKOS_INLINE_FUNCTION static
void init( F const & , ValueType * ) {}
};
template< class F , INTERFACE I >
struct DeduceInit< F , I ,
decltype( has_init_function<F,I>::enable_if( & F::init ) ) >
: public has_init_function<F,I> {};
//----------------------------------------
public:
struct Reducer
{
private:
Functor const & m_functor ;
ValueType * const m_result ;
int const m_length ;
public:
using reducer = Reducer ;
using value_type = FunctorAnalysis::value_type ;
using memory_space = void ;
using reference_type = FunctorAnalysis::reference_type ;
KOKKOS_INLINE_FUNCTION
void join( ValueType volatile * dst
, ValueType volatile const * src ) const noexcept
{ DeduceJoin<>::join( m_functor , dst , src ); }
KOKKOS_INLINE_FUNCTION
void init( ValueType * dst ) const noexcept
{ DeduceInit<>::init( m_functor , dst ); }
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( Functor const & arg_functor
, ValueType * arg_value = 0
, int arg_length = 0 ) noexcept
: m_functor( arg_functor ), m_result(arg_value), m_length(arg_length) {}
KOKKOS_INLINE_FUNCTION
constexpr int length() const noexcept { return m_length ; }
KOKKOS_INLINE_FUNCTION
ValueType & operator[]( int i ) const noexcept
{ return m_result[i]; }
private:
template< bool IsArray >
constexpr
typename std::enable_if< IsArray , ValueType * >::type
ref() const noexcept { return m_result ; }
template< bool IsArray >
constexpr
typename std::enable_if< ! IsArray , ValueType & >::type
ref() const noexcept { return *m_result ; }
public:
KOKKOS_INLINE_FUNCTION
auto result() const noexcept
-> decltype( Reducer::template ref< candidate_is_array >() )
{ return Reducer::template ref< candidate_is_array >(); }
};
//----------------------------------------
private:
template< class , INTERFACE >
struct has_final_function ;
// No tag, not array
template< class F >
struct has_final_function< F , NO_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( *dst ); }
};
// No tag, is array
template< class F >
struct has_final_function< F , NO_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( dst ); }
};
// Has tag, not array
template< class F >
struct has_final_function< F , HAS_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( WTag(), *dst ); }
};
// Has tag, is array
template< class F >
struct has_final_function< F , HAS_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( WTag(), dst ); }
};
template< class F = Functor
, INTERFACE = DEDUCED
, typename = void >
struct DeduceFinal
{
KOKKOS_INLINE_FUNCTION
static void final( F const & , ValueType * ) {}
};
template< class F , INTERFACE I >
struct DeduceFinal< F , I ,
decltype( has_final_function<F,I>::enable_if( & F::final ) ) >
  : public has_final_function<F,I> {};
public:
static void final( Functor const & f , ValueType * result )
{ DeduceFinal<>::final( f , result ); }
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* KOKKOS_FUNCTORANALYSIS_HPP */
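
To make the deduction above concrete, a hedged sketch: a functor that declares no value_type, for which the REDUCE branch of deduce_value_type recovers the reduction type from the second operator() argument. The functor is the editor's illustration, not from the diff.

struct SumSquares {
  // No 'value_type' declared: FunctorAnalysis< REDUCE , Policy ,
  // SumSquares >::value_type is deduced as 'double' from the
  // 'double &' reduction argument below.
  KOKKOS_INLINE_FUNCTION
  void operator()( const int i , double & update ) const
    { update += double(i) * double(i); }

  // If 'join' and/or 'init' were declared here with matching
  // signatures, DeduceJoin / DeduceInit above would detect and call
  // them; as written, the defaults ('+=' join, value-initialization)
  // apply.
};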

View File

@ -62,7 +62,7 @@
#include <memkind.h>
#endif
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
@ -249,7 +249,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::HBWSpace::name()),RecordBase::m_alloc_ptr->m_label,
@ -278,7 +278,7 @@ SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
)
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}

View File

@ -43,7 +43,7 @@
#include <algorithm>
#include <Kokkos_Macros.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
/*--------------------------------------------------------------------------*/
@ -359,7 +359,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
SharedAllocationRecord< Kokkos::HostSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::HostSpace::name()),RecordBase::m_alloc_ptr->m_label,
@ -388,7 +388,7 @@ SharedAllocationRecord( const Kokkos::HostSpace & arg_space
)
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}

View File

@ -0,0 +1,463 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <limits>
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_spinwait.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void HostThreadTeamData::organize_pool
( HostThreadTeamData * members[] , const int size )
{
bool ok = true ;
// Verify not already a member of a pool:
for ( int rank = 0 ; rank < size && ok ; ++rank ) {
ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
}
if ( ok ) {
int64_t * const root_scratch = members[0]->m_scratch ;
for ( int i = m_pool_rendezvous ; i < m_pool_reduce ; ++i ) {
root_scratch[i] = 0 ;
}
{
HostThreadTeamData ** const pool =
(HostThreadTeamData **) (root_scratch + m_pool_members);
// team size == 1, league size == pool_size
for ( int rank = 0 ; rank < size ; ++rank ) {
HostThreadTeamData * const mem = members[ rank ] ;
mem->m_pool_scratch = root_scratch ;
mem->m_team_scratch = mem->m_scratch ;
mem->m_pool_rank = rank ;
mem->m_pool_size = size ;
mem->m_team_base = rank ;
mem->m_team_rank = 0 ;
mem->m_team_size = 1 ;
mem->m_team_alloc = 1 ;
mem->m_league_rank = rank ;
mem->m_league_size = size ;
mem->m_pool_rendezvous_step = 0 ;
mem->m_team_rendezvous_step = 0 ;
pool[ rank ] = mem ;
}
}
Kokkos::memory_fence();
}
else {
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_pool ERROR pool already exists");
}
}
void HostThreadTeamData::disband_pool()
{
m_work_range.first = -1 ;
m_work_range.second = -1 ;
m_pool_scratch = 0 ;
m_team_scratch = 0 ;
m_pool_rank = 0 ;
m_pool_size = 1 ;
m_team_base = 0 ;
m_team_rank = 0 ;
m_team_size = 1 ;
m_team_alloc = 1 ;
m_league_rank = 0 ;
m_league_size = 1 ;
m_pool_rendezvous_step = 0 ;
m_team_rendezvous_step = 0 ;
}
int HostThreadTeamData::organize_team( const int team_size )
{
// Pool is initialized
const bool ok_pool = 0 != m_pool_scratch ;
// Team is not set
const bool ok_team =
m_team_scratch == m_scratch &&
m_team_base == m_pool_rank &&
m_team_rank == 0 &&
m_team_size == 1 &&
m_team_alloc == 1 &&
m_league_rank == m_pool_rank &&
m_league_size == m_pool_size ;
if ( ok_pool && ok_team ) {
if ( team_size <= 0 ) return 0 ; // No teams to organize
if ( team_size == 1 ) return 1 ; // Already organized in teams of one
HostThreadTeamData * const * const pool =
(HostThreadTeamData **) (m_pool_scratch + m_pool_members);
// "league_size" in this context is the number of concurrent teams
// that the pool can accommodate. Excess threads are idle.
const int league_size = m_pool_size / team_size ;
const int team_alloc_size = m_pool_size / league_size ;
const int team_alloc_rank = m_pool_rank % team_alloc_size ;
const int league_rank = m_pool_rank / team_alloc_size ;
const int team_base_rank = league_rank * team_alloc_size ;
m_team_scratch = pool[ team_base_rank ]->m_scratch ;
m_team_base = team_base_rank ;
    // This needs to check overflow: if m_pool_size % team_alloc_size != 0
    // there are two corner cases:
    // (i) if team_alloc_size == team_size there might be a non-full
    //     zombie team around (for example m_pool_size = 5 and team_size = 2);
    // (ii) if team_alloc > team_size then the last team might have fewer
    //     threads than the others.
m_team_rank = ( team_base_rank + team_size <= m_pool_size ) &&
( team_alloc_rank < team_size ) ?
team_alloc_rank : -1;
m_team_size = team_size ;
m_team_alloc = team_alloc_size ;
m_league_rank = league_rank ;
m_league_size = league_size ;
m_team_rendezvous_step = 0 ;
if ( team_base_rank == m_pool_rank ) {
// Initialize team's rendezvous memory
for ( int i = m_team_rendezvous ; i < m_pool_reduce ; ++i ) {
m_scratch[i] = 0 ;
}
// Make sure team's rendezvous memory initialized
// is written before proceeding.
Kokkos::memory_fence();
}
// Organizing threads into a team performs a barrier across the
    // entire pool to ensure proper initialization of the team
// rendezvous mechanism before a team rendezvous can be performed.
if ( pool_rendezvous() ) {
pool_rendezvous_release();
}
}
else {
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_team ERROR");
}
return 0 <= m_team_rank ;
}
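// Worked example of the arithmetic above (editor's illustration):
// with m_pool_size = 5 and team_size = 2, league_size = 2 and
// team_alloc_size = 2. Pool ranks 0,1 form league 0 and ranks 2,3
// form league 1; pool rank 4 computes team_base_rank = 4, and since
// 4 + 2 > 5 its m_team_rank is -1 -- the idle "zombie team" corner
// case noted in the comments above.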
void HostThreadTeamData::disband_team()
{
m_team_scratch = m_scratch ;
m_team_base = m_pool_rank ;
m_team_rank = 0 ;
m_team_size = 1 ;
m_team_alloc = 1 ;
m_league_rank = m_pool_rank ;
m_league_size = m_pool_size ;
m_team_rendezvous_step = 0 ;
}
//----------------------------------------------------------------------------
/* pattern for rendezvous
*
* if ( rendezvous() ) {
* ... all other threads are still in team_rendezvous() ...
* rendezvous_release();
* ... all other threads are released from team_rendezvous() ...
* }
*/
int HostThreadTeamData::rendezvous( int64_t * const buffer
, int & rendezvous_step
, int const size
, int const rank ) noexcept
{
enum : int { shift_byte = 3 };
enum : int { size_byte = ( 01 << shift_byte ) }; // == 8
enum : int { mask_byte = size_byte - 1 };
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
// Cycle step values: 1 <= step <= size_val_cycle
// An odd multiple of memory cycle so that when a memory location
// is reused it has a different value.
// Must be representable within a single byte: size_val_cycle < 16
enum : int { size_val_cycle = 3 * size_mem_cycle };
// Requires:
// Called by rank = [ 0 .. size )
// buffer aligned to int64_t[4]
// A sequence of rendezvous uses four cycled locations in memory
// and non-equal cycled synchronization values to
// 1) prevent rendezvous from overtaking one another and
// 2) give each spin wait location an int64_t[4] span
// so that it has its own cache line.
const int step = ( rendezvous_step % size_val_cycle ) + 1 ;
rendezvous_step = step ;
// The leading int64_t[4] span is for thread 0 to write
// and all other threads to read spin-wait.
// sync_offset is the index into this array for this step.
const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ;
union {
int64_t full ;
int8_t byte[8] ;
} value ;
if ( rank ) {
const int group_begin = rank << shift_byte ; // == rank * size_byte
if ( group_begin < size ) {
// This thread waits for threads
// [ group_begin .. group_begin + 8 )
// [ rank*8 .. rank*8 + 8 )
// to write to their designated bytes.
const int end = group_begin + size_byte < size
? size_byte : size - group_begin ;
value.full = 0 ;
for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step );
store_fence(); // This should not be needed but fixes #742
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value.full );
}
{
// This thread sets its designated byte.
// ( rank % size_byte ) +
// ( ( rank / size_byte ) * size_byte * size_mem_cycle ) +
// ( sync_offset * size_byte )
const int offset = ( rank & mask_byte )
+ ( ( rank & ~mask_byte ) << shift_mem_cycle )
+ ( sync_offset << shift_byte );
// All of this thread's previous memory stores must be complete before
// this thread stores the step value at this thread's designated byte
// in the shared synchronization array.
Kokkos::memory_fence();
((volatile int8_t*) buffer)[ offset ] = int8_t( step );
// Memory fence to push the previous store out
Kokkos::memory_fence();
}
// Wait for thread 0 to release all other threads
spinwait_until_equal( buffer[ step & mask_mem_cycle ] , int64_t(step) );
}
else {
// Thread 0 waits for threads [1..7]
// to write to their designated bytes.
const int end = size_byte < size ? 8 : size ;
value.full = 0 ;
for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step );
spinwait_until_equal( buffer[ sync_offset ], value.full );
}
return rank ? 0 : 1 ;
}
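// Numeric trace of the indexing above (editor's illustration), for
// step = 5: sync_offset = ( 5 & mask_mem_cycle ) + size_mem_cycle
// = 1 + 4 = 5. Rank 1 gathers its group [ 8 .. 16 ) by spinning on
// buffer[ ( 1 << shift_mem_cycle ) + sync_offset ] = buffer[9].
// Rank 9 signals with its designated byte:
//   ( 9 & mask_byte ) + ( ( 9 & ~mask_byte ) << shift_mem_cycle )
//   + ( sync_offset << shift_byte ) = 1 + 32 + 40 = 73 ,
// and byte 73 is byte 1 of int64_t slot 9 -- exactly the word that
// rank 1 spin-waits on.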
void HostThreadTeamData::
rendezvous_release( int64_t * const buffer
, int const rendezvous_step ) noexcept
{
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
// Requires:
// Called after team_rendezvous
// Called only by true == team_rendezvous(root)
// Memory fence to be sure all previous writes are complete:
Kokkos::memory_fence();
((volatile int64_t*) buffer)[ rendezvous_step & mask_mem_cycle ] =
int64_t( rendezvous_step );
// Memory fence to push the store out
Kokkos::memory_fence();
}
//----------------------------------------------------------------------------
int HostThreadTeamData::get_work_stealing() noexcept
{
pair_int_t w( -1 , -1 );
if ( 1 == m_team_size || team_rendezvous() ) {
// Attempt first from beginning of my work range
for ( int attempt = m_work_range.first < m_work_range.second ; attempt ; ) {
// Query and attempt to update m_work_range
// from: [ w.first , w.second )
// to: [ w.first + 1 , w.second ) = w_new
//
      // If w is invalid then this is just a query.
const pair_int_t w_new( w.first + 1 , w.second );
w = Kokkos::atomic_compare_exchange( & m_work_range, w, w_new );
if ( w.first < w.second ) {
// m_work_range is viable
// If steal is successful then don't repeat attempt to steal
attempt = ! ( w_new.first == w.first + 1 &&
w_new.second == w.second );
}
else {
// m_work_range is not viable
w.first = -1 ;
w.second = -1 ;
attempt = 0 ;
}
}
if ( w.first == -1 && m_steal_rank != m_pool_rank ) {
HostThreadTeamData * const * const pool =
(HostThreadTeamData**)( m_pool_scratch + m_pool_members );
      // Attempt from beginning failed, try to steal from end of neighbor.
pair_int_t volatile * steal_range =
& ( pool[ m_steal_rank ]->m_work_range );
for ( int attempt = true ; attempt ; ) {
// Query and attempt to update steal_work_range
// from: [ w.first , w.second )
// to: [ w.first , w.second - 1 ) = w_new
//
        // If w is invalid then this is just a query.
const pair_int_t w_new( w.first , w.second - 1 );
w = Kokkos::atomic_compare_exchange( steal_range, w, w_new );
if ( w.first < w.second ) {
// steal_work_range is viable
// If steal is successful then don't repeat attempt to steal
attempt = ! ( w_new.first == w.first &&
w_new.second == w.second - 1 );
}
else {
// steal_work_range is not viable, move to next member
w.first = -1 ;
w.second = -1 ;
          // We need to figure out whether the next team is active:
          // m_steal_rank + m_team_alloc could be the next base_rank to steal from,
          // but only if another m_team_size threads are available so that
          // base rank has a full team.
m_steal_rank = m_steal_rank + m_team_alloc + m_team_size <= m_pool_size ?
m_steal_rank + m_team_alloc : 0;
steal_range = & ( pool[ m_steal_rank ]->m_work_range );
// If tried all other members then don't repeat attempt to steal
attempt = m_steal_rank != m_pool_rank ;
}
}
if ( w.first != -1 ) w.first = w.second - 1 ;
}
if ( 1 < m_team_size ) {
// Must share the work index
*((int volatile *) team_reduce()) = w.first ;
team_rendezvous_release();
}
}
else if ( 1 < m_team_size ) {
w.first = *((int volatile *) team_reduce());
}
// May exit because successfully stole work and w is good.
// May exit because no work left to steal and w = (-1,-1).
#if 0
fprintf(stdout,"HostThreadTeamData::get_work_stealing() pool(%d of %d) %d\n"
, m_pool_rank , m_pool_size , w.first );
fflush(stdout);
#endif
return w.first ;
}
} // namespace Impl
} // namespace Kokkos
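
The claim/steal protocol in get_work_stealing() reduces to one compare-exchange idiom. Below is a simplified, host-only sketch of the "claim one index from the front" loop, with std::atomic standing in for Kokkos::atomic_compare_exchange; the names are the editor's, not from the diff.

#include <atomic>

struct Range { int first , second ; };

// Claim the front index of [ first , second ), or return -1 if empty.
int claim_front( std::atomic<Range> & work_range )
{
  Range w = work_range.load();
  while ( w.first < w.second ) {
    const Range w_new = { w.first + 1 , w.second };
    // On success 'w' still holds the range we claimed from; on failure
    // it is refreshed with the current value and the loop retries.
    if ( work_range.compare_exchange_strong( w , w_new ) ) return w.first ;
  }
  return -1 ; // range exhausted; the caller may try to steal from a neighbor
}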

File diff suppressed because it is too large

View File

@ -52,6 +52,10 @@ void memory_fence()
{
#if defined( __CUDA_ARCH__ )
__threadfence();
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"mfence" ::: "memory"
);
#elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \
( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) )
__sync_synchronize();

View File

@ -129,8 +129,8 @@
#endif
#ifdef KOKKOS_HAVE_CUDA_RDC
#ifndef KOKKOS_ENABLE_CUDA_RDC
#define KOKKOS_ENABLE_CUDA_RDC KOKKOS_HAVE_CUDA_RDC
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE KOKKOS_HAVE_CUDA_RDC
#endif
#endif
@ -242,9 +242,9 @@
#endif
#endif
#ifdef KOKKOS_HAVE_QTHREAD
#ifndef KOKKOS_ENABLE_QTHREAD
#define KOKKOS_ENABLE_QTHREAD KOKKOS_HAVE_QTHREAD
#ifdef KOKKOS_HAVE_QTHREADS
#ifndef KOKKOS_ENABLE_QTHREADS
#define KOKKOS_ENABLE_QTHREADS KOKKOS_HAVE_QTHREADS
#endif
#endif

View File

@ -43,7 +43,7 @@
#include <impl/Kokkos_Profiling_Interface.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <string.h>
namespace Kokkos {

View File

@ -50,7 +50,7 @@
#include <string>
#include <cinttypes>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_DeviceInfo.hpp>
#include <dlfcn.h>
#include <iostream>
@ -59,7 +59,7 @@
#define KOKKOSP_INTERFACE_VERSION 20150628
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
namespace Kokkos {
namespace Profiling {

View File

@ -0,0 +1,317 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_REDUCER_HPP
#define KOKKOS_IMPL_REDUCER_HPP
#include <impl/Kokkos_Traits.hpp>
//----------------------------------------------------------------------------
/* Reducer abstraction:
* 1) Provides 'join' operation
* 2) Provides 'init' operation
* 3) Provides 'copy' operation
* 4) Optionally provides result value in a memory space
*
* Created from:
* 1) Functor::operator()( destination , source )
 * 2) Functor::{ join , init }
*/
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< typename value_type >
struct ReduceSum
{
KOKKOS_INLINE_FUNCTION static
void copy( value_type & dest
, value_type const & src ) noexcept
{ dest = src ; }
KOKKOS_INLINE_FUNCTION static
void init( value_type & dest ) noexcept
{ new( &dest ) value_type(); }
KOKKOS_INLINE_FUNCTION static
void join( value_type volatile & dest
, value_type const volatile & src ) noexcept
{ dest += src ; }
KOKKOS_INLINE_FUNCTION static
void join( value_type & dest
, value_type const & src ) noexcept
{ dest += src ; }
};
template< typename T
, class ReduceOp = ReduceSum< T >
, typename MemorySpace = void >
struct Reducer
: private ReduceOp
, private integral_nonzero_constant
< int , ( std::rank<T>::value == 1 ? std::extent<T>::value : 1 )>
{
private:
// Determine if T is simple array
enum : int { rank = std::rank<T>::value };
static_assert( rank <= 1 , "Kokkos::Impl::Reducer type is at most rank-one" );
using length_t =
integral_nonzero_constant<int,( rank == 1 ? std::extent<T>::value : 1 )> ;
public:
using reducer = Reducer ;
using memory_space = MemorySpace ;
using value_type = typename std::remove_extent<T>::type ;
using reference_type =
typename std::conditional< ( rank != 0 )
, value_type *
, value_type &
>::type ;
private:
//--------------------------------------------------------------------------
// Determine what functions 'ReduceOp' provides:
// copy( destination , source )
// init( destination )
//
// operator()( destination , source )
// join( destination , source )
//
// Provide defaults for missing optional operations
template< class R , typename = void>
struct COPY {
KOKKOS_INLINE_FUNCTION static
void copy( R const &
, value_type * dst
, value_type const * src ) { *dst = *src ; }
};
template< class R >
struct COPY< R , decltype( ((R*)0)->copy( *((value_type*)0)
, *((value_type const *)0) ) ) >
{
KOKKOS_INLINE_FUNCTION static
void copy( R const & r
, value_type * dst
, value_type const * src ) { r.copy( *dst , *src ); }
};
template< class R , typename = void >
struct INIT {
KOKKOS_INLINE_FUNCTION static
void init( R const & , value_type * dst ) { new(dst) value_type(); }
};
template< class R >
struct INIT< R , decltype( ((R*)0)->init( *((value_type*)0 ) ) ) >
{
KOKKOS_INLINE_FUNCTION static
void init( R const & r , value_type * dst ) { r.init( *dst ); }
};
template< class R , typename V , typename = void > struct JOIN
{
// If no join function then try operator()
KOKKOS_INLINE_FUNCTION static
void join( R const & r , V * dst , V const * src )
{ r.operator()(*dst,*src); }
};
template< class R , typename V >
struct JOIN< R , V , decltype( ((R*)0)->join ( *((V *)0) , *((V const *)0) ) ) >
{
// If has join function use it
KOKKOS_INLINE_FUNCTION static
void join( R const & r , V * dst , V const * src )
{ r.join(*dst,*src); }
};
//--------------------------------------------------------------------------
value_type * const m_result ;
template< int Rank >
KOKKOS_INLINE_FUNCTION
static constexpr
typename std::enable_if< ( 0 != Rank ) , reference_type >::type
ref( value_type * p ) noexcept { return p ; }
template< int Rank >
KOKKOS_INLINE_FUNCTION
static constexpr
typename std::enable_if< ( 0 == Rank ) , reference_type >::type
ref( value_type * p ) noexcept { return *p ; }
public:
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
constexpr int length() const noexcept
{ return length_t::value ; }
KOKKOS_INLINE_FUNCTION
value_type * data() const noexcept
{ return m_result ; }
KOKKOS_INLINE_FUNCTION
reference_type reference() const noexcept
{ return Reducer::template ref< rank >( m_result ); }
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
void copy( value_type * const dest
, value_type const * const src ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template COPY<ReduceOp>::copy( (ReduceOp &) *this , dest + i , src + i );
}
}
KOKKOS_INLINE_FUNCTION
void init( value_type * dest ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template INIT<ReduceOp>::init( (ReduceOp &) *this , dest + i );
}
}
KOKKOS_INLINE_FUNCTION
void join( value_type * const dest
, value_type const * const src ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template JOIN<ReduceOp,value_type>::join( (ReduceOp &) *this , dest + i , src + i );
}
}
KOKKOS_INLINE_FUNCTION
void join( value_type volatile * const dest
, value_type volatile const * const src ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template JOIN<ReduceOp,value_type volatile>::join( (ReduceOp &) *this , dest + i , src + i );
}
}
//--------------------------------------------------------------------------
template< typename ArgT >
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer
( ArgT * arg_value
, typename std::enable_if
< std::is_same<ArgT,value_type>::value &&
std::is_default_constructible< ReduceOp >::value
, int >::type arg_length = 1
) noexcept
: ReduceOp(), length_t( arg_length ), m_result( arg_value ) {}
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( ReduceOp const & arg_op
, value_type * arg_value = 0
, int arg_length = 1 ) noexcept
: ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( ReduceOp && arg_op
, value_type * arg_value = 0
, int arg_length = 1 ) noexcept
: ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
Reducer( Reducer const & ) = default ;
Reducer( Reducer && ) = default ;
Reducer & operator = ( Reducer const & ) = default ;
Reducer & operator = ( Reducer && ) = default ;
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< typename ValueType >
constexpr
Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >
Sum( ValueType & arg_value )
{
static_assert( std::is_trivial<ValueType>::value
, "Kokkos reducer requires trivial value type" );
return Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >( & arg_value );
}
template< typename ValueType >
constexpr
Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >
Sum( ValueType * arg_value , int arg_length )
{
static_assert( std::is_trivial<ValueType>::value
, "Kokkos reducer requires trivial value type" );
return Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >( arg_value , arg_length );
}
//----------------------------------------------------------------------------
template< typename ValueType , class JoinType >
Impl::Reducer< ValueType , JoinType >
reducer( ValueType & value , JoinType const & lambda )
{
return Impl::Reducer< ValueType , JoinType >( lambda , & value );
}
} // namespace Kokkos
#endif /* #ifndef KOKKOS_IMPL_REDUCER_HPP */
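
A hedged, host-only usage sketch of the machinery above; ReduceMax is the editor's example (not in the diff), showing the optional hooks the JOIN/INIT detection picks up while COPY falls back to plain assignment.

// A custom ReduceOp: INIT detects 'init', JOIN detects 'join'.
struct ReduceMax {
  void init( double & dest ) const { dest = -1.0e300 ; }
  void join( double & dest , double const & src ) const
    { if ( dest < src ) dest = src ; }
};

// Usage through the 'reducer' factory defined above:
//   double result ;
//   auto r = Kokkos::reducer( result , ReduceMax() );
//   r.init( r.data() );          // result = -1.0e300
//   double x = 42.0 ;
//   r.join( r.data() , & x );    // result = 42.0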

View File

@ -53,63 +53,126 @@
namespace Kokkos {
namespace Impl {
namespace SerialImpl {
namespace {
Sentinel::Sentinel() : m_scratch(0), m_reduce_end(0), m_shared_end(0) {}
HostThreadTeamData g_serial_thread_team_data ;
Sentinel::~Sentinel()
{
if ( m_scratch ) { free( m_scratch ); }
m_scratch = 0 ;
m_reduce_end = 0 ;
m_shared_end = 0 ;
}
Sentinel & Sentinel::singleton()
// Resize thread team data scratch memory
void serial_resize_thread_team_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes )
{
static Sentinel s ; return s ;
}
if ( pool_reduce_bytes < 512 ) pool_reduce_bytes = 512 ;
if ( team_reduce_bytes < 512 ) team_reduce_bytes = 512 ;
inline
unsigned align( unsigned n )
{
enum { ALIGN = 0x0100 /* 256 */ , MASK = ALIGN - 1 };
return ( n + MASK ) & ~MASK ;
}
const size_t old_pool_reduce = g_serial_thread_team_data.pool_reduce_bytes();
const size_t old_team_reduce = g_serial_thread_team_data.team_reduce_bytes();
const size_t old_team_shared = g_serial_thread_team_data.team_shared_bytes();
const size_t old_thread_local = g_serial_thread_team_data.thread_local_bytes();
const size_t old_alloc_bytes = g_serial_thread_team_data.scratch_bytes();
} // namespace
// Allocate if any of the old allocations is too small:
SerialTeamMember::SerialTeamMember( int arg_league_rank
, int arg_league_size
, int arg_shared_size
)
: m_space( ((char *) SerialImpl::Sentinel::singleton().m_scratch) + SerialImpl::Sentinel::singleton().m_reduce_end
, arg_shared_size )
, m_league_rank( arg_league_rank )
, m_league_size( arg_league_size )
{}
const bool allocate = ( old_pool_reduce < pool_reduce_bytes ) ||
( old_team_reduce < team_reduce_bytes ) ||
( old_team_shared < team_shared_bytes ) ||
( old_thread_local < thread_local_bytes );
} // namespace Impl
if ( allocate ) {
void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
{
static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton();
Kokkos::HostSpace space ;
reduce_size = Impl::SerialImpl::align( reduce_size );
shared_size = Impl::SerialImpl::align( shared_size );
if ( old_alloc_bytes ) {
g_serial_thread_team_data.disband_team();
g_serial_thread_team_data.disband_pool();
if ( ( s.m_reduce_end < reduce_size ) ||
( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
if ( s.m_scratch ) { free( s.m_scratch ); }
if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
s.m_scratch = malloc( s.m_shared_end );
space.deallocate( g_serial_thread_team_data.scratch_buffer()
, g_serial_thread_team_data.scratch_bytes() );
}
return s.m_scratch ;
if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
const size_t alloc_bytes =
HostThreadTeamData::scratch_size( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
void * const ptr = space.allocate( alloc_bytes );
g_serial_thread_team_data.
scratch_assign( ((char *)ptr)
, alloc_bytes
, pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
HostThreadTeamData * pool[1] = { & g_serial_thread_team_data };
g_serial_thread_team_data.organize_pool( pool , 1 );
g_serial_thread_team_data.organize_team(1);
}
}
// Get the thread team data structure for the serial execution space.
HostThreadTeamData * serial_get_thread_team_data()
{
return & g_serial_thread_team_data ;
}
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
int Serial::is_initialized()
{
return 1 ;
}
void Serial::initialize( unsigned threads_count
, unsigned use_numa_count
, unsigned use_cores_per_numa
, bool allow_asynchronous_threadpool )
{
(void) threads_count;
(void) use_numa_count;
(void) use_cores_per_numa;
(void) allow_asynchronous_threadpool;
// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
void Serial::finalize()
{
if ( Impl::g_serial_thread_team_data.scratch_buffer() ) {
Impl::g_serial_thread_team_data.disband_team();
Impl::g_serial_thread_team_data.disband_pool();
Kokkos::HostSpace space ;
space.deallocate( Impl::g_serial_thread_team_data.scratch_buffer()
, Impl::g_serial_thread_team_data.scratch_bytes() );
Impl::g_serial_thread_team_data.scratch_assign( (void*) 0, 0, 0, 0, 0, 0 );
}
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
} // namespace Kokkos

View File

@ -62,11 +62,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member exec ;
Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
Member exec( *data );
// Loop until all queues are empty
while ( 0 < queue->m_ready_count ) {
@ -75,13 +77,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
}
}
if ( end != task ) {
// pop_task resulted in lock == task->m_next
// pop_ready_task resulted in lock == task->m_next
// In the executing state
(*task->m_apply)( task , & exec );
@ -113,11 +115,13 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member exec ;
Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
Member exec( *data );
// Loop until no runnable task
@ -129,7 +133,7 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
}
}

View File

@ -65,6 +65,7 @@ public:
using memory_space = Kokkos::HostSpace ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
static
void iff_single_thread_recursive_execute( queue_type * const );
@ -72,237 +73,19 @@ public:
static
void execute( queue_type * const );
template< typename FunctorType >
template< typename TaskType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::Serial
, typename FunctorType::value_type
, FunctorType
> ;
*ptr = TaskType::apply ;
}
typename TaskType::function_type
get_function_pointer() { return TaskType::apply ; }
};
extern template class TaskQueue< Kokkos::Serial > ;
//----------------------------------------------------------------------------
template<>
class TaskExec< Kokkos::Serial >
{
public:
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
};
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
{
typedef iType index_type;
const iType start ;
const iType end ;
enum {increment = 1};
//const TaskExec< Kokkos::Serial > & thread;
TaskExec< Kokkos::Serial > & thread;
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct
//( const TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
: start(0)
, end(arg_count)
, thread(arg_thread)
{}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct
//( const TaskExec< Kokkos::Serial > & arg_thread
( TaskExec< Kokkos::Serial > & arg_thread
, const iType& arg_start
, const iType & arg_end
)
: start( arg_start )
, end( arg_end)
, thread( arg_thread )
{}
};
//----------------------------------------------------------------------------
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
{
typedef iType index_type;
const iType start ;
const iType end ;
enum {increment = 1};
TaskExec< Kokkos::Serial > & thread;
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct
( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
: start( 0 )
, end(arg_count)
, thread(arg_thread)
{}
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
// OMP version needs non-const TaskExec
template< typename iType >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >
TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >( thread, count );
}
// OMP version needs non-const TaskExec
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::TaskExec< Kokkos::Serial > >
TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType1 & start, const iType2 & end )
{
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >(
thread, iType(start), iType(end) );
}
// OMP version needs non-const TaskExec
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >
ThreadVectorRange
( Impl::TaskExec< Kokkos::Serial > & thread
, const iType & count )
{
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >(thread,count);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
 * The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
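// Illustrative sketch (hypothetical functor, not from the original source):
// a task whose operator() distributes work with the TeamThreadRange
// parallel_for defined above. On the Serial backend the "team" is a single
// thread, so the loop simply visits i = 0..n-1 in order.
struct ExampleVectorScale {
  using value_type = void ;
  double * x ; int n ;

  KOKKOS_INLINE_FUNCTION
  void operator()( Impl::TaskExec< Kokkos::Serial > & member )
    {
      parallel_for( TeamThreadRange( member, n )
                  , [&]( const int i ) { x[i] *= 2.0 ; } );
    }
};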
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i, result);
initialized_result = result;
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i, result);
initialized_result = result;
}
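// Illustrative sketch (hypothetical helper): the join-based overload above
// used for a max-reduction instead of the default sum. In this serial
// specialization the join is never invoked, since the single thread
// accumulates directly into 'result', but the call pattern matches the API.
template< typename Member > // e.g. Impl::TaskExec< Kokkos::Serial >
KOKKOS_INLINE_FUNCTION
int example_team_max( Member & member, const int * data, const int n )
{
  int result = 0 ;
  parallel_reduce( TeamThreadRange( member, n )
                 , [&]( const int i, int & update )
                     { if ( data[i] > update ) update = data[i] ; }
                 , []( int & dst, const int & src )
                     { if ( src > dst ) dst = src ; }
                 , result );
  return result ;
}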
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
initialized_result = ValueType();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
initialized_result+=tmp;
}
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
ValueType result = initialized_result;
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
initialized_result = result;
}
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
}
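// Illustrative sketch (hypothetical helper) of the scan protocol above: the
// lambda runs once per index with final == false to accumulate its local
// contribution, then once with final == true carrying the exclusive prefix.
// ValueType cannot be deduced from the arguments, so it is given explicitly.
template< typename Member > // e.g. Impl::TaskExec< Kokkos::Serial >
KOKKOS_INLINE_FUNCTION
void example_exclusive_scan( Member & member
                           , const int * values, int * prefix, const int n )
{
  parallel_scan< int >( TeamThreadRange( member, n )
    , [&]( const int i, int & partial, const bool final )
        {
          if ( final ) { prefix[i] = partial ; } // sum of values[0..i-1]
          partial += values[i] ;
        } );
}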
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda)
{
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_SERIAL_TASK_HPP */

@ -1,693 +0,0 @@
/*
Copyright (c) 2014, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef KOKKOS_SYNCHRONIC_HPP
#define KOKKOS_SYNCHRONIC_HPP
#include <impl/Kokkos_Synchronic_Config.hpp>
#include <atomic>
#include <chrono>
#include <thread>
#include <functional>
#include <algorithm>
namespace Kokkos {
namespace Impl {
enum notify_hint {
notify_all,
notify_one,
notify_none
};
enum expect_hint {
expect_urgent,
expect_delay
};
namespace Details {
template <class S, class T>
bool __synchronic_spin_wait_for_update(S const& arg, T const& nval, int attempts) noexcept {
int i = 0;
for(;i < __SYNCHRONIC_SPIN_RELAX(attempts); ++i)
if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
return true;
else
__synchronic_relax();
for(;i < attempts; ++i)
if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
return true;
else
__synchronic_yield();
return false;
}
struct __exponential_backoff {
__exponential_backoff(int arg_maximum=512) : maximum(arg_maximum), microseconds(8), x(123456789), y(362436069), z(521288629) {
}
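// Sleep granularity heuristic: waits longer than 75us use a real sleep,
// waits longer than 25us yield the thread, anything shorter just relaxes.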
static inline void sleep_for(std::chrono::microseconds const& time) {
auto t = time.count();
if(__builtin_expect(t > 75,0)) {
portable_sleep(time);
}
else if(__builtin_expect(t > 25,0))
__synchronic_yield();
else
__synchronic_relax();
}
void sleep_for_step() {
sleep_for(step());
}
std::chrono::microseconds step() {
float const f = ranfu();
int const t = int(microseconds * f);
if(__builtin_expect(f >= 0.95f,0))
microseconds = 8;
else
microseconds = (std::min)(microseconds>>1,maximum);
return std::chrono::microseconds(t);
}
private :
int maximum, microseconds, x, y, z;
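// Marsaglia-style xorshift generator: cheap pseudo-randomness used only
// to jitter the backoff interval; no statistical quality is required.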
int xorshf96() {
int t;
x ^= x << 16; x ^= x >> 5; x ^= x << 1;
t = x; x = y; y = z; z = t ^ x ^ y;
return z;
}
float ranfu() {
return (float)(xorshf96()&(~0UL>>1)) / (float)(~0UL>>1);
}
};
template <class T, class Enable = void>
struct __synchronic_base {
protected:
std::atomic<T> atom;
void notify(notify_hint = notify_all) noexcept {
}
void notify(notify_hint = notify_all) volatile noexcept {
}
public :
__synchronic_base() noexcept = default;
constexpr __synchronic_base(T v) noexcept : atom(v) { }
__synchronic_base(const __synchronic_base&) = delete;
~__synchronic_base() { }
__synchronic_base& operator=(const __synchronic_base&) = delete;
__synchronic_base& operator=(const __synchronic_base&) volatile = delete;
void expect_update(T val, expect_hint = expect_urgent) const noexcept {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
while(atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
}
}
void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
while(atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
remains = then - std::chrono::high_resolution_clock::now();
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
remains = then - std::chrono::high_resolution_clock::now();
}
}
};
#ifdef __SYNCHRONIC_COMPATIBLE
template <class T>
struct __synchronic_base<T, typename std::enable_if<__SYNCHRONIC_COMPATIBLE(T)>::type> {
public:
std::atomic<T> atom;
void notify(notify_hint hint = notify_all) noexcept {
if(__builtin_expect(hint == notify_none,1))
return;
auto const x = count.fetch_add(0,std::memory_order_acq_rel);
if(__builtin_expect(x,0)) {
if(__builtin_expect(hint == notify_all,1))
__synchronic_wake_all(&atom);
else
__synchronic_wake_one(&atom);
}
}
void notify(notify_hint hint = notify_all) volatile noexcept {
if(__builtin_expect(hint == notify_none,1))
return;
auto const x = count.fetch_add(0,std::memory_order_acq_rel);
if(__builtin_expect(x,0)) {
if(__builtin_expect(hint == notify_all,1))
__synchronic_wake_all_volatile(&atom);
else
__synchronic_wake_one_volatile(&atom);
}
}
public :
__synchronic_base() noexcept : count(0) { }
constexpr __synchronic_base(T v) noexcept : atom(v), count(0) { }
__synchronic_base(const __synchronic_base&) = delete;
~__synchronic_base() { }
__synchronic_base& operator=(const __synchronic_base&) = delete;
__synchronic_base& operator=(const __synchronic_base&) volatile = delete;
void expect_update(T val, expect_hint = expect_urgent) const noexcept {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait(&atom,val);
count.fetch_add(-1,std::memory_order_acquire);
}
}
void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait_volatile(&atom,val);
count.fetch_add(-1,std::memory_order_acquire);
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait_timed(&atom,val,remains);
count.fetch_add(-1,std::memory_order_acquire);
remains = then - std::chrono::high_resolution_clock::now();
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait_timed_volatile(&atom,val,remains);
count.fetch_add(-1,std::memory_order_acquire);
remains = then - std::chrono::high_resolution_clock::now();
}
}
private:
mutable std::atomic<int> count;
};
#endif
template <class T, class Enable = void>
struct __synchronic : public __synchronic_base<T> {
__synchronic() noexcept = default;
constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
__synchronic(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) volatile = delete;
};
template <class T>
struct __synchronic<T,typename std::enable_if<std::is_integral<T>::value>::type> : public __synchronic_base<T> {
T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_and(v,m);
this->notify(n);
return t;
}
T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_and(v,m);
this->notify(n);
return t;
}
T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_or(v,m);
this->notify(n);
return t;
}
T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_or(v,m);
this->notify(n);
return t;
}
T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_xor(v,m);
this->notify(n);
return t;
}
T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_xor(v,m);
this->notify(n);
return t;
}
__synchronic() noexcept = default;
constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
__synchronic(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) volatile = delete;
T operator=(T v) volatile noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T operator=(T v) noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T operator++(int) volatile noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T operator++(int) noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T operator--(int) volatile noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T operator--(int) noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T operator++() volatile noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T operator++() noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T operator--() volatile noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T operator--() noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T operator+=(T v) volatile noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T operator+=(T v) noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T operator-=(T v) volatile noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
T operator-=(T v) noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
T operator&=(T v) volatile noexcept {
auto const t = this->atom &= v;
this->notify();
return t;
}
T operator&=(T v) noexcept {
auto const t = this->atom &= v;
this->notify();
return t;
}
T operator|=(T v) volatile noexcept {
auto const t = this->atom |= v;
this->notify();
return t;
}
T operator|=(T v) noexcept {
auto const t = this->atom |= v;
this->notify();
return t;
}
T operator^=(T v) volatile noexcept {
auto const t = this->atom ^= v;
this->notify();
return t;
}
T operator^=(T v) noexcept {
auto const t = this->atom ^= v;
this->notify();
return t;
}
};
template <class T>
struct __synchronic<T*> : public __synchronic_base<T*> {
T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
__synchronic() noexcept = default;
constexpr __synchronic(T* v) noexcept : __synchronic_base<T*>(v) { }
__synchronic(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) volatile = delete;
T* operator=(T* v) volatile noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T* operator=(T* v) noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T* operator++(int) volatile noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T* operator++(int) noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T* operator--(int) volatile noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T* operator--(int) noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T* operator++() volatile noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T* operator++() noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T* operator--() volatile noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T* operator--() noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T* operator+=(ptrdiff_t v) volatile noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T* operator+=(ptrdiff_t v) noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T* operator-=(ptrdiff_t v) volatile noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
T* operator-=(ptrdiff_t v) noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
};
} //namespace Details
template <class T>
struct synchronic : public Details::__synchronic<T> {
bool is_lock_free() const volatile noexcept { return this->atom.is_lock_free(); }
bool is_lock_free() const noexcept { return this->atom.is_lock_free(); }
void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
this->atom.store(v,m);
this->notify(n);
}
void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
this->atom.store(v,m);
this->notify(n);
}
T load(std::memory_order m = std::memory_order_seq_cst) const volatile noexcept { return this->atom.load(m); }
T load(std::memory_order m = std::memory_order_seq_cst) const noexcept { return this->atom.load(m); }
operator T() const volatile noexcept { return (T)this->atom; }
operator T() const noexcept { return (T)this->atom; }
T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.exchange(v,m);
this->notify(n);
return t;
}
T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.exchange(v,m);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m1,m2);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m1, m2);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m);
this->notify(n);
return t;
}
synchronic() noexcept = default;
constexpr synchronic(T val) noexcept : Details::__synchronic<T>(val) { }
synchronic(const synchronic&) = delete;
~synchronic() { }
synchronic& operator=(const synchronic&) = delete;
synchronic& operator=(const synchronic&) volatile = delete;
T operator=(T val) noexcept {
return Details::__synchronic<T>::operator=(val);
}
T operator=(T val) volatile noexcept {
return Details::__synchronic<T>::operator=(val);
}
T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
Details::__synchronic<T>::expect_update(val,h);
return load(order);
}
T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
Details::__synchronic<T>::expect_update(val,h);
return load(order);
}
T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
Details::__synchronic<T>::expect_update(nval,h);
return load(order);
}
T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
expect_update(nval,h);
return load(order);
}
template <class Rep, class Period>
void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const {
Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
}
template < class Rep, class Period>
void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const volatile {
Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
}
};
#include <inttypes.h>
typedef synchronic<char> synchronic_char;
typedef synchronic<signed char> synchronic_schar;
typedef synchronic<unsigned char> synchronic_uchar;
typedef synchronic<short> synchronic_short;
typedef synchronic<unsigned short> synchronic_ushort;
typedef synchronic<int> synchronic_int;
typedef synchronic<unsigned int> synchronic_uint;
typedef synchronic<long> synchronic_long;
typedef synchronic<unsigned long> synchronic_ulong;
typedef synchronic<long long> synchronic_llong;
typedef synchronic<unsigned long long> synchronic_ullong;
//typedef synchronic<char16_t> synchronic_char16_t;
//typedef synchronic<char32_t> synchronic_char32_t;
typedef synchronic<wchar_t> synchronic_wchar_t;
typedef synchronic<int_least8_t> synchronic_int_least8_t;
typedef synchronic<uint_least8_t> synchronic_uint_least8_t;
typedef synchronic<int_least16_t> synchronic_int_least16_t;
typedef synchronic<uint_least16_t> synchronic_uint_least16_t;
typedef synchronic<int_least32_t> synchronic_int_least32_t;
typedef synchronic<uint_least32_t> synchronic_uint_least32_t;
//typedef synchronic<int_least64_t> synchronic_int_least64_t;
typedef synchronic<uint_least64_t> synchronic_uint_least64_t;
typedef synchronic<int_fast8_t> synchronic_int_fast8_t;
typedef synchronic<uint_fast8_t> synchronic_uint_fast8_t;
typedef synchronic<int_fast16_t> synchronic_int_fast16_t;
typedef synchronic<uint_fast16_t> synchronic_uint_fast16_t;
typedef synchronic<int_fast32_t> synchronic_int_fast32_t;
typedef synchronic<uint_fast32_t> synchronic_uint_fast32_t;
typedef synchronic<int_fast64_t> synchronic_int_fast64_t;
typedef synchronic<uint_fast64_t> synchronic_uint_fast64_t;
typedef synchronic<intptr_t> synchronic_intptr_t;
typedef synchronic<uintptr_t> synchronic_uintptr_t;
typedef synchronic<size_t> synchronic_size_t;
typedef synchronic<ptrdiff_t> synchronic_ptrdiff_t;
typedef synchronic<intmax_t> synchronic_intmax_t;
typedef synchronic<uintmax_t> synchronic_uintmax_t;
}
}
#endif // KOKKOS_SYNCHRONIC_HPP

@ -1,169 +0,0 @@
/*
Copyright (c) 2014, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef KOKKOS_SYNCHRONIC_CONFIG_H
#define KOKKOS_SYNCHRONIC_CONFIG_H
#include <thread>
#include <chrono>
namespace Kokkos {
namespace Impl {
//the default yield function used inside the implementation is the Standard one
#define __synchronic_yield std::this_thread::yield
#define __synchronic_relax __synchronic_yield
#if defined(_MSC_VER)
//this is a handy GCC optimization that I use inside the implementation
#define __builtin_expect(condition,common) condition
#if _MSC_VER <= 1800
//using certain keywords that VC++ temporarily doesn't support
#define _ALLOW_KEYWORD_MACROS
#define noexcept
#define constexpr
#endif
//yes, I define multiple assignment operators
#pragma warning(disable:4522)
//I don't understand how Windows is so bad at timing functions, but it is
//OK with straight-up yield loops
#define __do_backoff(b) __synchronic_yield()
#else
#define __do_backoff(b) b.sleep_for_step()
#endif
//certain platforms have efficient support for spin-waiting built into the operating system
#if defined(__linux__) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0602)
#if defined(_WIN32_WINNT)
#include <winsock2.h>
#include <Windows.h>
//the combination of WaitOnAddress and WakeByAddressAll is supported on Windows 8.1+
#define __synchronic_wait(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
#define __synchronic_wait_timed(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
#define __synchronic_wake_one(x) WakeByAddressSingle((PVOID)x)
#define __synchronic_wake_all(x) WakeByAddressAll((PVOID)x)
#define __synchronic_wait_volatile(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
#define __synchronic_wait_timed_volatile(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
#define __synchronic_wake_one_volatile(x) WakeByAddressSingle((PVOID)x)
#define __synchronic_wake_all_volatile(x) WakeByAddressAll((PVOID)x)
#define __SYNCHRONIC_COMPATIBLE(x) (std::is_pod<x>::value && (sizeof(x) <= 8))
inline void native_sleep(unsigned long microseconds)
{
// What to do if microseconds is < 1000?
Sleep(microseconds / 1000);
}
inline void native_yield()
{
SwitchToThread();
}
#elif defined(__linux__)
#include <chrono>
#include <time.h>
#include <unistd.h>
#include <pthread.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <climits>
#include <cassert>
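// Convert a chrono duration to a timespec for the futex timeout. The assert
// below documents that only sub-second waits are expected on this path:
// tv_sec must be zero and the whole delta is expressed in nanoseconds.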
template < class Rep, class Period>
inline timespec to_timespec(std::chrono::duration<Rep,Period> const& delta) {
struct timespec ts;
ts.tv_sec = static_cast<long>(std::chrono::duration_cast<std::chrono::seconds>(delta).count());
assert(!ts.tv_sec);
ts.tv_nsec = static_cast<long>(std::chrono::duration_cast<std::chrono::nanoseconds>(delta).count());
return ts;
}
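// Thin wrappers over the raw futex system call, for which glibc provides
// no wrapper; the timed variant forwards the timeout to FUTEX_WAIT.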
inline long futex(void const* addr1, int op, int val1) {
return syscall(SYS_futex, addr1, op, val1, 0, 0, 0);
}
inline long futex(void const* addr1, int op, int val1, struct timespec timeout) {
return syscall(SYS_futex, addr1, op, val1, &timeout, 0, 0);
}
inline void native_sleep(unsigned long microseconds)
{
usleep(microseconds);
}
inline void native_yield()
{
pthread_yield();
}
//the combination of SYS_futex(WAIT) and SYS_futex(WAKE) is supported on all recent Linux distributions
#define __synchronic_wait(x,v) futex(x, FUTEX_WAIT_PRIVATE, v)
#define __synchronic_wait_timed(x,v,t) futex(x, FUTEX_WAIT_PRIVATE, v, to_timespec(t))
#define __synchronic_wake_one(x) futex(x, FUTEX_WAKE_PRIVATE, 1)
#define __synchronic_wake_all(x) futex(x, FUTEX_WAKE_PRIVATE, INT_MAX)
#define __synchronic_wait_volatile(x,v) futex(x, FUTEX_WAIT, v)
#define __synchronic_wait_volatile_timed(x,v,t) futex(x, FUTEX_WAIT, v, to_timespec(t))
#define __synchronic_wake_one_volatile(x) futex(x, FUTEX_WAKE, 1)
#define __synchronic_wake_all_volatile(x) futex(x, FUTEX_WAKE, INT_MAX)
#define __SYNCHRONIC_COMPATIBLE(x) (std::is_integral<x>::value && (sizeof(x) <= 4))
//the yield function on Linux is better replaced by sched_yield, which is tuned for spin-waiting
#undef __synchronic_yield
#define __synchronic_yield sched_yield
//for extremely short wait times, just let another hyper-thread run
#undef __synchronic_relax
#define __synchronic_relax() asm volatile("rep; nop" ::: "memory")
#endif
#endif
#ifdef _GLIBCXX_USE_NANOSLEEP
inline void portable_sleep(std::chrono::microseconds const& time)
{ std::this_thread::sleep_for(time); }
#else
inline void portable_sleep(std::chrono::microseconds const& time)
{ native_sleep(time.count()); }
#endif
#ifdef _GLIBCXX_USE_SCHED_YIELD
inline void portable_yield()
{ std::this_thread::yield(); }
#else
inline void portable_yield()
{ native_yield(); }
#endif
//this is the number of times we initially spin, on the first wait attempt
#define __SYNCHRONIC_SPIN_COUNT_A 16
//this is how we decide to yield instead of just spinning, 'c' is the current trip count
//#define __SYNCHRONIC_SPIN_YIELD(c) true
#define __SYNCHRONIC_SPIN_RELAX(c) (c>>3)
//this is the number of times we normally spin, on every subsequent wait attempt
#define __SYNCHRONIC_SPIN_COUNT_B 8
}
}
#endif // KOKKOS_SYNCHRONIC_CONFIG_H

@ -1,162 +0,0 @@
/*
Copyright (c) 2014, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef KOKKOS_SYNCHRONIC_N3998_HPP
#define KOKKOS_SYNCHRONIC_N3998_HPP
#include <impl/Kokkos_Synchronic.hpp>
#include <functional>
/*
In the section below, a synchronization point represents a point at which a
thread may block until a given synchronization condition has been reached or
at which it may notify other threads that a synchronization condition has
been achieved.
*/
namespace Kokkos { namespace Impl {
/*
A latch maintains an internal counter that is initialized when the latch
is created. The synchronization condition is reached when the counter is
decremented to 0. Threads may block at a synchronization point waiting
for the condition to be reached. When the condition is reached, any such
blocked threads will be released.
*/
struct latch {
latch(int val) : count(val), released(false) { }
latch(const latch&) = delete;
latch& operator=(const latch&) = delete;
~latch( ) { }
void arrive( ) {
__arrive( );
}
void arrive_and_wait( ) {
if(!__arrive( ))
wait( );
}
void wait( ) {
while(!released.load_when_not_equal(false,std::memory_order_acquire))
;
}
bool try_wait( ) {
return released.load(std::memory_order_acquire);
}
private:
bool __arrive( ) {
if(count.fetch_add(-1,std::memory_order_release)!=1)
return false;
released.store(true,std::memory_order_release);
return true;
}
std::atomic<int> count;
synchronic<bool> released;
};
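// A hedged usage sketch (hypothetical helper, not part of the original
// interface): two workers arrive, the caller blocks until both have.
inline void example_latch_usage()
{
  latch ready(2);
  std::thread w1( [&]{ /* produce */ ready.arrive(); } );
  std::thread w2( [&]{ /* produce */ ready.arrive(); } );
  ready.wait();            // released once the internal count reaches zero
  w1.join(); w2.join();
}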
/*
A barrier is created with an initial value representing the number of threads
that can arrive at the synchronization point. When that many threads have
arrived, the synchronization condition is reached and the threads are
released. The barrier will then reset, and may be reused for a new cycle, in
which the same set of threads may arrive again at the synchronization point.
The same set of threads shall arrive at the barrier in each cycle, otherwise
the behaviour is undefined.
*/
struct barrier {
barrier(int val) : expected(val), arrived(0), nexpected(val), epoch(0) { }
barrier(const barrier&) = delete;
barrier& operator=(const barrier&) = delete;
~barrier() { }
void arrive_and_wait() {
int const myepoch = epoch.load(std::memory_order_relaxed);
if(!__arrive(myepoch))
while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
;
}
void arrive_and_drop() {
nexpected.fetch_add(-1,std::memory_order_relaxed);
__arrive(epoch.load(std::memory_order_relaxed));
}
private:
bool __arrive(int const myepoch) {
int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
if(__builtin_expect(myresult == expected,0)) {
expected = nexpected.load(std::memory_order_relaxed);
arrived.store(0,std::memory_order_relaxed);
epoch.store(myepoch+1,std::memory_order_release);
return true;
}
return false;
}
int expected;
std::atomic<int> arrived, nexpected;
synchronic<int> epoch;
};
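// A hedged usage sketch (hypothetical helper): the same two threads meet at
// the barrier every cycle; phase 2 starts only after both finish phase 1.
inline void example_barrier_usage()
{
  barrier sync(2);
  auto work = [&]{
    /* phase 1 */
    sync.arrive_and_wait();
    /* phase 2 */
  };
  std::thread t1( work ), t2( work );
  t1.join(); t2.join();
}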
/*
A notifying barrier behaves as a barrier, but is constructed with a callable
completion function that is invoked after all threads have arrived at the
synchronization point, and before the synchronization condition is reached.
The completion may modify the set of threads that arrives at the barrier in
each cycle.
*/
struct notifying_barrier {
template <typename T>
notifying_barrier(int val, T && f) : expected(val), arrived(0), nexpected(val), epoch(0), completion(std::forward<T>(f)) { }
notifying_barrier(const notifying_barrier&) = delete;
notifying_barrier& operator=(const notifying_barrier&) = delete;
~notifying_barrier( ) { }
void arrive_and_wait() {
int const myepoch = epoch.load(std::memory_order_relaxed);
if(!__arrive(myepoch))
while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
;
}
void arrive_and_drop() {
nexpected.fetch_add(-1,std::memory_order_relaxed);
__arrive(epoch.load(std::memory_order_relaxed));
}
private:
bool __arrive(int const myepoch) {
int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
if(__builtin_expect(myresult == expected,0)) {
int const newexpected = completion();
expected = newexpected ? newexpected : nexpected.load(std::memory_order_relaxed);
arrived.store(0,std::memory_order_relaxed);
epoch.store(myepoch+1,std::memory_order_release);
return true;
}
return false;
}
int expected;
std::atomic<int> arrived, nexpected;
synchronic<int> epoch;
std::function<int()> completion;
};
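// A hedged usage sketch (hypothetical helper): the completion function runs
// once per cycle after all arrivals; returning 0 keeps the expected count.
inline void example_notifying_barrier_usage()
{
  int cycles = 0 ;
  notifying_barrier sync( 2, [&]{ ++cycles; return 0; } );
  auto work = [&]{ sync.arrive_and_wait(); };
  std::thread t1( work ), t2( work );
  t1.join(); t2.join();    // cycles == 1 afterwards
}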
}}
#endif // KOKKOS_SYNCHRONIC_N3998_HPP

@ -76,9 +76,6 @@ namespace Impl {
template< typename Space , typename ResultType , typename FunctorType >
class TaskBase ;
template< typename Space >
class TaskExec ;
} /* namespace Impl */
} /* namespace Kokkos */
@ -149,8 +146,8 @@ private:
// task->m_next is the dependence or zero
// Postcondition:
// task->m_next is linked list membership
KOKKOS_FUNCTION
void schedule( task_root_type * const );
KOKKOS_FUNCTION void schedule_runnable( task_root_type * const );
KOKKOS_FUNCTION void schedule_aggregate( task_root_type * const );
// Reschedule a task
// Precondition:
@ -178,7 +175,7 @@ private:
, task_root_type * const );
KOKKOS_FUNCTION
static task_root_type * pop_task( task_root_type * volatile * const );
static task_root_type * pop_ready_task( task_root_type * volatile * const );
KOKKOS_FUNCTION static
void decrement( task_root_type * task );
@ -368,6 +365,7 @@ public:
int16_t m_task_type ; ///< Type of task
int16_t m_priority ; ///< Priority of runnable task
TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
@ -375,17 +373,43 @@ public:
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
// Constructor for a runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase() noexcept
: m_apply(0)
, m_queue(0)
, m_wait(0)
, m_next(0)
, m_ref_count(0)
, m_alloc_size(0)
, m_dep_count(0)
, m_task_type( TaskSingle )
, m_priority( 1 /* TaskRegularPriority */ )
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, TaskBase * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
) noexcept
: m_apply( arg_apply )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( arg_dependence )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( 0 )
, m_task_type( arg_task_type )
, m_priority( arg_priority )
{}
// Constructor for an aggregate task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( queue_type * arg_queue
, int arg_ref_count
, int arg_alloc_size
, int arg_dep_count
) noexcept
: m_apply( 0 )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( 0 )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( arg_dep_count )
, m_task_type( Aggregate )
, m_priority( 0 )
{}
//----------------------------------------
@ -406,9 +430,13 @@ public:
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskBase* dep )
{
// Precondition: lock == m_next
TaskBase * const lock = (TaskBase *) LockTag ;
// Assign dependence to m_next. It will be processed in the subsequent
// call to schedule. Error if the dependence is reset.
if ( 0 != Kokkos::atomic_exchange( & m_next, dep ) ) {
if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
}
@ -431,8 +459,13 @@ class TaskBase< ExecSpace , ResultType , void >
{
private:
static_assert( sizeof(TaskBase<ExecSpace,void,void>) == 48 , "" );
using root_type = TaskBase<ExecSpace,void,void> ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
static_assert( sizeof(root_type) == 48 , "" );
TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
@ -444,9 +477,24 @@ public:
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
TaskBase()
: TaskBase< ExecSpace , void , void >()
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
)
: root_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, m_result()
{}
@ -473,7 +521,10 @@ public:
using root_type = TaskBase< ExecSpace , void , void > ;
using base_type = TaskBase< ExecSpace , ResultType , void > ;
using member_type = TaskExec< ExecSpace > ;
using specialization = TaskQueueSpecialization< ExecSpace > ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
using member_type = typename specialization::member_type ;
using functor_type = FunctorType ;
using result_type = ResultType ;
@ -522,13 +573,30 @@ public:
if ( 0 == member->team_rank() && !(task->requested_respawn()) ) {
// Did not respawn, destroy the functor to free memory.
static_cast<functor_type*>(task)->~functor_type();
// Cannot destroy the task until its dependences have been processed.
// Cannot destroy and deallocate the task until its dependences
// have been processed.
}
}
// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
TaskBase( functor_type const & arg_functor )
: base_type()
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
, FunctorType && arg_functor
)
: base_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, functor_type( arg_functor )
{}

@ -170,6 +170,7 @@ bool TaskQueue< ExecSpace >::push_task
)
{
// Push task into a concurrently pushed and popped queue.
// The queue can be either a ready task queue or a waiting task queue.
// The queue is a linked list where 'task->m_next' form the links.
// Fail the push attempt if the queue is locked;
// otherwise retry until the push succeeds.
@ -227,13 +228,12 @@ bool TaskQueue< ExecSpace >::push_task
template< typename ExecSpace >
KOKKOS_FUNCTION
typename TaskQueue< ExecSpace >::task_root_type *
TaskQueue< ExecSpace >::pop_task
TaskQueue< ExecSpace >::pop_ready_task
( TaskQueue< ExecSpace >::task_root_type * volatile * const queue )
{
// Pop task from a concurrently pushed and popped queue.
// Pop task from a concurrently pushed and popped ready task queue.
// The queue is a linked list where 'task->m_next' form the links.
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
@ -252,42 +252,34 @@ TaskQueue< ExecSpace >::pop_task
// (1) lock, (2) end, or (3) a valid task.
// Thus zero will never appear in the queue.
//
// If queue is locked then just read by guaranteeing
// the CAS will fail.
// If queue is locked then just read by guaranteeing the CAS will fail.
if ( lock == task ) task = 0 ;
task_root_type * const x = task ;
task = Kokkos::atomic_compare_exchange(queue,task,lock);
if ( x == task ) break ; // CAS succeeded and queue is locked
}
if ( end != task ) {
task = Kokkos::atomic_compare_exchange(queue,x,lock);
if ( x == task ) {
// CAS succeeded and queue is locked
//
// This thread has locked the queue and removed 'task' from the queue.
// Extract the next entry of the queue from 'task->m_next'
// and mark 'task' as popped from a queue by setting
// 'task->m_next = lock'.
task_root_type * const next =
Kokkos::atomic_exchange( & task->m_next , lock );
//
// Place the next entry in the head of the queue,
// which also unlocks the queue.
//
// This thread has exclusive access to
// the queue and the popped task's m_next.
task_root_type * const unlock =
Kokkos::atomic_exchange( queue , next );
*queue = task->m_next ; task->m_next = lock ;
if ( next == zero || next == lock || lock != unlock ) {
Kokkos::abort("TaskQueue::pop_task ERROR");
}
}
Kokkos::memory_fence();
#if 0
if ( end != task ) {
printf( "pop_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
, uintptr_t(queue)
, uintptr_t(task)
, uintptr_t(task->m_wait)
@ -295,42 +287,166 @@ TaskQueue< ExecSpace >::pop_task
, int(task->m_task_type)
, int(task->m_priority)
, int(task->m_ref_count) );
}
#endif
return task ;
}
}
return end ;
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::schedule
void TaskQueue< ExecSpace >::schedule_runnable
( TaskQueue< ExecSpace >::task_root_type * const task )
{
// Schedule a runnable or when_all task upon construction / spawn
// Schedule a runnable task upon construction / spawn
// and upon completion of other tasks that 'task' is waiting on.
// Precondition on runnable task state:
// task is either constructing or executing
//
// Precondition:
// - called by a single thread for the input task
// - calling thread has exclusive access to the task
// - task is not a member of a queue
// - if runnable then task is either constructing or respawning
//
// Constructing state:
// task->m_wait == 0
// task->m_next == dependence
// Executing-respawn state:
// task->m_wait == head of linked list
// task->m_next == dependence
// task->m_next == dependence or 0
// Respawn state:
// task->m_wait == head of linked list: 'end' or valid task
// task->m_next == dependence or 0
//
// Task state transition:
// Constructing -> Waiting
// Executing-respawn -> Waiting
// Respawn -> Waiting
//
// Postcondition on task state:
// task->m_wait == head of linked list
// task->m_next == member of linked list
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)
#if 0
printf( "schedule( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
, task->m_task_type
, task->m_priority
, task->m_ref_count );
#endif
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
bool respawn = false ;
//----------------------------------------
if ( zero == task->m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread
task->m_wait = end ;
// Task in Waiting state
}
else if ( lock != task->m_wait ) {
// Task in Executing state with Respawn request
// - Update dependence
// - Transition to Waiting state
respawn = true ;
}
else {
// Task in Complete state
Kokkos::abort("TaskQueue::schedule_runnable ERROR: task is complete");
}
//----------------------------------------
// Scheduling a runnable task which may have a dependency 'dep'.
// Extract dependence, if any, from task->m_next.
// If 'dep' is not null then attempt to push 'task'
// into the wait queue of 'dep'.
// If the push succeeds then 'task' may be
// processed or executed by another thread at any time.
// If the push fails then 'dep' is complete and 'task'
// is ready to execute.
// Exclusive access so don't need an atomic exchange
// task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
task_root_type * dep = task->m_next ; task->m_next = zero ;
const bool is_ready =
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
if ( ( 0 != dep ) && respawn ) {
// The reference count for dep was incremented when respawn
// assigned the dependency to task->m_next, so that if dep
// completed prior to the above push_task it would not be
// destroyed. That reference count can now be decremented,
// which may deallocate dep.
TaskQueue::assign( & dep , (task_root_type *)0 );
}
if ( is_ready ) {
// No dependence or 'dep' is complete so push task into ready queue.
// Increment the ready count before pushing into ready queue
// to track number of ready + executing tasks.
// The ready count will be decremented when the task is complete.
Kokkos::atomic_increment( & m_ready_count );
task_root_type * volatile * const ready_queue =
& m_ready[ task->m_priority ][ task->m_task_type ];
// A push_task fails if the ready queue is locked.
// A ready queue is only locked during a push or pop;
// i.e., it is never permanently locked.
// Retry push to ready queue until it succeeds.
// When the push succeeds then 'task' may be
// processed or executed by another thread at any time.
while ( ! push_task( ready_queue , task ) );
}
//----------------------------------------
// Postcondition:
// - A runnable 'task' was pushed into a wait or ready queue.
// - Concurrent execution may have already popped 'task'
// from a queue and processed it as appropriate.
}
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::schedule_aggregate
( TaskQueue< ExecSpace >::task_root_type * const task )
{
// Schedule an aggregate task upon construction
// and upon completion of other tasks that 'task' is waiting on.
//
// Precondition:
// - called by a single thread for the input task
// - calling thread has exclusive access to the task
// - task is not a member of a queue
//
// Constructing state:
// task->m_wait == 0
// task->m_next == dependence or 0
//
// Task state transition:
// Constructing -> Waiting
//
// Postcondition on task state:
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)
#if 0
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
@ -344,71 +460,22 @@ void TaskQueue< ExecSpace >::schedule
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
//----------------------------------------
{
// If Constructing then task->m_wait == 0
// Change to waiting by task->m_wait = EndTag
task_root_type * const init =
Kokkos::atomic_compare_exchange( & task->m_wait , zero , end );
if ( zero == task->m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread
// Precondition
if ( lock == init ) {
Kokkos::abort("TaskQueue::schedule ERROR: task is complete");
task->m_wait = end ;
// Task in Waiting state
}
else if ( lock == task->m_wait ) {
// Task in Complete state
Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete");
}
// if ( init == 0 ) Constructing -> Waiting
// else Executing-Respawn -> Waiting
}
//----------------------------------------
if ( task_root_type::Aggregate != task->m_task_type ) {
// Scheduling a runnable task which may have a dependency 'dep'.
// Extract dependence, if any, from task->m_next.
// If 'dep' is not null then attempt to push 'task'
// into the wait queue of 'dep'.
// If the push succeeds then 'task' may be
// processed or executed by another thread at any time.
// If the push fails then 'dep' is complete and 'task'
// is ready to execute.
task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
const bool is_ready =
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
// Reference count for dep was incremented when assigned
// to task->m_next so that if it completed prior to the
// above push_task dep would not be destroyed.
// dep reference count can now be decremented,
// which may deallocate the task.
TaskQueue::assign( & dep , (task_root_type *)0 );
if ( is_ready ) {
// No dependence or 'dep' is complete so push task into ready queue.
// Increment the ready count before pushing into ready queue
// to track number of ready + executing tasks.
// The ready count will be decremented when the task is complete.
Kokkos::atomic_increment( & m_ready_count );
task_root_type * volatile * const queue =
& m_ready[ task->m_priority ][ task->m_task_type ];
// A push_task fails if the ready queue is locked.
// A ready queue is only locked during a push or pop;
// i.e., it is never permanently locked.
// Retry push to ready queue until it succeeds.
// When the push succeeds then 'task' may be
// processed or executed by another thread at any time.
while ( ! push_task( queue , task ) );
}
}
//----------------------------------------
else {
// Scheduling a 'when_all' task with multiple dependences.
// This scheduling may be called when the 'when_all' is
// (1) created or
@ -432,7 +499,9 @@ void TaskQueue< ExecSpace >::schedule
// The reference count of 'x' was incremented when
// it was assigned into the dependence list.
task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
// Exclusive access so don't need an atomic exchange
// task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
task_root_type * x = aggr[i] ; aggr[i] = zero ;
if ( x ) {
@ -464,13 +533,11 @@ void TaskQueue< ExecSpace >::schedule
// '*task' may have been deleted upon completion
}
}
//----------------------------------------
// Postcondition:
// A runnable 'task' was pushed into a wait or ready queue.
// An aggregate 'task' was either pushed to a wait queue
// or completed.
// Concurrent execution may have already popped 'task'
// - An aggregate 'task' was either pushed to a wait queue or completed.
// - Concurrent execution may have already popped 'task'
// from a queue and processed it as appropriate.
}
@ -529,7 +596,7 @@ void TaskQueue< ExecSpace >::complete
// A runnable task has finished executing and requested respawn.
// Schedule the task for subsequent execution.
schedule( task );
schedule_runnable( task );
}
//----------------------------------------
else {
@ -556,18 +623,22 @@ void TaskQueue< ExecSpace >::complete
TaskQueue::assign( & task , zero );
// This thread has exclusive access to the wait list so
// the concurrency-safe pop_task function is not needed.
// the concurrency-safe pop_ready_task function is not needed.
// Schedule the tasks that have been waiting on the input 'task',
// which may have been deleted.
while ( x != end ) {
// Have exclusive access to 'x' until it is scheduled
// Set x->m_next = zero <= no dependence, not a respawn
// Set x->m_next = zero <= no dependence
task_root_type * const next = x->m_next ; x->m_next = 0 ;
task_root_type * const next =
(task_root_type *) Kokkos::atomic_exchange( & x->m_next , zero );
schedule( x );
if ( task_root_type::Aggregate != x->m_task_type ) {
schedule_runnable( x );
}
else {
schedule_aggregate( x );
}
x = next ;
}

@ -45,6 +45,7 @@
#define KOKKOS_CORE_IMPL_UTILITIES_HPP
#include <Kokkos_Macros.hpp>
#include <stdint.h>
#include <type_traits>
//----------------------------------------------------------------------------

@ -42,46 +42,138 @@
*/
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_BitOps.hpp>
/*--------------------------------------------------------------------------*/
#if ( KOKKOS_ENABLE_ASM )
#if !defined( _WIN32 )
#if defined( KOKKOS_ENABLE_ASM )
#if defined( __arm__ ) || defined( __aarch64__ )
/* No-operation instruction to idle the thread. */
#define YIELD asm volatile("nop")
#define KOKKOS_INTERNAL_PAUSE
#else
/* Pause instruction to prevent excess processor bus usage */
#define YIELD asm volatile("pause\n":::"memory")
#define KOKKOS_INTERNAL_PAUSE asm volatile("pause\n":::"memory")
#endif
#elif defined ( KOKKOS_ENABLE_WINTHREAD )
#define KOKKOS_INTERNAL_NOP2 asm volatile("nop\n" "nop\n")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0u: KOKKOS_INTERNAL_NOP2; break;
case 1u: KOKKOS_INTERNAL_NOP4; break;
case 2u: KOKKOS_INTERNAL_NOP8; break;
case 3u: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
#else
#include <sched.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
sched_yield();
}
}
#endif
#else // defined( _WIN32 )
#if defined ( KOKKOS_ENABLE_WINTHREAD )
#include <process.h>
#define YIELD Sleep(0)
#elif defined( _WIN32 ) && defined( _MSC_VER )
/* Windows w/ Visual Studio */
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
Sleep(0);
}
}
#elif defined( _MSC_VER )
#define NOMINMAX
#include <winsock2.h>
#include <windows.h>
#define YIELD YieldProcessor();
#elif defined ( _WIN32 )
/* Windows w/ Intel */
#define YIELD __asm__ __volatile__("pause\n":::"memory")
#else
#include <sched.h>
#define YIELD sched_yield()
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
YieldProcessor();
}
}
#else
#define KOKKOS_INTERNAL_PAUSE __asm__ __volatile__("pause\n":::"memory")
#define KOKKOS_INTERNAL_NOP2 __asm__ __volatile__("nop\n" "nop")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0: KOKKOS_INTERNAL_NOP2; break;
case 1: KOKKOS_INTERNAL_NOP4; break;
case 2: KOKKOS_INTERNAL_NOP8; break;
case 3: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
#endif
#endif
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void spinwait( volatile int & flag , const int value )
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
YIELD ;
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
#endif
} /* namespace Impl */
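
The yield helpers above implement a capped exponential backoff: the iteration count is mapped through bit_scan_reverse so that successive waits spin on progressively longer nop bursts (2, 4, 8, 16, capped at 32) before each pause. The sketch below reproduces only that arithmetic; bit_scan_reverse here is a portable stand-in for the internal Kokkos::Impl::bit_scan_reverse, and nops_for_iteration is a hypothetical name.

#include <cstdio>

// Index of the highest set bit; the input is always non-zero below.
inline unsigned bit_scan_reverse( unsigned v )
{
  unsigned r = 0 ;
  while ( v >>= 1 ) ++r ;
  return r ;
}

inline unsigned nops_for_iteration( const unsigned i )
{
  switch ( bit_scan_reverse( ( i >> 2 ) + 1u ) ) {
    case 0u : return 2 ;
    case 1u : return 4 ;
    case 2u : return 8 ;
    case 3u : return 16 ;
    default : return 32 ;   // the backoff is capped
  }
}

int main()
{
  for ( unsigned i = 0 ; i < 72 ; i += 8 ) {
    std::printf( "iteration %2u : %2u nops + pause\n" , i , nops_for_iteration( i ) );
  }
  return 0 ;
}
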


@ -47,14 +47,30 @@
#include <Kokkos_Macros.hpp>
#include <cstdint>
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void spinwait( volatile int & flag , const int value );
void spinwait_while_equal( volatile int32_t & flag , const int32_t value );
void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
#else
KOKKOS_INLINE_FUNCTION
void spinwait( volatile int & , const int ) {}
void spinwait_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
#endif
} /* namespace Impl */
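
The renamed API reads as it is written: spinwait_while_equal returns once the flag stops comparing equal to the given value, and spinwait_until_equal returns once it does. Below is a self-contained analogue using std::atomic and std::thread, with std::this_thread::yield standing in for the nop/pause backoff; it illustrates the semantics only, not the Kokkos code path.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>

// Analogue of spinwait_until_equal: return once 'flag' equals 'value'.
void spinwait_until_equal( const std::atomic< int32_t > & flag , const int32_t value )
{
  while ( flag.load( std::memory_order_acquire ) != value ) {
    std::this_thread::yield();   // stand-in for the backoff above
  }
}

int main()
{
  std::atomic< int32_t > flag( 0 );
  std::thread worker( [ & flag ]() { flag.store( 1 , std::memory_order_release ); } );
  spinwait_until_equal( flag , 1 );   // returns once the worker has stored 1
  std::printf( "flag = %d\n" , static_cast< int >( flag.load() ) );
  worker.join();
  return 0 ;
}
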


@ -115,10 +115,31 @@ IF(Kokkos_ENABLE_OpenMP)
)
ENDIF()
IF(Kokkos_ENABLE_QTHREAD)
IF(Kokkos_ENABLE_Qthreads)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_Qthread
SOURCES UnitTestMain.cpp TestQthread.cpp
UnitTest_Qthreads
SOURCES
UnitTestMain.cpp
qthreads/TestQthreads_Atomics.cpp
qthreads/TestQthreads_Other.cpp
qthreads/TestQthreads_Reductions.cpp
qthreads/TestQthreads_SubView_a.cpp
qthreads/TestQthreads_SubView_b.cpp
qthreads/TestQthreads_SubView_c01.cpp
qthreads/TestQthreads_SubView_c02.cpp
qthreads/TestQthreads_SubView_c03.cpp
qthreads/TestQthreads_SubView_c04.cpp
qthreads/TestQthreads_SubView_c05.cpp
qthreads/TestQthreads_SubView_c06.cpp
qthreads/TestQthreads_SubView_c07.cpp
qthreads/TestQthreads_SubView_c08.cpp
qthreads/TestQthreads_SubView_c09.cpp
qthreads/TestQthreads_SubView_c10.cpp
qthreads/TestQthreads_SubView_c11.cpp
qthreads/TestQthreads_SubView_c12.cpp
qthreads/TestQthreads_Team.cpp
qthreads/TestQthreads_ViewAPI_a.cpp
qthreads/TestQthreads_ViewAPI_b.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
@ -194,4 +215,3 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
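
The renamed suite targets the Kokkos::Qthreads execution space (formerly Kokkos::Qthread). As a rough illustration of what such a test exercises, and not one of the listed fixtures, here is a minimal reduction program; it assumes a build configured with Kokkos_ENABLE_Qthreads and that the execution space is spelled Kokkos::Qthreads as in this release.

#include <Kokkos_Core.hpp>
#include <cstdio>

int main( int argc , char * argv[] )
{
  Kokkos::initialize( argc , argv );
  {
    long sum = 0 ;
    // Hypothetical minimal check in the spirit of the Qthreads unit tests.
    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Qthreads >( 0 , 100 )
                           , KOKKOS_LAMBDA( const int i , long & partial ) { partial += i ; }
                           , sum );
    std::printf( "sum = %ld (expect 4950)\n" , sum );
  }
  Kokkos::finalize();
  return 0 ;
}
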


@ -6,6 +6,7 @@ vpath %.cpp ${KOKKOS_PATH}/core/unit_test
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/serial
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/threads
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/openmp
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/qthreads
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/cuda
TEST_HEADERS = $(wildcard $(KOKKOS_PATH)/core/unit_test/*.hpp)
@ -78,6 +79,22 @@ endif
TEST_TARGETS += test-openmp
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
OBJ_QTHREADS = TestQthreads_Other.o TestQthreads_Reductions.o TestQthreads_Atomics.o TestQthreads_Team.o
OBJ_QTHREADS += TestQthreads_SubView_a.o TestQthreads_SubView_b.o
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
OBJ_QTHREADS += TestQthreads_SubView_c_all.o
else
OBJ_QTHREADS += TestQthreads_SubView_c01.o TestQthreads_SubView_c02.o TestQthreads_SubView_c03.o
OBJ_QTHREADS += TestQthreads_SubView_c04.o TestQthreads_SubView_c05.o TestQthreads_SubView_c06.o
OBJ_QTHREADS += TestQthreads_SubView_c07.o TestQthreads_SubView_c08.o TestQthreads_SubView_c09.o
OBJ_QTHREADS += TestQthreads_SubView_c10.o TestQthreads_SubView_c11.o TestQthreads_SubView_c12.o
endif
OBJ_QTHREADS += TestQthreads_ViewAPI_a.o TestQthreads_ViewAPI_b.o UnitTestMain.o gtest-all.o
TARGETS += KokkosCore_UnitTest_Qthreads
TEST_TARGETS += test-qthreads
endif
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
OBJ_SERIAL = TestSerial_Other.o TestSerial_Reductions.o TestSerial_Atomics.o TestSerial_Team.o
OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o
@ -94,12 +111,6 @@ endif
TEST_TARGETS += test-serial
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
OBJ_QTHREAD = TestQthread.o UnitTestMain.o gtest-all.o
TARGETS += KokkosCore_UnitTest_Qthread
TEST_TARGETS += test-qthread
endif
OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o
TARGETS += KokkosCore_UnitTest_HWLOC
TEST_TARGETS += test-hwloc
@ -115,10 +126,6 @@ TARGETS += ${INITTESTS_TARGETS}
INITTESTS_TEST_TARGETS := $(addprefix test-default-init-,${INITTESTS_NUMBERS})
TEST_TARGETS += ${INITTESTS_TEST_TARGETS}
OBJ_SYNCHRONIC = TestSynchronic.o UnitTestMain.o gtest-all.o
TARGETS += KokkosCore_UnitTest_Synchronic
TEST_TARGETS += test-synchronic
KokkosCore_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Cuda
@ -131,8 +138,8 @@ KokkosCore_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
KokkosCore_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Serial
KokkosCore_UnitTest_Qthread: $(OBJ_QTHREAD) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREAD) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Qthread
KokkosCore_UnitTest_Qthreads: $(OBJ_QTHREADS) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Qthreads
KokkosCore_UnitTest_HWLOC: $(OBJ_HWLOC) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HWLOC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_HWLOC
@ -146,9 +153,6 @@ KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS)
${INITTESTS_TARGETS}: KokkosCore_UnitTest_DefaultDeviceTypeInit_%: TestDefaultDeviceTypeInit_%.o UnitTestMain.o gtest-all.o $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) TestDefaultDeviceTypeInit_$*.o UnitTestMain.o gtest-all.o $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
KokkosCore_UnitTest_Synchronic: $(OBJ_SYNCHRONIC) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SYNCHRONIC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Synchronic
test-cuda: KokkosCore_UnitTest_Cuda
./KokkosCore_UnitTest_Cuda
@ -161,8 +165,8 @@ test-openmp: KokkosCore_UnitTest_OpenMP
test-serial: KokkosCore_UnitTest_Serial
./KokkosCore_UnitTest_Serial
test-qthread: KokkosCore_UnitTest_Qthread
./KokkosCore_UnitTest_Qthread
test-qthreads: KokkosCore_UnitTest_Qthreads
./KokkosCore_UnitTest_Qthreads
test-hwloc: KokkosCore_UnitTest_HWLOC
./KokkosCore_UnitTest_HWLOC
@ -176,9 +180,6 @@ test-default: KokkosCore_UnitTest_Default
${INITTESTS_TEST_TARGETS}: test-default-init-%: KokkosCore_UnitTest_DefaultDeviceTypeInit_%
./KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
test-synchronic: KokkosCore_UnitTest_Synchronic
./KokkosCore_UnitTest_Synchronic
build_all: $(TARGETS)
test: $(TEST_TARGETS)
@ -193,4 +194,3 @@ clean: kokkos-clean
gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
