Update Kokkos to v2.04.11

Stan Moore
2017-11-06 13:47:33 -07:00
parent 39df9f5d94
commit 16b5315845
117 changed files with 33239 additions and 1093 deletions

View File

@ -1,4 +1,34 @@
# Change Log
## [2.04.11](https://github.com/kokkos/kokkos/tree/2.04.11) (2017-10-28)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.04...2.04.11)
**Implemented enhancements:**
- Add Subview pattern. [\#648](https://github.com/kokkos/kokkos/issues/648)
- Add Kokkos "global" is\_initialized [\#1060](https://github.com/kokkos/kokkos/issues/1060)
- Add create\_mirror\_view\_and\_copy [\#1161](https://github.com/kokkos/kokkos/issues/1161)
- Add KokkosConcepts SpaceAccessibility function [\#1092](https://github.com/kokkos/kokkos/issues/1092)
- Option to Disable Initialize Warnings [\#1142](https://github.com/kokkos/kokkos/issues/1142)
- Mature task-DAG capability [\#320](https://github.com/kokkos/kokkos/issues/320)
- Promote Work DAG from experimental [\#1126](https://github.com/kokkos/kokkos/issues/1126)
- Implement new WorkGraph push/pop [\#1108](https://github.com/kokkos/kokkos/issues/1108)
- Kokkos\_ENABLE\_Cuda\_Lambda should default ON [\#1101](https://github.com/kokkos/kokkos/issues/1101)
- Add multidimensional parallel for example and improve unit test [\#1064](https://github.com/kokkos/kokkos/issues/1064)
- Fix ROCm: Performance tests not building [\#1038](https://github.com/kokkos/kokkos/issues/1038)
- Make KOKKOS\_ALIGN\_SIZE a configure-time option [\#1004](https://github.com/kokkos/kokkos/issues/1004)
- Make alignment consistent [\#809](https://github.com/kokkos/kokkos/issues/809)
- Improve subview construction on Cuda backend [\#615](https://github.com/kokkos/kokkos/issues/615)
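Two of the enhancements above are small user-facing API additions. A minimal hedged sketch of typical use (names per issues #1060 and #1161; assumes any enabled backend):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  if (Kokkos::is_initialized()) {          // "global" is_initialized, issue #1060
    Kokkos::View<double*> d("d", 10);      // lives in the default memory space
    // Allocates a HostSpace mirror only if needed, then deep-copies d into it
    // (issue #1161).
    auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d);
    h(0) = 1.0;                            // host-accessible
  }
  Kokkos::finalize();
  return 0;
}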
**Fixed bugs:**
- Kokkos::vector fixes for application [\#1134](https://github.com/kokkos/kokkos/issues/1134)
- DynamicView non-power of two value\_type [\#1177](https://github.com/kokkos/kokkos/issues/1177)
- Memory pool bug [\#1154](https://github.com/kokkos/kokkos/issues/1154)
- Cuda launch bounds performance regression bug [\#1140](https://github.com/kokkos/kokkos/issues/1140)
- Significant performance regression in LAMMPS after updating Kokkos [\#1139](https://github.com/kokkos/kokkos/issues/1139)
- CUDA compile error [\#1128](https://github.com/kokkos/kokkos/issues/1128)
- MDRangePolicy neg idx test failure in debug mode [\#1113](https://github.com/kokkos/kokkos/issues/1113)
- subview construction on Cuda backend [\#615](https://github.com/kokkos/kokkos/issues/615)
## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04)

View File

@ -33,6 +33,24 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
# HAVE_KOKKOS_DEBUG. We define KOKKOS_HAVE_DEBUG here instead,
# for compatibility with Kokkos' Makefile build system.
if (TPL_ENABLE_CUDA)
if (DEFINED CUDA_VERSION)
# there is a VERSION_GREATER_EQUAL, but only in CMake >= 3.7
if (CUDA_VERSION VERSION_EQUAL "7.5")
set(KOKKOS_HAVE_CUDA_GEQ_75 TRUE)
endif()
if (CUDA_VERSION VERSION_GREATER "7.5")
set(KOKKOS_HAVE_CUDA_GEQ_75 TRUE)
endif()
if (CUDA_VERSION VERSION_EQUAL "8.0")
set(KOKKOS_HAVE_CUDA_GEQ_80 TRUE)
endif()
if (CUDA_VERSION VERSION_GREATER "8.0")
set(KOKKOS_HAVE_CUDA_GEQ_80 TRUE)
endif()
endif()
endif()
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_DEBUG
KOKKOS_HAVE_DEBUG
@ -51,7 +69,7 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Cuda
KOKKOS_HAVE_CUDA
"Enable CUDA support in Kokkos."
"${TPL_ENABLE_CUDA}"
"${KOKKOS_HAVE_CUDA_TPL}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(
@ -68,11 +86,27 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
OFF
)
set(Kokkos_ENABLE_Cuda_Lambda_DEFAULT OFF)
if (Kokkos_ENABLE_Cuda)
if (KOKKOS_HAVE_CUDA_GEQ_75)
if (CMAKE_CXX_FLAGS MATCHES "-expt-extended-lambda")
set(Kokkos_ENABLE_Cuda_Lambda_DEFAULT ON)
message("-- CUDA version is >= 7.5 and CMAKE_CXX_FLAGS contains -expt-extended-lambda,")
message("-- Kokkos_ENABLE_Cuda_Lambda defaults to ON")
else()
message("-- CMAKE_CXX_FLAGS doesn't contain -expt-extended-lambda,")
message("-- Kokkos_ENABLE_Cuda_Lambda defaults to OFF")
endif()
else()
message("-- CUDA version is < 7.5, Kokkos_ENABLE_Cuda_Lambda defaults to OFF")
endif()
endif()
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Cuda_Lambda
KOKKOS_HAVE_CUDA_LAMBDA
"Enable CUDA LAMBDA support in Kokkos."
OFF
"${Kokkos_ENABLE_Cuda_Lambda_DEFAULT}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(

View File

@ -211,12 +211,15 @@ void test_dynamic_view_sort(unsigned int n )
const size_t upper_bound = 2 * n ;
const size_t total_alloc_size = n * sizeof(KeyType) * 1.2 ;
const size_t superblock_size = std::min(total_alloc_size, size_t(1000000));
typename KeyDynamicViewType::memory_pool
pool( memory_space()
, n * sizeof(KeyType) * 1.2
, 500 /* min block size in bytes */
, 30000 /* max block size in bytes */
, 1000000 /* min superblock size in bytes */
, superblock_size
);
KeyDynamicViewType keys("Keys",pool,upper_bound);
@ -271,8 +274,10 @@ void test_sort(unsigned int N)
{
test_1D_sort<ExecutionSpace,KeyType>(N*N*N, true);
test_1D_sort<ExecutionSpace,KeyType>(N*N*N, false);
#if !defined(KOKKOS_ENABLE_ROCM)
test_3D_sort<ExecutionSpace,KeyType>(N);
test_dynamic_view_sort<ExecutionSpace,KeyType>(N*N);
#endif
}
}

View File

@ -0,0 +1,44 @@
KOKKOS_PATH = ${HOME}/kokkos
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
EXE_NAME = "test"
SRC = $(wildcard *.cpp)
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ${KOKKOS_PATH}/config/nvcc_wrapper
EXE = ${EXE_NAME}.cuda
KOKKOS_CUDA_OPTIONS = "enable_lambda"
else
CXX = g++
EXE = ${EXE_NAME}.host
endif
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS = -O3
DEPFLAGS = -M
OBJ = $(SRC:.cpp=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,124 @@
#include<Kokkos_Core.hpp>
#include<impl/Kokkos_Timer.hpp>
#include<Kokkos_Random.hpp>
template<class Scalar>
double test_atomic(int L, int N, int M,int K,int R,Kokkos::View<const int**> offsets) {
Kokkos::View<Scalar*> output("Output",N);
Kokkos::Impl::Timer timer;
for(int r = 0; r<R; r++)
Kokkos::parallel_for(L, KOKKOS_LAMBDA (const int&i) {
Scalar s = 2;
for(int m=0;m<M;m++) {
for(int k=0;k<K;k++)
s=s*s+s;
const int idx = (i+offsets(i,m))%N;
Kokkos::atomic_add(&output(idx),s);
}
});
Kokkos::fence();
double time = timer.seconds();
return time;
}
template<class Scalar>
double test_no_atomic(int L, int N, int M,int K,int R,Kokkos::View<const int**> offsets) {
Kokkos::View<Scalar*> output("Output",N);
Kokkos::Impl::Timer timer;
for(int r = 0; r<R; r++)
Kokkos::parallel_for(L, KOKKOS_LAMBDA (const int&i) {
Scalar s = 2;
for(int m=0;m<M;m++) {
for(int k=0;k<K;k++)
s=s*s+s;
const int idx = (i+offsets(i,m))%N;
output(idx) += s;
}
});
Kokkos::fence();
double time = timer.seconds();
return time;
}
int main(int argc, char* argv[]) {
Kokkos::initialize(argc,argv);
{
if(argc<8) {
printf("Arguments: L N M D K R T\n");
printf(" L: Number of iterations to run\n");
printf(" N: Length of array to do atomics into\n");
printf(" M: Number of atomics per iteration to do\n");
printf(" D: Distance from index i to do atomics into (randomly)\n");
printf(" K: Number of FMAD per atomic\n");
printf(" R: Number of repeats of the experiments\n");
printf(" T: Type of atomic\n");
printf(" 1 - int\n");
printf(" 2 - long\n");
printf(" 3 - float\n");
printf(" 4 - double\n");
printf(" 5 - complex<double>\n");
printf("Example Input GPU:\n");
printf(" Histogram : 1000000 1000 1 1000 1 10 1\n");
printf(" MD Force : 100000 100000 100 1000 20 10 4\n");
printf(" Matrix Assembly : 100000 1000000 50 1000 20 10 4\n");
Kokkos::finalize();
return 0;
}
int L = atoi(argv[1]);
int N = atoi(argv[2]);
int M = atoi(argv[3]);
int D = atoi(argv[4]);
int K = atoi(argv[5]);
int R = atoi(argv[6]);
int type = atoi(argv[7]);
Kokkos::View<int**> offsets("Offsets",L,M);
Kokkos::Random_XorShift64_Pool<> pool(12371);
Kokkos::fill_random(offsets,pool,D);
double time = 0;
if(type==1)
time = test_atomic<int>(L,N,M,K,R,offsets);
if(type==2)
time = test_atomic<long>(L,N,M,K,R,offsets);
if(type==3)
time = test_atomic<float>(L,N,M,K,R,offsets);
if(type==4)
time = test_atomic<double>(L,N,M,K,R,offsets);
if(type==5)
time = test_atomic<Kokkos::complex<double> >(L,N,M,K,R,offsets);
double time2 = 1;
if(type==1)
time2 = test_no_atomic<int>(L,N,M,K,R,offsets);
if(type==2)
time2 = test_no_atomic<long>(L,N,M,K,R,offsets);
if(type==3)
time2 = test_no_atomic<float>(L,N,M,K,R,offsets);
if(type==4)
time2 = test_no_atomic<double>(L,N,M,K,R,offsets);
if(type==5)
time2 = test_no_atomic<Kokkos::complex<double> >(L,N,M,K,R,offsets);
int size = 0;
if(type==1) size = sizeof(int);
if(type==2) size = sizeof(long);
if(type==3) size = sizeof(float);
if(type==4) size = sizeof(double);
if(type==5) size = sizeof(Kokkos::complex<double>);
printf("%i\n",size);
printf("Time: %s %i %i %i %i %i %i (t_atomic: %e t_nonatomic: %e ratio: %lf )( GUpdates/s: %lf GB/s: %lf )\n",
(type==1)?"int": (
(type==2)?"long": (
(type==3)?"float": (
(type==4)?"double":"complex"))),
L,N,M,D,K,R,time,time2,time/time2,
1.e-9*L*R*M/time, 1.0*L*R*M*2*size/time/1024/1024/1024);
}
Kokkos::finalize();
}
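The benchmark above contrasts Kokkos::atomic_add against plain, racy updates into a shared array. A minimal hedged sketch of that contrast, checking the atomic result on the host via create_mirror_view_and_copy from this release (assumes any enabled backend):

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int L = 100000, N = 8;              // many updates into few slots
    Kokkos::View<long*> hist("hist", N);
    Kokkos::parallel_for(L, KOKKOS_LAMBDA(const int i) {
      Kokkos::atomic_add(&hist(i % N), 1L);   // contended but exact
      // hist(i % N) += 1;                    // the racy variant the benchmark times
    });
    Kokkos::fence();
    auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), hist);
    printf("hist(0) = %ld (expect %d)\n", h(0), L / N);
  }
  Kokkos::finalize();
  return 0;
}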

View File

@ -303,9 +303,9 @@ if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
HPCBIND_ERR=/dev/null
HPCBIND_OUT=/dev/null
else
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log"
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err"
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out"
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_QUEUE_INDEX}"
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_QUEUE_INDEX}"
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_QUEUE_INDEX}"
> ${HPCBIND_LOG}
fi

View File

@ -10,3 +10,4 @@ tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a
tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1
tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a
tag: 2.04.11 date: 10:28:2017 master: 54a1330a develop: ed36c017

View File

@ -1,6 +1,6 @@
#!/bin/bash -el
ulimit -c 0
module load devpack/openmpi/1.10.0/intel/16.1.056/cuda/none
module load devpack/openmpi/2.1.1/intel/17.4.196/cuda/none
KOKKOS_BRANCH=$1
TRILINOS_UPDATE_BRANCH=$2

View File

@ -1,6 +1,6 @@
#!/bin/bash -el
ulimit -c 0
module load devpack/openmpi/1.10.0/intel/16.1.056/cuda/none
module load devpack/openmpi/2.1.1/intel/17.4.196/cuda/none
KOKKOS_BRANCH=$1
TRILINOS_UPDATE_BRANCH=$2

View File

@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
TEST_TARGETS += test-cuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_ROCM = TestROCm.o TestMain.o gtest-all.o
TARGETS += KokkosContainers_PerformanceTest_ROCm
TEST_TARGETS += test-rocm
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
OBJ_THREADS = TestThreads.o TestMain.o gtest-all.o
TARGETS += KokkosContainers_PerformanceTest_Threads
@ -45,6 +51,9 @@ endif
KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda
KokkosContainers_PerformanceTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_ROCm
KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Threads
@ -54,6 +63,9 @@ KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
test-cuda: KokkosContainers_PerformanceTest_Cuda
./KokkosContainers_PerformanceTest_Cuda
test-rocm: KokkosContainers_PerformanceTest_ROCm
./KokkosContainers_PerformanceTest_ROCm
test-threads: KokkosContainers_PerformanceTest_Threads
./KokkosContainers_PerformanceTest_Threads

View File

@ -0,0 +1,113 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_ROCM )
#include <cstdint>
#include <string>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <fstream>
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <TestDynRankView.hpp>
#include <Kokkos_UnorderedMap.hpp>
#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>
namespace Performance {
class rocm : public ::testing::Test {
protected:
static void SetUpTestCase()
{
std::cout << std::setprecision(5) << std::scientific;
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
}
static void TearDownTestCase()
{
Kokkos::Experimental::ROCm::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
};
#if 0
// issue 1089
TEST_F( rocm, dynrankview_perf )
{
std::cout << "ROCm" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Experimental::ROCm>( 40960 );
}
TEST_F( rocm, global_2_local)
{
std::cout << "ROCm" << std::endl;
std::cout << "size, create, generate, fill, find" << std::endl;
for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step)
test_global_to_local_ids<Kokkos::Experimental::ROCm>(i);
}
#endif
TEST_F( rocm, unordered_map_performance_near)
{
Perf::run_performance_tests<Kokkos::Experimental::ROCm,true>("rocm-near");
}
TEST_F( rocm, unordered_map_performance_far)
{
Perf::run_performance_tests<Kokkos::Experimental::ROCm,false>("rocm-far");
}
}
#else
void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTROCM_PREVENT_EMPTY_LINK_ERROR() {}
#endif /* #if defined( KOKKOS_ENABLE_ROCM ) */

View File

@ -72,8 +72,10 @@ private:
, "DynamicView must be rank-one" );
static_assert( std::is_trivial< typename traits::value_type >::value &&
std::is_same< typename traits::specialize , void >::value
, "DynamicView must have trivial data type" );
std::is_same< typename traits::specialize , void >::value &&
Kokkos::Impl::is_power_of_two
<sizeof(typename traits::value_type)>::value
, "DynamicView must have trivial value_type and sizeof(value_type) is a power-of-two");
template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space
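The strengthened static_assert above additionally requires sizeof(value_type) to be a power of two, which the pool-backed DynamicView allocation math relies on (issue #1177). A minimal sketch, not Kokkos' actual Impl::is_power_of_two, of such a compile-time check:

#include <cstddef>

template <std::size_t N>
struct is_power_of_two {
  // N > 0 is a power of two exactly when it has a single set bit,
  // i.e. N & (N - 1) == 0.
  static constexpr bool value = (N > 0) && ((N & (N - 1)) == 0);
};

static_assert(is_power_of_two<sizeof(double)>::value, "8 is accepted");
static_assert(!is_power_of_two<12>::value, "12 is rejected");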

View File

@ -517,7 +517,7 @@ public:
size_type find_attempts = 0;
enum { bounded_find_attempts = 32u };
enum : unsigned { bounded_find_attempts = 32u };
const size_type max_attempts = (m_bounded_insert && (bounded_find_attempts < m_available_indexes.max_hint()) ) ?
bounded_find_attempts :
m_available_indexes.max_hint();

View File

@ -59,8 +59,8 @@ class vector : public DualView<Scalar*,LayoutLeft,Arg1Type> {
typedef Scalar value_type;
typedef Scalar* pointer;
typedef const Scalar* const_pointer;
typedef Scalar* reference;
typedef const Scalar* const_reference;
typedef Scalar& reference;
typedef const Scalar& const_reference;
typedef Scalar* iterator;
typedef const Scalar* const_iterator;
@ -73,11 +73,11 @@ private:
public:
#ifdef KOKKOS_ENABLE_CUDA_UVM
KOKKOS_INLINE_FUNCTION Scalar& operator() (int i) const {return DV::h_view(i);};
KOKKOS_INLINE_FUNCTION Scalar& operator[] (int i) const {return DV::h_view(i);};
KOKKOS_INLINE_FUNCTION reference operator() (int i) const {return DV::h_view(i);};
KOKKOS_INLINE_FUNCTION reference operator[] (int i) const {return DV::h_view(i);};
#else
inline Scalar& operator() (int i) const {return DV::h_view(i);};
inline Scalar& operator[] (int i) const {return DV::h_view(i);};
inline reference operator() (int i) const {return DV::h_view(i);};
inline reference operator[] (int i) const {return DV::h_view(i);};
#endif
/* Member functions which behave like std::vector functions */
@ -86,7 +86,7 @@ public:
_size = 0;
_extra_storage = 1.1;
DV::modified_host() = 1;
};
}
vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Arg1Type>("Vector",size_t(n*(1.1))) {
@ -146,25 +146,32 @@ public:
DV::h_view(_size) = val;
_size++;
};
}
void pop_back() {
_size--;
};
}
void clear() {
_size = 0;
}
size_type size() const {return _size;};
size_type size() const {return _size;}
size_type max_size() const {return 2000000000;}
size_type capacity() const {return DV::capacity();};
bool empty() const {return _size==0;};
size_type capacity() const {return DV::capacity();}
bool empty() const {return _size==0;}
iterator begin() const {return &DV::h_view(0);};
iterator begin() const {return &DV::h_view(0);}
iterator end() const {return &DV::h_view(_size);};
iterator end() const {return &DV::h_view(_size);}
reference front() {return DV::h_view(0);}
reference back() {return DV::h_view(_size - 1);}
const_reference front() const {return DV::h_view(0);}
const_reference back() const {return DV::h_view(_size - 1);}
/* std::algorithms which originally work with iterators; here they are implemented as member functions */
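A hedged usage sketch of the std::vector-like interface touched above, exercising the new front()/back() accessors and the now reference-typed operator[] (assumes a host-accessible default build):

#include <Kokkos_Core.hpp>
#include <Kokkos_Vector.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::vector<double> v;     // DualView<double*> underneath
    v.push_back(1.0);
    v.push_back(2.0);
    v.push_back(3.0);
    v[1] = 4.0;                   // reference, not pointer, after this commit
    printf("front=%g back=%g size=%d\n", v.front(), v.back(), (int)v.size());
    v.pop_back();                 // size() drops to 2; no deallocation
  }
  Kokkos::finalize();
  return 0;
}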

View File

@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
TEST_TARGETS += test-cuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
TARGETS += KokkosContainers_UnitTest_ROCm
TEST_TARGETS += test-rocm
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
TARGETS += KokkosContainers_UnitTest_Threads
@ -51,6 +57,9 @@ endif
KokkosContainers_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Cuda
KokkosContainers_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_ROCm
KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Threads
@ -63,6 +72,9 @@ KokkosContainers_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
test-cuda: KokkosContainers_UnitTest_Cuda
./KokkosContainers_UnitTest_Cuda
test-rocm: KokkosContainers_UnitTest_ROCm
./KokkosContainers_UnitTest_ROCm
test-threads: KokkosContainers_UnitTest_Threads
./KokkosContainers_UnitTest_Threads

View File

@ -131,11 +131,14 @@ struct TestDynamicView
// printf("TestDynamicView::run(%d) construct memory pool\n",arg_total_size);
const size_t total_alloc_size = arg_total_size * sizeof(Scalar) * 1.2 ;
const size_t superblock = std::min( total_alloc_size , size_t(1000000) );
memory_pool_type pool( memory_space()
, arg_total_size * sizeof(Scalar) * 1.2
, total_alloc_size
, 500 /* min block size in bytes */
, 30000 /* max block size in bytes */
, 1000000 /* min superblock size in bytes */
, superblock
);
// printf("TestDynamicView::run(%d) construct dynamic view\n",arg_total_size);

View File

@ -0,0 +1,263 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_ROCM
#include <iostream>
#include <iomanip>
#include <cstdint>
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <Kokkos_Bitset.hpp>
#include <Kokkos_UnorderedMap.hpp>
#include <Kokkos_Vector.hpp>
#include <TestBitset.hpp>
#include <TestUnorderedMap.hpp>
#include <TestStaticCrsGraph.hpp>
#include <TestVector.hpp>
#include <TestDualView.hpp>
#include <TestDynamicView.hpp>
#include <Kokkos_DynRankView.hpp>
#include <TestDynViewAPI.hpp>
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
//----------------------------------------------------------------------------
namespace Test {
class rocm : public ::testing::Test {
protected:
static void SetUpTestCase()
{
std::cout << std::setprecision(5) << std::scientific;
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
}
static void TearDownTestCase()
{
Kokkos::Experimental::ROCm::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
};
#if !defined(KOKKOS_ENABLE_ROCM)
//issue 964
TEST_F( rocm , dyn_view_api) {
TestDynViewAPI< double , Kokkos::Experimental::ROCm >();
}
#endif
TEST_F( rocm, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::Experimental::ROCm >::test_vcpt( 2, 3 );
}
TEST_F( rocm , staticcrsgraph )
{
TestStaticCrsGraph::run_test_graph< Kokkos::Experimental::ROCm >();
TestStaticCrsGraph::run_test_graph2< Kokkos::Experimental::ROCm >();
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 0);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 1000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 10000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 100000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 0);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 1000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 10000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 100000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 0);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 1000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 10000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 100000);
}
#if !defined(KOKKOS_ENABLE_ROCM)
// issue 1089
// same as 130203 (MemPool, static member function link issue)
void rocm_test_insert_close( uint32_t num_nodes
, uint32_t num_inserts
, uint32_t num_duplicates
)
{
test_insert< Kokkos::Experimental::ROCm >( num_nodes, num_inserts, num_duplicates, true);
}
// hcc link error , Referencing function in another module!
void rocm_test_insert_far( uint32_t num_nodes
, uint32_t num_inserts
, uint32_t num_duplicates
)
{
test_insert< Kokkos::Experimental::ROCm >( num_nodes, num_inserts, num_duplicates, false);
}
void rocm_test_failed_insert( uint32_t num_nodes )
{
test_failed_insert< Kokkos::Experimental::ROCm >( num_nodes );
}
void rocm_test_deep_copy( uint32_t num_nodes )
{
test_deep_copy< Kokkos::Experimental::ROCm >( num_nodes );
}
void rocm_test_vector_combinations(unsigned int size)
{
test_vector_combinations<int,Kokkos::Experimental::ROCm>(size);
}
void rocm_test_dualview_combinations(unsigned int size)
{
test_dualview_combinations<int,Kokkos::Experimental::ROCm>(size);
}
void rocm_test_bitset()
{
test_bitset<Kokkos::Experimental::ROCm>();
}
/*TEST_F( rocm, bitset )
{
rocm_test_bitset();
}*/
#define ROCM_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat ) \
TEST_F( rocm, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) { \
for (int i=0; i<repeat; ++i) \
rocm_test_insert_##name(num_nodes,num_inserts,num_duplicates); \
}
#define ROCM_FAILED_INSERT_TEST( num_nodes, repeat ) \
TEST_F( rocm, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) { \
for (int i=0; i<repeat; ++i) \
rocm_test_failed_insert(num_nodes); \
}
#define ROCM_ASSIGNEMENT_TEST( num_nodes, repeat ) \
TEST_F( rocm, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) { \
for (int i=0; i<repeat; ++i) \
rocm_test_assignment_operators(num_nodes); \
}
#define ROCM_DEEP_COPY( num_nodes, repeat ) \
TEST_F( rocm, UnorderedMap_deep_copy##num_nodes##_##repeat##x) { \
for (int i=0; i<repeat; ++i) \
rocm_test_deep_copy(num_nodes); \
}
#define ROCM_VECTOR_COMBINE_TEST( size ) \
TEST_F( rocm, vector_combination##size##x) { \
rocm_test_vector_combinations(size); \
}
#define ROCM_DUALVIEW_COMBINE_TEST( size ) \
TEST_F( rocm, dualview_combination##size##x) { \
rocm_test_dualview_combinations(size); \
}
//ROCM_DUALVIEW_COMBINE_TEST( 10 )
//ROCM_VECTOR_COMBINE_TEST( 10 )
//ROCM_VECTOR_COMBINE_TEST( 3057 )
//ROCM_INSERT_TEST(close, 100000, 90000, 100, 500)
//ROCM_INSERT_TEST(far, 100000, 90000, 100, 500)
//ROCM_DEEP_COPY( 10000, 1 )
//ROCM_FAILED_INSERT_TEST( 10000, 1000 )
#undef ROCM_INSERT_TEST
#undef ROCM_FAILED_INSERT_TEST
#undef ROCM_ASSIGNEMENT_TEST
#undef ROCM_DEEP_COPY
#undef ROCM_VECTOR_COMBINE_TEST
#undef ROCM_DUALVIEW_COMBINE_TEST
#endif
#if !defined(KOKKOS_ENABLE_ROCM)
//static member function issue
TEST_F( rocm , dynamic_view )
{
// typedef TestDynamicView< double , Kokkos::ROCmUVMSpace >
typedef TestDynamicView< double , Kokkos::Experimental::ROCmSpace >
TestDynView ;
for ( int i = 0 ; i < 10 ; ++i ) {
TestDynView::run( 100000 + 100 * i );
}
}
#endif
#if defined(KOKKOS_CLASS_LAMBDA)
TEST_F(rocm, ErrorReporterViaLambda)
{
TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::Experimental::ROCm>>();
}
#endif
TEST_F(rocm, ErrorReporter)
{
TestErrorReporter<ErrorReporterDriver<Kokkos::Experimental::ROCm>>();
}
}
#else
void KOKKOS_CONTAINERS_UNIT_TESTS_TESTROCM_PREVENT_EMPTY_LINK_ERROR() {}
#endif /* #ifdef KOKKOS_ENABLE_ROCM */

View File

@ -83,6 +83,16 @@ void run_test_graph()
ASSERT_EQ( (int) hx.entries( j + begin ) , graph[i][j] );
}
}
// Test row view access
for ( size_t i = 0 ; i < LENGTH ; ++i ) {
auto rowView = hx.rowConst(i);
ASSERT_EQ( rowView.length, graph[i].size() );
for ( size_t j = 0 ; j < rowView.length ; ++j ) {
ASSERT_EQ( rowView.colidx( j ) , graph[i][j] );
ASSERT_EQ( rowView( j ) , graph[i][j] );
}
}
}
template< class Space >
@ -182,5 +192,6 @@ void run_test_graph3(size_t B, size_t N)
ASSERT_FALSE((ne>2*((hx.row_map(hx.numRows())+C*hx.numRows())/B))&&(hx.row_block_offsets(i+1)>hx.row_block_offsets(i)+1));
}
}
} /* namespace TestStaticCrsGraph */
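A hedged sketch of the row-view access pattern the new test block checks: rowConst(i) returns a lightweight row view exposing length and colidx(j), as used above (graph construction via create_staticcrsgraph is assumed, matching these tests):

#include <Kokkos_Core.hpp>
#include <Kokkos_StaticCrsGraph.hpp>
#include <vector>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    std::vector<std::vector<int>> graph = {{0, 1}, {1, 2}, {2}};
    auto g = Kokkos::create_staticcrsgraph<
        Kokkos::StaticCrsGraph<int, Kokkos::HostSpace>>("g", graph);
    for (int i = 0; i < (int)g.numRows(); ++i) {
      auto row = g.rowConst(i);             // row view: length, colidx(j)
      for (int j = 0; j < (int)row.length; ++j)
        printf("row %d -> %d\n", i, (int)row.colidx(j));
    }
  }
  Kokkos::finalize();
  return 0;
}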

View File

@ -43,6 +43,7 @@ TEST_TARGETS += test-atomic
#
ifneq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_MEMPOOL = test_mempool.o
TARGETS += KokkosCore_PerformanceTest_Mempool
TEST_TARGETS += test-mempool
@ -52,6 +53,7 @@ TEST_TARGETS += test-mempool
OBJ_TASKDAG = test_taskdag.o
TARGETS += KokkosCore_PerformanceTest_TaskDAG
TEST_TARGETS += test-taskdag
endif
#

View File

@ -63,7 +63,7 @@
#include <typeinfo>
#endif
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {
// ------------------------------------------------------------------ //
@ -110,21 +110,12 @@ struct apply_impl<2,RP,Functor,void >
{
// LL
if (RP::inner_direction == RP::Left) {
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
m_func(i, j);
} }
*/
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1);
}
@ -134,21 +125,12 @@ struct apply_impl<2,RP,Functor,void >
}
// LR
else {
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
m_func(i, j);
} }
*/
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
m_func(offset_0 , offset_1);
}
@ -182,21 +164,12 @@ struct apply_impl<2,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
// Loop over size maxnumblocks until full range covered
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
m_func(Tag(), i, j);
} }
*/
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1);
}
@ -205,21 +178,12 @@ struct apply_impl<2,RP,Functor,Tag>
}
}
else {
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
m_func(Tag(), i, j);
} }
*/
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
m_func(Tag(), offset_0 , offset_1);
}
@ -255,15 +219,15 @@ struct apply_impl<3,RP,Functor,void >
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2);
}
@ -276,15 +240,15 @@ struct apply_impl<3,RP,Functor,void >
// LR
else {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
m_func(offset_0 , offset_1 , offset_2);
}
@ -319,15 +283,15 @@ struct apply_impl<3,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
@ -339,15 +303,15 @@ struct apply_impl<3,RP,Functor,Tag>
}
else {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
@ -398,19 +362,19 @@ struct apply_impl<4,RP,Functor,void >
const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
@ -436,19 +400,19 @@ struct apply_impl<4,RP,Functor,void >
const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
@ -498,19 +462,19 @@ struct apply_impl<4,RP,Functor,Tag>
const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
}
@ -535,19 +499,19 @@ struct apply_impl<4,RP,Functor,Tag>
const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
}
@ -612,23 +576,23 @@ struct apply_impl<5,RP,Functor,void >
const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
@ -667,23 +631,23 @@ struct apply_impl<5,RP,Functor,void >
const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
@ -747,23 +711,23 @@ struct apply_impl<5,RP,Functor,Tag>
const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
@ -802,23 +766,23 @@ struct apply_impl<5,RP,Functor,Tag>
const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
@ -895,27 +859,27 @@ struct apply_impl<6,RP,Functor,void >
const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
@ -967,27 +931,27 @@ struct apply_impl<6,RP,Functor,void >
const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
@ -1064,27 +1028,27 @@ struct apply_impl<6,RP,Functor,Tag>
const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
@ -1136,27 +1100,27 @@ struct apply_impl<6,RP,Functor,Tag>
const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
@ -1292,7 +1256,7 @@ protected:
const Functor m_func;
};
} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl
#endif
#endif
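
Taken together, the hunks above fold the policy's lower bound into every computed offset, so a tile iterates the half-open box [lower, upper) instead of implicitly starting at zero. A minimal sketch of the case this fixes, assuming a CUDA-enabled build with lambda support and hypothetical bounds:

    #include <Kokkos_Core.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        // Iterate i in [2,10) and j in [3,8); with the change above the Cuda
        // backend now adds m_lower[*] when forming each offset.
        Kokkos::MDRangePolicy<Kokkos::Rank<2>> policy({2, 3}, {10, 8});
        Kokkos::parallel_for("shifted_tile", policy,
          KOKKOS_LAMBDA(const int i, const int j) {
            (void)i; (void)j;  // kernel body sees the shifted index pair
          });
        Kokkos::fence();
      }
      Kokkos::finalize();
      return 0;
    }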

View File

@ -63,7 +63,7 @@
#include <typeinfo>
#endif
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {
namespace Refactor {
@ -2709,7 +2709,7 @@ private:
// ----------------------------------------------------------------------------------
} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl
#endif
#endif

View File

@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory()
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
//__launch_bounds__(maxTperB, minBperSM)
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver )
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
//__launch_bounds__(maxTperB, minBperSM)
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
@ -193,9 +193,14 @@ template < class DriverType
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
@ -216,21 +221,28 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>();
cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
@ -240,9 +252,11 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
}
};
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
@ -252,20 +266,136 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
// Fence before changing settings and copying closure
Kokkos::Cuda::fence();
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType >
<<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver );
// Invoke the driver function on the device
cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_local_memory< DriverType >
<<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
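
A sketch of how the reworked specializations get selected, assuming a hypothetical functor and size; LaunchBounds<0,0>, spelled LaunchBounds<>, is the new "no bounds requested" default:

    const int n = 1 << 20;                        // hypothetical problem size
    auto functor = KOKKOS_LAMBDA(const int i) {}; // placeholder body

    // Explicit bounds: selects the specialization that applies
    // __launch_bounds__(256, 4) to the generated kernel.
    using Bounded = Kokkos::RangePolicy<Kokkos::Cuda, Kokkos::LaunchBounds<256, 4>>;
    Kokkos::parallel_for(Bounded(0, n), functor);

    // No bounds given: the LaunchBounds<> specialization is chosen and no
    // __launch_bounds__ attribute is emitted at all.
    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, n), functor);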

View File

@ -713,7 +713,7 @@ SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void *
// Iterate records to print orphaned memory ...
void
SharedAllocationRecord< Kokkos::CudaSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::CudaSpace & , bool detail )
{
SharedAllocationRecord< void , void > * r = & s_root_record ;
@ -751,7 +751,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, head.m_label
);
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
@ -781,7 +781,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
else {
snprintf( buffer , 256 , "Cuda [ 0 + 0 ]\n" );
}
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
@ -789,14 +789,14 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
void
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaUVMSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::CudaUVMSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaUVM" , & s_root_record , detail );
}
void
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
}

View File

@ -421,7 +421,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
std::string msg = ss.str();
Kokkos::abort( msg.c_str() );
}
if ( compiled_major != cudaProp.major || compiled_minor != cudaProp.minor ) {
if ( Kokkos::show_warnings() && (compiled_major != cudaProp.major || compiled_minor != cudaProp.minor) ) {
std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
<< compiled_major << "." << compiled_minor
<< " on device with compute capability "
@ -467,7 +467,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
if ( ! m_scratchUnifiedSupported ) {
if ( Kokkos::show_warnings() && ! m_scratchUnifiedSupported ) {
std::cout << "Kokkos::Cuda device "
<< cudaProp.name << " capability "
<< cudaProp.major << "." << cudaProp.minor
@ -545,7 +545,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
}
#ifdef KOKKOS_ENABLE_CUDA_UVM
if(!cuda_launch_blocking()) {
if( Kokkos::show_warnings() && !cuda_launch_blocking() ) {
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cout << " without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
std::cout << " The code must call Cuda::fence() after each kernel" << std::endl;
@ -561,7 +561,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
bool visible_devices_one=true;
if (env_visible_devices == 0) visible_devices_one=false;
if(!visible_devices_one && !force_device_alloc) {
if( Kokkos::show_warnings() && (!visible_devices_one && !force_device_alloc) ) {
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cout << " without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " << std::endl;
std::cout << " setting CUDA_VISIBLE_DEVICES." << std::endl;

View File

@ -381,12 +381,12 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
typedef Kokkos::MDRangePolicy< Traits ... > Policy ;
using RP = Policy;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
@ -402,7 +402,7 @@ public:
__device__
void operator()(void) const
{
Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
Kokkos::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
}
@ -858,14 +858,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
typedef Kokkos::MDRangePolicy< Traits ... > Policy ;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
@ -898,7 +898,7 @@ public:
size_type * m_scratch_flags ;
size_type * m_unified_space ;
typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
typedef typename Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
@ -913,7 +913,7 @@ public:
void
exec_range( reference_type update ) const
{
Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
}
inline

View File

@ -127,11 +127,11 @@ struct CudaTextureFetch {
template< class CudaMemorySpace >
inline explicit
CudaTextureFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
, Kokkos::Impl::SharedAllocationRecord< CudaMemorySpace , void > * record
)
: m_obj( record.template attach_texture_object< AliasType >() )
: m_obj( record->template attach_texture_object< AliasType >() )
, m_ptr( arg_ptr )
, m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
, m_offset( record->attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
{}
// Texture object spans the entire allocation.
@ -199,7 +199,7 @@ struct CudaLDGFetch {
template< class CudaMemorySpace >
inline explicit
CudaLDGFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
, Kokkos::Impl::SharedAllocationRecord<CudaMemorySpace,void>*
)
: m_ptr( arg_ptr )
{}
@ -285,7 +285,21 @@ public:
// Assignment of texture = non-texture requires creation of a texture object
// which can only occur on the host. In addition, 'get_record' is only valid
// if called in a host execution space
return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
typedef typename Traits::memory_space memory_space ;
typedef typename Impl::SharedAllocationRecord<memory_space,void> record ;
record * const r = arg_tracker.template get_record< memory_space >();
#if ! defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC )
if ( 0 == r ) {
Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
}
#endif
return handle_type( arg_data_ptr , r );
#else
Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
return handle_type();
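
The new null-record abort catches const RandomAccess Views over memory Kokkos did not allocate, since there is no SharedAllocationRecord to attach a texture object to. A hedged sketch of the offending pattern, with a hypothetical size:

    double* raw = nullptr;
    cudaMalloc(&raw, 1024 * sizeof(double));  // memory Kokkos did not allocate

    // Unmanaged view: get_record() yields null, so (without
    // KOKKOS_ENABLE_CUDA_LDG_INTRINSIC) construction now aborts instead of
    // binding a texture object to an allocation it cannot see.
    Kokkos::View<const double*, Kokkos::MemoryTraits<Kokkos::RandomAccess>>
        v(raw, 1024);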

View File

@ -48,48 +48,50 @@ namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::Cuda
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Cuda,
Traits ...
class ParallelFor< FunctorType
, Kokkos::WorkGraphPolicy< Traits ... >
, Kokkos::Cuda
>
{
public:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Cuda, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
typedef ParallelFor<FunctorType, Policy, Kokkos::Cuda> Self ;
private:
template< class TagType >
__device__
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
__device__
__device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
__device__ inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{} ; m_functor( t , w ); }
public:
__device__
inline
void operator()() const {
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
__device__ inline
void operator()() const noexcept
{
if ( 0 == ( threadIdx.y % 16 ) ) {
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
}
@ -108,9 +110,9 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl
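
A sketch of driving the rewritten ParallelFor, assuming a hypothetical Crs graph built elsewhere; each index w reaches the functor only after everything pointing at w has completed:

    using policy_type = Kokkos::WorkGraphPolicy<std::int32_t, Kokkos::Cuda>;
    policy_type::graph_type graph;  // hypothetical: filled elsewhere
    Kokkos::parallel_for("dag", policy_type(graph),
      KOKKOS_LAMBDA(const std::int32_t w) {
        // process work item w; completed_work(w) then releases successors
      });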

View File

@ -55,7 +55,7 @@
#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
#endif
namespace Kokkos { namespace Experimental {
namespace Kokkos {
// ------------------------------------------------------------------ //
@ -331,11 +331,23 @@ struct MDRangePolicy
}
};
} // namespace Kokkos
// For backward compatibility
namespace Kokkos { namespace Experimental {
using Kokkos::MDRangePolicy;
using Kokkos::Rank;
using Kokkos::Iterate;
} } // end Kokkos::Experimental
// ------------------------------------------------------------------ //
// ------------------------------------------------------------------ //
//md_parallel_for - deprecated use parallel_for
// ------------------------------------------------------------------ //
namespace Kokkos { namespace Experimental {
template <typename MDRange, typename Functor, typename Enable = void>
void md_parallel_for( MDRange const& range
, Functor const& f
@ -347,7 +359,7 @@ void md_parallel_for( MDRange const& range
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
@ -365,7 +377,7 @@ void md_parallel_for( const std::string& str
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
@ -385,7 +397,7 @@ void md_parallel_for( const std::string& str
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
@ -400,7 +412,7 @@ void md_parallel_for( MDRange const& range
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
#endif
@ -421,7 +433,7 @@ void md_parallel_reduce( MDRange const& range
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
@ -439,7 +451,7 @@ void md_parallel_reduce( const std::string& str
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
using range_policy = typename MDRange::impl_range_policy;

View File

@ -81,10 +81,10 @@ struct IndexType
/**\brief Specify Launch Bounds for CUDA execution.
*
* The "best" defaults may be architecture specific.
* If no launch bounds specified then do not set launch bounds.
*/
template< unsigned int maxT = 1024 /* Max threads per block */
, unsigned int minB = 1 /* Min blocks per SM */
template< unsigned int maxT = 0 /* Max threads per block */
, unsigned int minB = 0 /* Min blocks per SM */
>
struct LaunchBounds
{
@ -280,6 +280,9 @@ struct MemorySpaceAccess {
enum { deepcopy = assignable };
};
}} // namespace Kokkos::Impl
namespace Kokkos {
/**\brief Can AccessSpace access MemorySpace ?
*
@ -358,6 +361,13 @@ public:
>::type space ;
};
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
using Kokkos::SpaceAccessibility ; // For backward compatibility
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
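
A short sketch of the promoted trait; the Impl alias above keeps old call sites compiling:

    // Compile-time query: can code running in the first space reach memory
    // in the second? Now spelled Kokkos::SpaceAccessibility.
    static_assert(Kokkos::SpaceAccessibility<
                      Kokkos::HostSpace, Kokkos::HostSpace>::accessible,
                  "host code can access host memory");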

View File

@ -99,13 +99,17 @@ struct InitArguments {
int num_threads;
int num_numa;
int device_id;
bool disable_warnings;
InitArguments( int nt = -1
, int nn = -1
, int dv = -1)
: num_threads( nt )
, num_numa( nn )
, device_id( dv )
, int dv = -1
, bool dw = false
)
: num_threads{ nt }
, num_numa{ nn }
, device_id{ dv }
, disable_warnings{ dw }
{}
};
@ -113,6 +117,10 @@ void initialize(int& narg, char* arg[]);
void initialize(const InitArguments& args = InitArguments());
bool is_initialized() noexcept;
bool show_warnings() noexcept;
/** \brief Finalize the spaces that were initialized via Kokkos::initialize */
void finalize();
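
A sketch of the new knob, using only the field added above:

    Kokkos::InitArguments args;    // num_threads / num_numa / device_id stay -1
    args.disable_warnings = true;  // field added in this update
    Kokkos::initialize(args);
    // Kokkos::show_warnings() now returns false, silencing the
    // initialization warnings gated on it elsewhere in this commit.
    Kokkos::finalize();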

View File

@ -45,7 +45,6 @@
#define KOKKOS_CRS_HPP
namespace Kokkos {
namespace Experimental {
/// \class Crs
/// \brief Compressed row storage array.
@ -164,7 +163,7 @@ void transpose_crs(
Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in);
}} // namespace Kokkos::Experimental
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
@ -172,7 +171,6 @@ void transpose_crs(
namespace Kokkos {
namespace Impl {
namespace Experimental {
template <class InCrs, class OutCounts>
class GetCrsTransposeCounts {
@ -277,14 +275,13 @@ class FillCrsTransposeEntries {
}
};
}}} // namespace Kokkos::Impl::Experimental
}} // namespace Kokkos::Impl
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
template< class OutCounts,
class DataType,
@ -297,8 +294,7 @@ void get_crs_transpose_counts(
std::string const& name) {
using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>;
out = OutCounts(name, in.numRows());
Kokkos::Impl::Experimental::
GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
Kokkos::Impl::GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
}
template< class OutRowMap,
@ -308,8 +304,7 @@ typename OutRowMap::value_type get_crs_row_map_from_counts(
InCounts const& in,
std::string const& name) {
out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
Kokkos::Impl::Experimental::
CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
Kokkos::Impl::CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
return functor.execute();
}
@ -326,32 +321,37 @@ void transpose_crs(
typedef View<SizeType*, memory_space> counts_type ;
{
counts_type counts;
Kokkos::Experimental::get_crs_transpose_counts(counts, in);
Kokkos::Experimental::get_crs_row_map_from_counts(out.row_map, counts,
Kokkos::get_crs_transpose_counts(counts, in);
Kokkos::get_crs_row_map_from_counts(out.row_map, counts,
"tranpose_row_map");
}
out.entries = decltype(out.entries)("transpose_entries", in.entries.size());
Kokkos::Impl::Experimental::
Kokkos::Impl::
FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
}
template< class CrsType,
class Functor>
struct CountAndFill {
class Functor,
class ExecutionSpace = typename CrsType::execution_space>
struct CountAndFillBase;
template< class CrsType,
class Functor,
class ExecutionSpace>
struct CountAndFillBase {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using entries_type = typename CrsType::entries_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
KOKKOS_INLINE_FUNCTION void operator()(Count, size_type i) const {
inline void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
inline void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
/* we don't want to access entries(entries.size()), even if its just to get its
address and never use it.
@ -363,13 +363,63 @@ struct CountAndFill {
nullptr : (&(m_crs.entries(j)));
m_functor(i, fill);
}
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
CountAndFillBase(CrsType& crs, Functor const& f):
m_crs(crs),
m_functor(f)
{}
};
#if defined( KOKKOS_ENABLE_CUDA )
template< class CrsType,
class Functor>
struct CountAndFillBase<CrsType, Functor, Kokkos::Cuda> {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
__device__ inline void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
__device__ inline void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
/* we don't want to access entries(entries.size()), even if its just to get its
address and never use it.
this can happen when row (i) is empty and all rows after it are also empty.
we could compare to row_map(i + 1), but that is a read from global memory,
whereas dimension_0() should be part of the View in registers (or constant memory) */
data_type* fill =
(j == static_cast<decltype(j)>(m_crs.entries.dimension_0())) ?
nullptr : (&(m_crs.entries(j)));
m_functor(i, fill);
}
CountAndFillBase(CrsType& crs, Functor const& f):
m_crs(crs),
m_functor(f)
{}
};
#endif
template< class CrsType,
class Functor>
struct CountAndFill : public CountAndFillBase<CrsType, Functor> {
using base_type = CountAndFillBase<CrsType, Functor>;
using typename base_type::data_type;
using typename base_type::size_type;
using typename base_type::counts_type;
using typename base_type::Count;
using typename base_type::Fill;
using entries_type = typename CrsType::entries_type;
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
base_type(crs, f)
{
using execution_space = typename CrsType::execution_space;
m_counts = counts_type("counts", nrows);
this->m_counts = counts_type("counts", nrows);
{
using count_policy_type = RangePolicy<size_type, execution_space, Count>;
using count_closure_type =
@ -377,10 +427,10 @@ struct CountAndFill {
const count_closure_type closure(*this, count_policy_type(0, nrows));
closure.execute();
}
auto nentries = Kokkos::Experimental::
get_crs_row_map_from_counts(m_crs.row_map, m_counts);
m_counts = counts_type();
m_crs.entries = entries_type("entries", nentries);
auto nentries = Kokkos::
get_crs_row_map_from_counts(this->m_crs.row_map, this->m_counts);
this->m_counts = counts_type();
this->m_crs.entries = entries_type("entries", nentries);
{
using fill_policy_type = RangePolicy<size_type, execution_space, Fill>;
using fill_closure_type =
@ -388,7 +438,7 @@ struct CountAndFill {
const fill_closure_type closure(*this, fill_policy_type(0, nrows));
closure.execute();
}
crs = m_crs;
crs = this->m_crs;
}
};
@ -398,9 +448,9 @@ void count_and_fill_crs(
CrsType& crs,
typename CrsType::size_type nrows,
Functor const& f) {
Kokkos::Experimental::CountAndFill<CrsType, Functor>(crs, nrows, f);
Kokkos::CountAndFill<CrsType, Functor>(crs, nrows, f);
}
}} // namespace Kokkos::Experimental
} // namespace Kokkos
#endif /* #define KOKKOS_CRS_HPP */
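
A sketch of the two-pass protocol CountAndFill implements; the functor is invoked once per row with fill == nullptr to count, then again with a pointer into the freshly sized entries array (hypothetical lambda and row count):

    using crs_type = Kokkos::Crs<int, Kokkos::DefaultExecutionSpace, void, int>;
    crs_type crs;
    const int nrows = 4;  // hypothetical
    Kokkos::count_and_fill_crs(crs, nrows,
      KOKKOS_LAMBDA(int row, int* fill) -> int {
        if (fill) {    // second pass: write this row's entries
          fill[0] = row;
          fill[1] = row + 1;
        }
        return 2;      // first pass: entry count for this row
      });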

View File

@ -379,12 +379,13 @@ Impl::PerThreadValue PerThread(const int& arg);
* uses variadic templates. Each and any of the template arguments can
* be omitted.
*
* Possible Template arguments and there default values:
* Possible Template arguments and their default values:
* ExecutionSpace (DefaultExecutionSpace): where to execute code. Must be enabled.
* WorkTag (none): Tag which is used as the first argument for the functor operator.
* Schedule<Type> (Schedule<Static>): Scheduling Policy (Dynamic, or Static).
* IndexType<Type> (IndexType<ExecutionSpace::size_type>: Integer Index type used to iterate over the Index space.
* LaunchBounds<int,int> (LaunchBounds<1024,1>: Launch Bounds for CUDA compilation.
* LaunchBounds<unsigned,unsigned> Launch Bounds for CUDA compilation,
* default of LaunchBounds<0,0> indicates no launch bounds specified.
*/
template< class ... Properties>
class TeamPolicy: public
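
Read together with the LaunchBounds change earlier in this commit, omitting the property now means "no launch bounds" rather than the old <1024,1>. A hedged sketch with a hypothetical league size:

    // Explicit bounds for a team kernel; the unspecified-bounds default is
    // now LaunchBounds<0,0> instead of LaunchBounds<1024,1>.
    Kokkos::TeamPolicy<Kokkos::LaunchBounds<128, 2>> policy(64, Kokkos::AUTO);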

View File

@ -274,18 +274,14 @@
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
#endif
#if ! defined( KOKKOS_MEMORY_ALIGNMENT )
#define KOKKOS_MEMORY_ALIGNMENT 64
#endif
#define KOKKOS_RESTRICT __restrict__
#ifndef KOKKOS_ALIGN
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#endif
#ifndef KOKKOS_ALIGN_PTR
#define KOKKOS_ALIGN_PTR(size) __attribute__((align_value(size)))
#endif
#ifndef KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 64
#ifndef KOKKOS_IMPL_ALIGN_PTR
#define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((align_value(size)))
#endif
#if ( 1400 > KOKKOS_COMPILER_INTEL )
@ -351,6 +347,11 @@
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#if !defined( KOKKOS_IMPL_ALIGN_PTR )
#define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((aligned(size)))
#endif
#endif
//----------------------------------------------------------------------------
@ -426,16 +427,16 @@
//----------------------------------------------------------------------------
// Define Macro for alignment:
#if !defined KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 16
#if ! defined( KOKKOS_MEMORY_ALIGNMENT )
#define KOKKOS_MEMORY_ALIGNMENT 16
#endif
#if !defined( KOKKOS_ALIGN )
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#if ! defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
#define KOKKOS_MEMORY_ALIGNMENT_THRESHOLD 4
#endif
#if !defined( KOKKOS_ALIGN_PTR )
#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
#if !defined( KOKKOS_IMPL_ALIGN_PTR )
#define KOKKOS_IMPL_ALIGN_PTR(size) /* */
#endif
//----------------------------------------------------------------------------

View File

@ -111,6 +111,10 @@ private:
public:
/**\brief The maximum size of a superblock and block */
enum : uint32_t { max_superblock_size = 1LU << 31 /* 2 gigabytes */ };
enum : uint32_t { max_block_per_superblock = max_bit_count };
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
@ -206,7 +210,7 @@ public:
const uint32_t * sb_state_ptr = sb_state_array ;
s << "pool_size(" << ( size_t(m_sb_count) << m_sb_size_lg2 ) << ")"
<< " superblock_size(" << ( 1 << m_sb_size_lg2 ) << ")" << std::endl ;
<< " superblock_size(" << ( 1LU << m_sb_size_lg2 ) << ")" << std::endl ;
for ( int32_t i = 0 ; i < m_sb_count
; ++i , sb_state_ptr += m_sb_state_size ) {
@ -215,7 +219,7 @@ public:
const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
const uint32_t block_count = 1 << block_count_lg2 ;
const uint32_t block_count = 1u << block_count_lg2 ;
const uint32_t block_used = (*sb_state_ptr) & state_used_mask ;
s << "Superblock[ " << i << " / " << m_sb_count << " ] {"
@ -284,43 +288,87 @@ public:
{
const uint32_t int_align_lg2 = 3 ; /* align as int[8] */
const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
const uint32_t default_min_block_size = 1u << 6 ; /* 64 bytes */
const uint32_t default_max_block_size = 1u << 12 ;/* 4k bytes */
const uint32_t default_min_superblock_size = 1u << 20 ;/* 1M bytes */
// Constraints and defaults:
// min_block_alloc_size <= max_block_alloc_size
// max_block_alloc_size <= min_superblock_size
// min_superblock_size <= min_total_alloc_size
//--------------------------------------------------
// Default block and superblock sizes:
const uint32_t MIN_BLOCK_SIZE = 1u << 6 /* 64 bytes */ ;
const uint32_t MAX_BLOCK_SIZE = 1u << 12 /* 4k bytes */ ;
if ( 0 == min_block_alloc_size ) {
// Default all sizes:
if ( 0 == min_block_alloc_size ) min_block_alloc_size = MIN_BLOCK_SIZE ;
min_superblock_size =
std::min( size_t(default_min_superblock_size)
, min_total_alloc_size );
min_block_alloc_size =
std::min( size_t(default_min_block_size)
, min_superblock_size );
max_block_alloc_size =
std::min( size_t(default_max_block_size)
, min_superblock_size );
}
else if ( 0 == min_superblock_size ) {
// Choose superblock size as minimum of:
// max_block_per_superblock * min_block_size
// max_superblock_size
// min_total_alloc_size
const size_t max_superblock =
min_block_alloc_size * max_block_per_superblock ;
min_superblock_size =
std::min( max_superblock ,
std::min( size_t(max_superblock_size)
, min_total_alloc_size ) );
}
if ( 0 == max_block_alloc_size ) {
max_block_alloc_size = MAX_BLOCK_SIZE ;
// Upper bound of total allocation size
max_block_alloc_size = std::min( size_t(max_block_alloc_size)
, min_total_alloc_size );
// Lower bound of minimum block size
max_block_alloc_size = std::max( max_block_alloc_size
, min_block_alloc_size );
max_block_alloc_size = min_superblock_size ;
}
if ( 0 == min_superblock_size ) {
min_superblock_size = max_block_alloc_size ;
//--------------------------------------------------
// Upper bound of total allocation size
min_superblock_size = std::min( size_t(min_superblock_size)
, min_total_alloc_size );
{
/* Enforce size constraints:
* min_block_alloc_size <= max_block_alloc_size
* max_block_alloc_size <= min_superblock_size
* min_superblock_size <= max_superblock_size
* min_superblock_size <= min_total_alloc_size
* min_superblock_size <= min_block_alloc_size *
* max_block_per_superblock
*/
// Lower bound of maximum block size
min_superblock_size = std::max( min_superblock_size
, max_block_alloc_size );
const size_t max_superblock =
min_block_alloc_size * max_block_per_superblock ;
if ( ( size_t(max_superblock_size) < min_superblock_size ) ||
( min_total_alloc_size < min_superblock_size ) ||
( max_superblock < min_superblock_size ) ||
( min_superblock_size < max_block_alloc_size ) ||
( max_block_alloc_size < min_block_alloc_size ) ) {
#if 1
printf( " MemoryPool min_block_alloc_size(%ld) max_block_alloc_size(%ld) min_superblock_size(%ld) min_total_alloc_size(%ld) ; max_superblock_size(%ld) max_block_per_superblock(%ld)\n"
, min_block_alloc_size
, max_block_alloc_size
, min_superblock_size
, min_total_alloc_size
, size_t(max_superblock_size)
, size_t(max_block_per_superblock)
);
#endif
Kokkos::abort("Kokkos MemoryPool size constraint violation");
}
}
//--------------------------------------------------
// Block and superblock size is power of two:
// Maximum value is 'max_superblock_size'
m_min_block_size_lg2 =
Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
@ -331,36 +379,16 @@ public:
m_sb_size_lg2 =
Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);
// Constraints:
// m_min_block_size_lg2 <= m_max_block_size_lg2 <= m_sb_size_lg2
// m_sb_size_lg2 <= m_min_block_size + max_bit_count_lg2
if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_sb_size_lg2 ) {
m_min_block_size_lg2 = m_sb_size_lg2 - max_bit_count_lg2 ;
}
if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_max_block_size_lg2 ) {
m_min_block_size_lg2 = m_max_block_size_lg2 - max_bit_count_lg2 ;
}
if ( m_max_block_size_lg2 < m_min_block_size_lg2 ) {
m_max_block_size_lg2 = m_min_block_size_lg2 ;
}
if ( m_sb_size_lg2 < m_max_block_size_lg2 ) {
m_sb_size_lg2 = m_max_block_size_lg2 ;
}
// At least 32 minimum size blocks in a superblock
if ( m_sb_size_lg2 < m_min_block_size_lg2 + 5 ) {
m_sb_size_lg2 = m_min_block_size_lg2 + 5 ;
}
{
// number of superblocks is multiple of superblock size that
// can hold min_total_alloc_size.
const uint32_t sb_size_mask = ( 1u << m_sb_size_lg2 ) - 1 ;
const uint64_t sb_size_mask = ( 1LU << m_sb_size_lg2 ) - 1 ;
m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;
}
{
// Any superblock can be assigned to the smallest size block
// Size the block bitset to maximum number of blocks
@ -369,6 +397,7 @@ public:
m_sb_state_size =
( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;
}
// Array of all superblock states
@ -454,7 +483,7 @@ private:
* Restrict lower bound to minimum block size.
*/
KOKKOS_FORCEINLINE_FUNCTION
unsigned get_block_size_lg2( unsigned n ) const noexcept
uint32_t get_block_size_lg2( uint32_t n ) const noexcept
{
const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains( n );
@ -463,11 +492,12 @@ private:
public:
/* Return 0 for invalid block size */
KOKKOS_INLINE_FUNCTION
uint32_t allocate_block_size( uint32_t alloc_size ) const noexcept
uint32_t allocate_block_size( uint64_t alloc_size ) const noexcept
{
return alloc_size <= (1UL << m_max_block_size_lg2)
? ( 1u << get_block_size_lg2( alloc_size ) )
? ( 1UL << get_block_size_lg2( uint32_t(alloc_size) ) )
: 0 ;
}
@ -485,14 +515,16 @@ public:
void * allocate( size_t alloc_size
, int32_t attempt_limit = 1 ) const noexcept
{
if ( size_t(1LU << m_max_block_size_lg2) < alloc_size ) {
Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
}
if ( 0 == alloc_size ) return (void*) 0 ;
void * p = 0 ;
const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
if ( block_size_lg2 <= m_max_block_size_lg2 ) {
// Allocation will fit within a superblock
// that has block sizes ( 1 << block_size_lg2 )
@ -577,14 +609,24 @@ public:
// Set the allocated block pointer
p = ((char*)( m_sb_state_array + m_data_offset ))
+ ( uint32_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+ ( result.first << size_lg2 ); // block memory
+ ( uint64_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+ ( uint64_t(result.first) << size_lg2 ); // block memory
#if 0
printf( " MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, alloc_size
, sb_id
, sb_state
, (1u << size_lg2)
, (1u << count_lg2)
, result.first
, result.second );
#endif
break ; // Success
}
// printf(" acquire count_lg2(%d) sb_state(0x%x) sb_id(%d) result(%d,%d)\n" , count_lg2 , sb_state , sb_id , result.first , result.second );
}
//------------------------------------------------------------------
// Arrive here if failed to acquire a block.
@ -719,12 +761,7 @@ public:
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
} // end allocation attempt loop
//--------------------------------------------------------------------
}
else {
Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
}
return p ;
}
@ -765,7 +802,7 @@ public:
const uint32_t block_size_lg2 =
m_sb_size_lg2 - ( block_state >> state_shift );
ok_block_aligned = 0 == ( d & ( ( 1 << block_size_lg2 ) - 1 ) );
ok_block_aligned = 0 == ( d & ( ( 1UL << block_size_lg2 ) - 1 ) );
if ( ok_block_aligned ) {
@ -773,31 +810,70 @@ public:
// mask into superblock and then shift down for block index
const uint32_t bit =
( d & ( ptrdiff_t( 1 << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
( d & ( ptrdiff_t( 1LU << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
const int result =
CB::release( sb_state_array , bit , block_state );
ok_dealloc_once = 0 <= result ;
// printf(" deallocate from sb_id(%d) result(%d) bit(%d) state(0x%x)\n"
// , sb_id
// , result
// , uint32_t(d >> block_size_lg2)
// , *sb_state_array );
#if 0
printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, sb_id
, (1u << block_size_lg2)
, (1u << (m_sb_size_lg2 - block_size_lg2))
, bit
, result );
#endif
}
}
if ( ! ok_contains || ! ok_block_aligned || ! ok_dealloc_once ) {
#if 0
printf("Kokkos MemoryPool deallocate(0x%lx) contains(%d) block_aligned(%d) dealloc_once(%d)\n",(uintptr_t)p,ok_contains,ok_block_aligned,ok_dealloc_once);
printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, int(ok_contains)
, int(ok_block_aligned)
, int(ok_dealloc_once) );
#endif
Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
}
}
// end deallocate
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
int number_of_superblocks() const noexcept { return m_sb_count ; }
KOKKOS_INLINE_FUNCTION
void superblock_state( int sb_id
, int & block_size
, int & block_count_capacity
, int & block_count_used ) const noexcept
{
block_size = 0 ;
block_count_capacity = 0 ;
block_count_used = 0 ;
if ( Kokkos::Impl::MemorySpaceAccess
< Kokkos::Impl::ActiveExecutionMemorySpace
, base_memory_space >::accessible ) {
// Can access the state array
const uint32_t state =
((uint32_t volatile *)m_sb_state_array)[sb_id*m_sb_state_size];
const uint32_t block_count_lg2 = state >> state_shift ;
const uint32_t block_used = state & state_used_mask ;
block_size = 1LU << ( m_sb_size_lg2 - block_count_lg2 );
block_count_capacity = 1LU << block_count_lg2 ;
block_count_used = block_used ;
}
}
};
} // namespace Kokkos
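
A sketch that exercises the constraints and the new introspection hook, assuming hypothetical sizes; zero-valued size arguments pick the defaults described above:

    using memory_space = Kokkos::DefaultExecutionSpace::memory_space;

    // 64 MB total; block and superblock sizes fall back to the documented
    // defaults (64 B min block, 4 KB max block, capped superblock).
    Kokkos::MemoryPool<Kokkos::DefaultExecutionSpace>
        pool(memory_space(), 64u << 20);

    void* p = pool.allocate(512);  // served from a power-of-two sized block
    pool.deallocate(p, 512);

    int block_size, capacity, used;
    pool.superblock_state(0, block_size, capacity, used);  // new in this commit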

View File

@ -97,26 +97,22 @@ typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryR
namespace Kokkos {
namespace Impl {
static_assert(
( 0 < int(KOKKOS_MEMORY_ALIGNMENT) ) &&
( 0 == ( int(KOKKOS_MEMORY_ALIGNMENT) & (int(KOKKOS_MEMORY_ALIGNMENT)-1))) ,
"KOKKOS_MEMORY_ALIGNMENT must be a power of two" );
/** \brief Memory alignment settings
*
* Sets global value for memory alignment. Must be a power of two!
* Enable compatibility of views from different devices with static stride.
* Use compiler flag to enable overwrites.
*/
enum { MEMORY_ALIGNMENT =
#if defined( KOKKOS_MEMORY_ALIGNMENT )
( 1 << Kokkos::Impl::integral_power_of_two( KOKKOS_MEMORY_ALIGNMENT ) )
#else
( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
#endif
#if defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
enum : unsigned
{ MEMORY_ALIGNMENT = KOKKOS_MEMORY_ALIGNMENT
, MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
#else
, MEMORY_ALIGNMENT_THRESHOLD = 4
#endif
};
} //namespace Impl
} // namespace Kokkos
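
The static_assert leans on the classic bit trick: x & (x - 1) clears the lowest set bit, so the result is zero exactly when x is a power of two. A standalone sketch:

    #include <cstdio>

    constexpr bool is_pow2(unsigned x) { return x != 0 && (x & (x - 1)) == 0; }

    static_assert(is_pow2(64), "64 = 2^6 would pass the alignment check");
    static_assert(!is_pow2(48), "48 has two set bits and would be rejected");

    int main() {
      std::printf("64 -> %d, 48 -> %d\n", is_pow2(64), is_pow2(48));
      return 0;
    }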

View File

@ -204,8 +204,8 @@ struct VerifyExecutionCanAccessMemorySpace
>
{
enum { value = false };
inline static void verify( void ) { Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Experimental::ROCmSpace::access_error(p); }
inline static void verify( void ) { Kokkos::Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Kokkos::Experimental::ROCmSpace::access_error(p); }
};
} // namespace Experimental
} // namespace Kokkos

View File

@ -619,16 +619,16 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::MDRangePolicy< Traits ... > ,
Kokkos::MDRangePolicy< Traits ... > ,
Kokkos::Serial
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
@ -661,14 +661,14 @@ public:
template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Serial
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -686,7 +686,7 @@ private:
typedef typename Analysis::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType

View File

@ -408,7 +408,7 @@ view_alloc( Args const & ... args )
}
template< class ... Args >
inline
KOKKOS_INLINE_FUNCTION
Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
view_wrap( Args const & ... args )
{
@ -1216,6 +1216,13 @@ public:
m_track.assign_allocated_record_to_uninitialized( record );
}
KOKKOS_INLINE_FUNCTION
void assign_data( pointer_type arg_data )
{
m_track.clear();
m_map.assign_data( arg_data );
}
// Wrap memory according to properties and array layout
template< class ... P >
explicit KOKKOS_INLINE_FUNCTION
@ -2235,6 +2242,29 @@ create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout());
}
// Create a mirror view and deep_copy in a new space (specialization for same space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
(void)name;
return src;
}
// Create a mirror view and deep_copy in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
using Mirror = typename Impl::MirrorViewType<Space,T,P ...>::view_type;
std::string label = name.empty() ? src.label() : name;
auto mirror = Mirror(ViewAllocateWithoutInitializing(label), src.layout());
deep_copy(mirror, src);
return mirror;
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
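
A sketch of the convenience function just added, with a hypothetical view:

    Kokkos::View<double*> a("a", 1000);  // lives in the default memory space

    // Allocates an uninitialized HostSpace mirror and deep_copies 'a' into
    // it; had 'a' already been host-resident, the same-space overload would
    // return 'a' itself with no allocation or copy.
    auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a);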
@ -2432,6 +2462,7 @@ struct CommonViewAllocProp< void, ValueType >
using scalar_array_type = ValueType;
template < class ... Views >
KOKKOS_INLINE_FUNCTION
CommonViewAllocProp( const Views & ... ) {}
};
@ -2499,6 +2530,7 @@ using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp<Views...
// User function
template < class ... Views >
KOKKOS_INLINE_FUNCTION
DeducedCommonPropsType<Views...>
common_view_alloc_prop( Views const & ... views )
{

View File

@ -46,15 +46,13 @@
namespace Kokkos {
namespace Impl {
namespace Experimental {
template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec;
}}} // namespace Kokkos::Impl::Experimental
}} // namespace Kokkos::Impl
namespace Kokkos {
namespace Experimental {
template< class ... Properties >
class WorkGraphPolicy
@ -64,187 +62,182 @@ public:
using self_type = WorkGraphPolicy<Properties ... >;
using traits = Kokkos::Impl::PolicyTraits<Properties ... >;
using index_type = typename traits::index_type;
using execution_space = typename traits::execution_space;
using work_tag = typename traits::work_tag;
using memory_space = typename execution_space::memory_space;
using graph_type = Kokkos::Experimental::Crs<index_type, execution_space, void, index_type>;
using member_type = index_type;
using work_tag = typename traits::work_tag;
using execution_space = typename traits::execution_space;
using memory_space = typename execution_space::memory_space;
using graph_type = Kokkos::Crs<index_type,execution_space,void,index_type>;
enum : std::int32_t {
END_TOKEN = -1 ,
BEGIN_TOKEN = -2 ,
COMPLETED_TOKEN = -3 };
private:
graph_type m_graph;
using ints_type = Kokkos::View<std::int32_t*, memory_space>;
using range_type = Kokkos::pair<std::int32_t, std::int32_t>;
using ranges_type = Kokkos::View<range_type*, memory_space>;
const std::int32_t m_total_work;
ints_type m_counts;
// Let N = m_graph.numRows(), the total work
// m_queue[ 0 .. N-1] = the ready queue
// m_queue[ N .. 2*N-1] = the waiting queue counts
// m_queue[2*N .. 2*N+1] = the ready queue hints
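// For a hypothetical N = 3 graph the layout is
//   [ r0 r1 r2 | c0 c1 c2 | begin_hint end_hint ]
// with ready slots initialized to END_TOKEN and counts to zero.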
graph_type const m_graph;
ints_type m_queue ;
ranges_type m_ranges;
public:
struct TagZeroRanges {};
KOKKOS_INLINE_FUNCTION
void operator()(TagZeroRanges, std::int32_t i) const {
m_ranges[i] = range_type(0, 0);
}
void zero_ranges() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagZeroRanges>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, 1));
closure.execute();
execution_space::fence();
}
struct TagFillQueue {};
KOKKOS_INLINE_FUNCTION
void operator()(TagFillQueue, std::int32_t i) const {
if (*((volatile std::int32_t*)(&m_counts(i))) == 0) push_work(i);
}
void fill_queue() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagFillQueue>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, m_total_work));
closure.execute();
execution_space::fence();
}
private:
inline
void setup() {
if (m_graph.numRows() > std::numeric_limits<std::int32_t>::max()) {
Kokkos::abort("WorkGraphPolicy work must be indexable using int32_t");
}
get_crs_transpose_counts(m_counts, m_graph);
m_queue = ints_type(ViewAllocateWithoutInitializing("queue"), m_total_work);
deep_copy(m_queue, std::int32_t(-1));
m_ranges = ranges_type("ranges", 1);
fill_queue();
}
KOKKOS_INLINE_FUNCTION
std::int32_t pop_work() const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first + 1 , w.second );
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
if ( w.first < w.second ) { // there was work in the queue
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
// we got a work item
std::int32_t i;
// the push_work function may have incremented the end counter
// but not yet written the work index into the queue.
// wait until the entry is valid.
while ( -1 == ( i = *((volatile std::int32_t*)(&m_queue( w.first ))) ) );
return i;
} // we got a work item
} else { // there was no work in the queue
#ifdef KOKKOS_DEBUG
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
Kokkos::abort("bug in pop_work");
}
#endif
if (w.first == m_total_work) { // all work is done
return -1;
} else { // need to wait for more work to be pushed
// take a guess that one work item will be pushed
// the key thing is we can't leave (w) alone, because
// otherwise the next compare_exchange may succeed in
// popping work from an empty queue
w.second++;
}
} // there was no work in the queue
} // while (true)
void push_work( const std::int32_t w ) const noexcept
{
const std::int32_t N = m_graph.numRows();
std::int32_t volatile * const ready_queue = & m_queue[0] ;
std::int32_t volatile * const end_hint = & m_queue[2*N+1] ;
// Push work to end of queue
const std::int32_t j = atomic_fetch_add( end_hint , 1 );
if ( ( N <= j ) ||
( END_TOKEN != atomic_exchange(ready_queue+j,w) ) ) {
// ERROR: past the end of queue or did not replace END_TOKEN
Kokkos::abort("WorkGraphPolicy push_work error");
}
KOKKOS_INLINE_FUNCTION
void push_work(std::int32_t i) const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first , w.second + 1 );
// try to increment the end counter
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
// stop trying if the increment was successful
if ( w.first == w_new.first && w.second + 1 == w_new.second ) break;
}
// write the work index into the claimed spot in the queue
*((volatile std::int32_t*)(&m_queue( w.second ))) = i;
// push this write out into the memory system
memory_fence();
}
template< class functor_type , class execution_space, class ... policy_args >
friend class Kokkos::Impl::Experimental::WorkGraphExec;
public:
WorkGraphPolicy(graph_type arg_graph)
/**\brief Attempt to pop the work item at the head of the queue.
*
* Find entry 'i' such that
* ( m_queue[i] != BEGIN_TOKEN ) AND
* ( i == 0 OR m_queue[i-1] == BEGIN_TOKEN )
* if found then
* increment begin hint
* return atomic_exchange( m_queue[i] , BEGIN_TOKEN )
* else if i < total work
* return END_TOKEN
* else
* return COMPLETED_TOKEN
*
*/
KOKKOS_INLINE_FUNCTION
std::int32_t pop_work() const noexcept
{
const std::int32_t N = m_graph.numRows();
std::int32_t volatile * const ready_queue = & m_queue[0] ;
std::int32_t volatile * const begin_hint = & m_queue[2*N] ;
// begin hint is guaranteed to be less than or equal to
// actual begin location in the queue.
for ( std::int32_t i = *begin_hint ; i < N ; ++i ) {
const std::int32_t w = ready_queue[i] ;
if ( w == END_TOKEN ) { return END_TOKEN ; }
if ( ( w != BEGIN_TOKEN ) &&
( w == atomic_compare_exchange(ready_queue+i,w,BEGIN_TOKEN) ) ) {
// Attempt to claim ready work index succeeded,
// update the hint and return work index
atomic_increment( begin_hint );
return w ;
}
// arrive here when ready_queue[i] == BEGIN_TOKEN
}
return COMPLETED_TOKEN ;
}
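// Editor's walk-through (illustration, not part of the source): for a 3-row
// chain graph 0 -> 1 -> 2, the TagInit/TagCount/TagReady passes below leave
//   ready_queue = { 0, END_TOKEN, END_TOKEN }   count_queue = { 0, 1, 1 }
// (row 0 has no predecessors, so TagReady pushed it). A single worker then
// drives the token protocol documented above as:
//   pop_work()        -> 0                claims entry 0, bumps begin hint
//   completed_work(0) -> count of row 1 hits 0, push_work(1)
//   pop_work()        -> 1, completed_work(1) releases row 2
//   pop_work()        -> 2, completed_work(2) has no successors
//   pop_work()        -> COMPLETED_TOKEN  (begin hint reached m_total_work)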
KOKKOS_INLINE_FUNCTION
void completed_work( std::int32_t w ) const noexcept
{
Kokkos::memory_fence();
// Make sure the completed work function's memory accesses are flushed.
const std::int32_t N = m_graph.numRows();
std::int32_t volatile * const count_queue = & m_queue[N] ;
const std::int32_t B = m_graph.row_map(w);
const std::int32_t E = m_graph.row_map(w+1);
for ( std::int32_t i = B ; i < E ; ++i ) {
const std::int32_t j = m_graph.entries(i);
if ( 1 == atomic_fetch_add(count_queue+j,-1) ) {
push_work(j);
}
}
}
struct TagInit {};
struct TagCount {};
struct TagReady {};
/**\brief Initialize queue
*
* m_queue[0..N-1] = END_TOKEN, the ready queue
* m_queue[N..2*N-1] = 0, the waiting count queue
* m_queue[2*N..2*N+1] = 0, begin/end hints for ready queue
*/
KOKKOS_INLINE_FUNCTION
void operator()( const TagInit , int i ) const noexcept
{ m_queue[i] = i < m_graph.numRows() ? END_TOKEN : 0 ; }
KOKKOS_INLINE_FUNCTION
void operator()( const TagCount , int i ) const noexcept
{
std::int32_t volatile * const count_queue =
& m_queue[ m_graph.numRows() ] ;
atomic_increment( count_queue + m_graph.entries[i] );
}
KOKKOS_INLINE_FUNCTION
void operator()( const TagReady , int w ) const noexcept
{
std::int32_t const * const count_queue =
& m_queue[ m_graph.numRows() ] ;
if ( 0 == count_queue[w] ) push_work(w);
}
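// Editor's illustration of the layout documented above, for N = 3 rows
// immediately after TagInit (one allocation of length 2*N+2 = 8):
//
//   index   :  0    1    2    3  4  5    6      7
//   content :  END  END  END  0  0  0    0      0
//   role    :  [ ready queue ][ counts ][begin][end] hints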
WorkGraphPolicy( const graph_type & arg_graph )
: m_graph(arg_graph)
, m_total_work( arg_graph.numRows() )
, m_queue( view_alloc( "queue" , WithoutInitializing )
, arg_graph.numRows() * 2 + 2 )
{
setup();
{ // Initialize
using policy_type = RangePolicy<std::int32_t, execution_space, TagInit>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, m_queue.size()));
closure.execute();
execution_space::fence();
}
};
}} // namespace Kokkos::Experimental
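// Editor's usage sketch (not part of this commit): launching a dependence
// graph through the promoted policy. The Crs type matches the unit test later
// in this diff; the template-argument order of the policy is my assumption.
void example_work_graph_launch()
{
  using Space   = Kokkos::DefaultExecutionSpace;
  using graph_t = Kokkos::Crs< std::int32_t, Space, void, std::int32_t >;
  graph_t graph; // assume filled: graph.entries lists each row's successors
  Kokkos::parallel_for( Kokkos::WorkGraphPolicy< std::int32_t, Space >( graph ),
    KOKKOS_LAMBDA( const std::int32_t w ) {
      // process item w; completed_work() then releases its successors
    } );
}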
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace Experimental {
template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec
{
public:
using self_type = WorkGraphExec< functor_type, execution_space, policy_args ... >;
using policy_type = Kokkos::Experimental::WorkGraphPolicy< policy_args ... >;
using member_type = typename policy_type::member_type;
using memory_space = typename execution_space::memory_space;
protected:
const functor_type m_functor;
const policy_type m_policy;
protected:
KOKKOS_INLINE_FUNCTION
std::int32_t before_work() const {
return m_policy.pop_work();
{ // execute-after counts
using policy_type = RangePolicy<std::int32_t, execution_space, TagCount>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this,policy_type(0,m_graph.entries.size()));
closure.execute();
execution_space::fence();
}
KOKKOS_INLINE_FUNCTION
void after_work(std::int32_t i) const {
/* fence any writes that were done by the work item itself
(usually writing its result to global memory) */
memory_fence();
const std::int32_t begin = m_policy.m_graph.row_map( i );
const std::int32_t end = m_policy.m_graph.row_map( i + 1 );
for (std::int32_t j = begin; j < end; ++j) {
const std::int32_t next = m_policy.m_graph.entries( j );
const std::int32_t old_count = atomic_fetch_add( &(m_policy.m_counts(next)), -1 );
if ( old_count == 1 ) m_policy.push_work( next );
{ // Scheduling ready tasks
using policy_type = RangePolicy<std::int32_t, execution_space, TagReady>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this,policy_type(0,m_graph.numRows()));
closure.execute();
execution_space::fence();
}
}
inline
WorkGraphExec( const functor_type & arg_functor
, const policy_type & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{
}
};
}}} // namespace Kokkos::Impl::Experimental
} // namespace Kokkos
#ifdef KOKKOS_ENABLE_SERIAL
#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp"

View File

@ -294,7 +294,7 @@ void OpenMP::initialize( int thread_count )
}
{
if (nullptr == std::getenv("OMP_PROC_BIND") ) {
if ( Kokkos::show_warnings() && nullptr == std::getenv("OMP_PROC_BIND") ) {
printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n");
printf(" In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n");
printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n");
@ -327,7 +327,7 @@ void OpenMP::initialize( int thread_count )
omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
}
else {
if( thread_count > process_num_threads ) {
if( Kokkos::show_warnings() && thread_count > process_num_threads ) {
printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n");
printf( " process threads available : %3d, requested thread : %3d\n", process_num_threads, thread_count );
}
@ -364,12 +364,12 @@ void OpenMP::initialize( int thread_count )
// Check for over-subscription
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
// std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
// std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
// std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
// std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
//}
if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
}
// Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();

View File

@ -170,20 +170,20 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, Kokkos::OpenMP
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
OpenMPExec * m_instance ;
const FunctorType m_functor ;
@ -445,14 +445,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ...>
, Kokkos::MDRangePolicy< Traits ...>
, ReducerType
, Kokkos::OpenMP
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -472,7 +472,7 @@ private:
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType

View File

@ -49,33 +49,26 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::WorkGraphPolicy< Traits ... > ,
Kokkos::OpenMP
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::OpenMP,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::OpenMP, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{} ; m_functor( t , w ); }
public:
@ -86,9 +79,15 @@ public:
#pragma omp parallel num_threads(pool_size)
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
}
@ -96,12 +95,13 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */

View File

@ -125,7 +125,7 @@ namespace Kokkos {
oldval.t = *dest ;
assume.i = oldval.i ;
newval.t = val ;
atomic_compare_exchange( reinterpret_cast<int*>(dest) , assume.i, newval.i );
atomic_compare_exchange( (int*)(dest) , assume.i, newval.i );
return oldval.t ;
}
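// Editor's sketch of the same bit-punning idea in portable C++11: the routine
// above performs a compare-exchange on the int image of T, which a plain
// std::atomic exchange expresses when T is 4 bytes and trivially copyable.
// The T* -> atomic<int32_t>* cast mirrors the cast in the hunk and carries
// the same strict-aliasing caveat; this is an illustration, not Kokkos API.
#include <atomic>
#include <cstdint>
#include <cstring>

template< class T >
T exchange_via_int_image( T * const dest, const T val )
{
  static_assert( sizeof(T) == sizeof(std::int32_t), "4-byte punning only" );
  std::int32_t bits;
  std::memcpy( &bits, &val, sizeof bits );
  const std::int32_t old_bits =
    reinterpret_cast< std::atomic<std::int32_t>* >( dest )->exchange( bits );
  T old;
  std::memcpy( &old, &old_bits, sizeof old );
  return old;
}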

View File

@ -608,6 +608,7 @@ ROCmInternal::scratch_space( const Kokkos::Experimental::ROCm::size_type size )
void ROCmInternal::finalize()
{
Kokkos::Impl::rocm_device_synchronize();
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {

View File

@ -277,7 +277,7 @@ public:
this->team_barrier();
value = local_value;
}
// Reduce accross a team of threads.
// Reduce across a team of threads.
//
// Each thread has vector_length elements.
// This reduction is for TeamThreadRange operations, where the range
@ -354,6 +354,80 @@ public:
return buffer[0];
}
// Reduce across a team of threads, with a reducer data type
//
// Each thread has vector_length elements.
// This reduction is for TeamThreadRange operations, where the range
// is spread across threads. Effectively, there are vector_length
// independent reduction operations.
// This is different from a reduction across the elements of a thread,
// which reduces every vector element.
template< class ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< is_reducer< ReducerType >::value >::type
team_reduce( const ReducerType & reducer) const
{
typedef typename ReducerType::value_type value_type ;
tile_static value_type buffer[512];
const auto local = lindex();
const auto team = team_rank();
auto vector_rank = local%m_vector_length;
auto thread_base = team*m_vector_length;
const std::size_t size = next_pow_2(m_team_size+1)/2;
#if defined(ROCM15)
buffer[local] = reducer.reference();
#else
// ROCM 1.5 handles address spaces better, previous version didn't
lds_for(buffer[local], [&](value_type& x)
{
x = reducer.reference();
});
#endif
m_idx.barrier.wait();
for(std::size_t s = 1; s < size; s *= 2)
{
const std::size_t index = 2 * s * team;
if (index < size)
{
#if defined(ROCM15)
reducer.join(buffer[vector_rank+index*m_vector_length],
buffer[vector_rank+(index+s)*m_vector_length]);
#else
lds_for(buffer[vector_rank+index*m_vector_length], [&](value_type& x)
{
lds_for(buffer[vector_rank+(index+s)*m_vector_length],
[&](value_type& y)
{
reducer.join(x, y);
});
});
#endif
}
m_idx.barrier.wait();
}
if (local == 0)
{
for(int i=size*m_vector_length; i<m_team_size*m_vector_length; i+=m_vector_length)
#if defined(ROCM15)
reducer.join(buffer[vector_rank], buffer[vector_rank+i]);
#else
lds_for(buffer[vector_rank], [&](value_type& x)
{
lds_for(buffer[vector_rank+i],
[&](value_type& y)
{
reducer.join(x, y);
});
});
#endif
}
m_idx.barrier.wait();
}
/** \brief Intra-team vector reduce
* with intra-team non-deterministic ordering accumulation.
@ -406,6 +480,33 @@ public:
return buffer[thread_base];
}
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< is_reducer< ReducerType >::value >::type
vector_reduce( ReducerType const & reducer )
{
#ifdef __HCC_ACCELERATOR__
if(blockDim_x == 1) return;
// Intra vector lane shuffle reduction:
typename ReducerType::value_type tmp ( reducer.reference() );
for ( int i = blockDim_x ; ( i >>= 1 ) ; ) {
shfl_down( reducer.reference() , i , blockDim_x );
if ( (int)threadIdx_x < i ) { reducer.join( tmp , reducer.reference() ); }
}
// Broadcast from root lane to all other lanes.
// Cannot use "butterfly" algorithm to avoid the broadcast
// because floating point summation is not associative
// and thus different threads could have different results.
shfl( reducer.reference() , 0 , blockDim_x );
#endif
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
@ -1075,6 +1176,22 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROC
// Impl::JoinAdd<ValueType>());
}
/** \brief Inter-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROCmTeamMember>& loop_boundaries,
const Lambda & lambda, ReducerType const & reducer) {
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,reducer.reference());
}
loop_boundaries.thread.team_reduce(reducer);
}
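// Editor's usage sketch of the reducer overload above. The sum reducer is
// spelled Kokkos::Sum in current releases; whether it still lives in
// Kokkos::Experimental at this version is an assumption to check.
template< class TeamMember >
KOKKOS_INLINE_FUNCTION
double example_team_sum( const TeamMember & team, const int n )
{
  double team_sum = 0;
  Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, n ),
    [=]( const int i, double & lsum ) { lsum += double(i); },
    Kokkos::Sum<double>( team_sum ) );
  return team_sum; // every thread of the team sees the reduced value
}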
/** \brief Intra-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
@ -1161,6 +1278,41 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::R
result = loop_boundaries.thread.thread_reduce(result,join);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
loop_boundaries, const Lambda & lambda, ReducerType const & reducer) {
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,reducer.reference());
}
loop_boundaries.thread.vector_reduce(reducer);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ReducerType const & reducer) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,reducer.reference());
loop_boundaries.thread.team_barrier();
}
reducer.reference() = loop_boundaries.thread.thread_reduce(reducer.reference(),join);
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*

View File

@ -266,7 +266,7 @@ void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
for ( int i = 0 ; i < n ; ++i ) {
Impl::spinwait_while_equal( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
exec.m_pool_state = ThreadsExec::Inactive ;
@ -310,7 +310,7 @@ void ThreadsExec::fence()
{
if ( s_thread_pool_size[0] ) {
// Wait for the root thread to complete:
Impl::spinwait_while_equal( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
}
s_current_function = 0 ;
@ -716,12 +716,12 @@ void ThreadsExec::initialize( unsigned thread_count ,
}
// Check for over-subscription
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
// std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
// std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
// std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
// std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
//}
if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
}
// Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();

View File

@ -50,6 +50,7 @@
#include <cstdio>
#include <utility>
#include <cstdalign>
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
@ -107,7 +108,7 @@ private:
// Which thread am I stealing from currently
int m_current_steal_target;
// This thread's owned work_range
Kokkos::pair<long,long> m_work_range KOKKOS_ALIGN(16);
Kokkos::pair<long,long> m_work_range __attribute__((aligned(16))) ;
// Team Offset if one thread determines work_range for others
long m_team_work_index;
@ -191,13 +192,13 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
@ -233,13 +234,13 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
@ -268,7 +269,7 @@ public:
ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , reduce_memory() , fan.reduce_memory() );
}
@ -295,7 +296,7 @@ public:
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait_while_equal( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
}
}
@ -327,7 +328,7 @@ public:
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: Active -> ReductionAvailable (or ScanAvailable)
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , work_value , fan.reduce_memory() );
}
@ -345,8 +346,8 @@ public:
// Wait: Active -> ReductionAvailable
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::ReductionAvailable );
Impl::spinwait_while_equal<int>( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( th.m_pool_state , ThreadsExec::ReductionAvailable );
Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
}
@ -357,7 +358,7 @@ public:
// Wait for all threads to complete inclusive scan
// Wait: ScanAvailable -> Rendezvous
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanAvailable );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::ScanAvailable );
}
//--------------------------------
@ -365,7 +366,7 @@ public:
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::ReductionAvailable );
Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::ReductionAvailable );
// Set: ScanAvailable -> Rendezvous
fan.m_pool_state = ThreadsExec::Rendezvous ;
}
@ -392,13 +393,13 @@ public:
// Wait for all threads to copy previous thread's inclusive scan value
// Wait for all threads: Rendezvous -> ScanCompleted
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
}
if ( rev_rank ) {
// Set: ScanAvailable -> ScanCompleted
m_pool_state = ThreadsExec::ScanCompleted ;
// Wait: ScanCompleted -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanCompleted );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::ScanCompleted );
}
// Set: ScanCompleted -> Active
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
@ -425,7 +426,7 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
@ -433,7 +434,7 @@ public:
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the thread-scan before releasing threads

View File

@ -107,13 +107,13 @@ public:
// Wait for fan-in threads
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
Impl::spinwait_while_equal( m_team_base[j]->state() , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_team_base[j]->state() , ThreadsExec::Active );
}
// If not root then wait for release
if ( m_team_rank_rev ) {
m_exec->state() = ThreadsExec::Rendezvous ;
Impl::spinwait_while_equal( m_exec->state() , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_exec->state() , ThreadsExec::Rendezvous );
}
return ! m_team_rank_rev ;

View File

@ -180,12 +180,12 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, Kokkos::Threads
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -193,7 +193,7 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
@ -548,14 +548,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Threads
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -573,7 +573,7 @@ private:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType

View File

@ -49,45 +49,47 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::WorkGraphPolicy< Traits ... > ,
Kokkos::Threads
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Threads,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Threads, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
typedef ParallelFor<FunctorType,
Kokkos::Experimental::WorkGraphPolicy<Traits ...>,
Kokkos::WorkGraphPolicy<Traits ...>,
Kokkos::Threads> Self ;
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{}; m_functor( t , w ); }
inline void exec_one_thread() const {
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
inline void exec_one_thread() const noexcept
{
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
static inline void thread_main( ThreadsExec&, const void* arg ) {
static inline void thread_main( ThreadsExec&, const void* arg ) noexcept
{
const Self& self = *(static_cast<const Self*>(arg));
self.exec_one_thread();
}
@ -104,9 +106,9 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl

View File

@ -59,7 +59,7 @@
#include <algorithm>
#include <cstdio>
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {
// Temporary, for testing new loop macros
#define KOKKOS_ENABLE_NEW_LOOP_MACROS 1
@ -1274,7 +1274,7 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i
template <typename T>
using is_void = std::is_same< T , void >;
using is_void_type = std::is_same< T , void >;
template <typename T>
struct is_type_array : std::false_type
@ -1303,7 +1303,7 @@ template < typename RP
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void<ValueType >::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void_type<ValueType >::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -1781,7 +1781,7 @@ template < typename RP
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && !is_type_array<ValueType>::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type<ValueType >::value && !is_type_array<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -2268,7 +2268,7 @@ template < typename RP
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && is_type_array<ValueType>::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type<ValueType >::value && is_type_array<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -2750,6 +2750,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
// Cuda uses DeviceIterateTile directly within md_parallel_for
// TODO Once md_parallel_{for,reduce} removed, this can be removed
namespace Experimental {
// ParallelReduce - scalar reductions
template < typename MDRange, typename Functor, typename ValueType = void >
struct MDFunctor
@ -2759,7 +2761,7 @@ struct MDFunctor
using value_type = ValueType;
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
@ -2804,7 +2806,7 @@ struct MDFunctor< MDRange, Functor, ValueType[] >
using value_type = ValueType[];
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
@ -2852,7 +2854,7 @@ struct MDFunctor< MDRange, Functor, void >
using functor_type = Functor;
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, void
@ -2887,8 +2889,9 @@ struct MDFunctor< MDRange, Functor, void >
Functor m_func;
};
} // end namespace Experimental
#undef KOKKOS_ENABLE_NEW_LOOP_MACROS
} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl
#endif

View File

@ -51,9 +51,12 @@
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
bool g_is_initialized = false;
bool g_show_warnings = true;
}
namespace Kokkos { namespace Impl { namespace {
bool is_unsigned_int(const char* str)
{
@ -75,6 +78,10 @@ void initialize_internal(const InitArguments& args)
setenv("MEMKIND_HBW_NODES", "1", 0);
#endif
if (args.disable_warnings) {
g_show_warnings = false;
}
// Protect declarations, to prevent "unused variable" warnings.
#if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMPTARGET )
const int num_threads = args.num_threads;
@ -177,6 +184,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
g_is_initialized = true;
}
void finalize_internal( const bool all_spaces = false )
@ -233,6 +241,9 @@ void finalize_internal( const bool all_spaces = false )
Kokkos::Serial::finalize();
}
#endif
g_is_initialized = false;
g_show_warnings = true;
}
void fence_internal()
@ -306,9 +317,7 @@ bool check_int_arg(char const* arg, char const* expected, int* value) {
return true;
}
} // namespace
} // namespace Impl
} // namespace Kokkos
}}} // namespace Kokkos::Impl::{unnamed}
//----------------------------------------------------------------------------
@ -319,6 +328,7 @@ void initialize(int& narg, char* arg[])
int num_threads = -1;
int numa = -1;
int device = -1;
bool disable_warnings = false;
int kokkos_threads_found = 0;
int kokkos_numa_found = 0;
@ -415,6 +425,12 @@ void initialize(int& narg, char* arg[])
} else {
iarg++;
}
} else if ( strcmp(arg[iarg],"--kokkos-disable-warnings") == 0) {
disable_warnings = true;
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
narg--;
} else if ((strcmp(arg[iarg],"--kokkos-help") == 0) || (strcmp(arg[iarg],"--help") == 0)) {
std::cout << std::endl;
std::cout << "--------------------------------------------------------------------------------" << std::endl;
@ -427,6 +443,7 @@ void initialize(int& narg, char* arg[])
std::cout << "settings." << std::endl;
std::cout << std::endl;
std::cout << "--kokkos-help : print this message" << std::endl;
std::cout << "--kokkos-disable-warnings : disable kokkos warning messages" << std::endl;
std::cout << "--kokkos-threads=INT : specify total number of threads or" << std::endl;
std::cout << " number of threads per NUMA region if " << std::endl;
std::cout << " used in conjunction with '--numa' option. " << std::endl;
@ -457,7 +474,7 @@ void initialize(int& narg, char* arg[])
iarg++;
}
InitArguments arguments{num_threads, numa, device};
InitArguments arguments{num_threads, numa, device, disable_warnings};
Impl::initialize_internal(arguments);
}
@ -787,5 +804,9 @@ void print_configuration( std::ostream & out , const bool detail )
out << msg.str() << std::endl;
}
bool is_initialized() noexcept { return g_is_initialized; }
bool show_warnings() noexcept { return g_show_warnings; }
} // namespace Kokkos
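// Editor's usage sketch of the new query and flag (the Kokkos calls above are
// real; the program itself is illustrative). Run as:
//   ./app --kokkos-disable-warnings
#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[])
{
  Kokkos::initialize(argc, argv); // strips --kokkos-disable-warnings from argv
  if (Kokkos::is_initialized()) {
    std::printf("warnings are %s\n", Kokkos::show_warnings() ? "on" : "off");
  }
  Kokkos::finalize();
  return 0;
}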

View File

@ -476,7 +476,7 @@ template< class FunctorType , class ArgTag , class T , class Enable >
struct FunctorValueInit< FunctorType , ArgTag , T & , Enable >
{
KOKKOS_FORCEINLINE_FUNCTION static
T & init( const FunctorType & f , void * p )
T & init( const FunctorType & , void * p )
{ return *( new(p) T() ); };
};

View File

@ -254,7 +254,12 @@ void * HostSpace::allocate( const size_t arg_alloc_size ) const
}
void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
void HostSpace::deallocate( void * const arg_alloc_ptr
, const size_t
#if defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
arg_alloc_size
#endif
) const
{
if ( arg_alloc_ptr ) {
@ -409,7 +414,7 @@ SharedAllocationRecord< Kokkos::HostSpace , void >::get_record( void * alloc_ptr
// Iterate records to print orphaned memory ...
void SharedAllocationRecord< Kokkos::HostSpace , void >::
print_records( std::ostream & s , const Kokkos::HostSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::HostSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HostSpace" , & s_root_record , detail );
}

View File

@ -44,6 +44,9 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE_HPP )
#define KOKKOS_MEMORY_FENCE_HPP
#include <atomic>
namespace Kokkos {
//----------------------------------------------------------------------------
@ -53,23 +56,8 @@ void memory_fence()
{
#if defined( __CUDA_ARCH__ )
__threadfence();
#elif defined( KOKKOS_ENABLE_ROCM_ATOMICS )
amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"mfence" ::: "memory"
);
#elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \
( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) )
__sync_synchronize();
#elif defined( KOKKOS_ENABLE_INTEL_ATOMICS )
_mm_mfence();
#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS )
#pragma omp flush
#elif defined( KOKKOS_ENABLE_WINDOWS_ATOMICS )
MemoryBarrier();
#else
#error "Error: memory_fence() not defined"
std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}
@ -81,12 +69,10 @@ void memory_fence()
KOKKOS_FORCEINLINE_FUNCTION
void store_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"sfence" ::: "memory"
);
#if defined( __CUDA_ARCH__ )
__threadfence();
#else
memory_fence();
std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}
@ -98,12 +84,10 @@ void store_fence()
KOKKOS_FORCEINLINE_FUNCTION
void load_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"lfence" ::: "memory"
);
#if defined( __CUDA_ARCH__ )
__threadfence();
#else
memory_fence();
std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}

View File

@ -49,51 +49,50 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::WorkGraphPolicy< Traits ... > ,
Kokkos::Serial
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Serial,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Serial, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{}; m_functor( t , w ); }
public:
inline
void execute()
void execute() const noexcept
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl

View File

@ -306,7 +306,7 @@ print_host_accessible_records( std::ostream & s
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, r->m_alloc_ptr->m_label
);
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != root );
}
@ -334,7 +334,7 @@ print_host_accessible_records( std::ostream & s
else {
snprintf( buffer , 256 , "%s [ 0 + 0 ]\n" , space_name );
}
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != root );
}

View File

@ -294,9 +294,13 @@ public:
template< class MemorySpace >
constexpr
SharedAllocationRecord< MemorySpace , void > &
get_record() const
{ return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); }
SharedAllocationRecord< MemorySpace , void > *
get_record() const noexcept
{
return ( m_record_bits & DO_NOT_DEREF_FLAG )
? (SharedAllocationRecord< MemorySpace,void>*) 0
: static_cast<SharedAllocationRecord<MemorySpace,void>*>(m_record);
}
template< class MemorySpace >
std::string get_label() const
@ -323,6 +327,16 @@ public:
return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0;
}
KOKKOS_FORCEINLINE_FUNCTION
void clear()
{
// If this is tracking then must decrement
KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT
// Reset to default constructed value.
m_record_bits = DO_NOT_DEREF_FLAG ;
}
// Copy:
KOKKOS_FORCEINLINE_FUNCTION
~SharedAllocationTracker()
{ KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT }

View File

@ -48,7 +48,7 @@
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_BitOps.hpp>
#if defined( KOKKOS_ENABLE_STDTHREAD )
#if defined( KOKKOS_ENABLE_STDTHREAD) || defined( _WIN32 )
#include <thread>
#elif !defined( _WIN32 )
#include <sched.h>
@ -63,9 +63,8 @@
namespace Kokkos {
namespace Impl {
namespace {
void host_thread_yield( const uint32_t i , const int force_yield )
void host_thread_yield( const uint32_t i , const WaitMode mode )
{
static constexpr uint32_t sleep_limit = 1 << 13 ;
static constexpr uint32_t yield_limit = 1 << 12 ;
@ -76,28 +75,26 @@ void host_thread_yield( const uint32_t i , const int force_yield )
// Attempt to put the thread to sleep for 'c' milliseconds
#if defined( KOKKOS_ENABLE_STDTHREAD )
std::this_thread::sleep_for( std::chrono::nanoseconds( c * 1000 ) )
#elif !defined( _WIN32 )
#if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
auto start = std::chrono::high_resolution_clock::now();
std::this_thread::yield();
std::this_thread::sleep_until( start + std::chrono::nanoseconds( c * 1000 ) );
#else
timespec req ;
req.tv_sec = 0 ;
req.tv_nsec = 1000 * c ;
nanosleep( &req, nullptr );
#else /* defined( _WIN32 ) IS Microsoft Windows */
Sleep(c);
#endif
}
else if ( force_yield || yield_limit < i ) {
else if ( mode == WaitMode::PASSIVE || yield_limit < i ) {
// Attempt to yield thread resources to runtime
#if defined( KOKKOS_ENABLE_STDTHREAD )
#if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
std::this_thread::yield();
#elif !defined( _WIN32 )
#else
sched_yield();
#else /* defined( _WIN32 ) IS Microsoft Windows */
YieldProcessor();
#endif
}
@ -138,71 +135,7 @@ void host_thread_yield( const uint32_t i , const int force_yield )
#endif /* defined( KOKKOS_ENABLE_ASM ) */
}
}}} // namespace Kokkos::Impl::{anonymous}
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void yield_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
} /* namespace Impl */
} /* namespace Kokkos */
}} // namespace Kokkos::Impl
#else
void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {}

View File

@ -46,47 +46,95 @@
#define KOKKOS_SPINWAIT_HPP
#include <Kokkos_Macros.hpp>
#include <Kokkos_Atomic.hpp>
#include <cstdint>
#include <type_traits>
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void spinwait_while_equal( volatile int32_t & flag , const int32_t value );
void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
enum class WaitMode : int {
ACTIVE // Used for tight loops to keep threads active longest
, PASSIVE // Used to quickly yield the thread to quiet down the system
};
void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
void yield_while_equal( volatile int32_t & flag , const int32_t value );
void yield_until_equal( volatile int32_t & flag , const int32_t value );
void host_thread_yield( const uint32_t i , const WaitMode mode );
void yield_while_equal( volatile int64_t & flag , const int64_t value );
void yield_until_equal( volatile int64_t & flag , const int64_t value );
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_while_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value == flag ) {
host_thread_yield(++i, WaitMode::ACTIVE);
}
Kokkos::load_fence();
}
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_while_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value == flag ) {
host_thread_yield(++i, WaitMode::PASSIVE);
}
Kokkos::load_fence();
}
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_until_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value != flag ) {
host_thread_yield(++i, WaitMode::ACTIVE);
}
Kokkos::load_fence();
}
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_until_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value != flag ) {
host_thread_yield(++i, WaitMode::PASSIVE);
}
Kokkos::load_fence();
}
#else
template <typename T>
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int32_t & , const int32_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_while_equal( T const volatile & flag, const T value ) {}
template <typename T>
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_while_equal( T const volatile & flag, const T value ) {}
template <typename T>
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int32_t & , const int32_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_until_equal( T const volatile & flag, const T value ) {}
template <typename T>
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int64_t & , const int64_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_until_equal( T const volatile & flag, const T value ) {}
#endif
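// Editor's usage sketch: the former int32/int64 overloads are now one
// template over any integral type, which is likely why call sites elsewhere
// in this diff gained an explicit <int> argument (the flag field and the enum
// constant compared against it would otherwise deduce different T).
inline void example_wait( std::int64_t const volatile & state )
{
  // stay hot on the core while another thread holds `state` at 0:
  Kokkos::Impl::spinwait_while_equal<std::int64_t>( state, std::int64_t(0) );
}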

View File

@ -111,7 +111,9 @@ struct ViewCtorProp< void , CommonViewAllocProp<Specialize,T> >
using type = CommonViewAllocProp<Specialize,T> ;
KOKKOS_INLINE_FUNCTION
ViewCtorProp( const type & arg ) : value( arg ) {}
KOKKOS_INLINE_FUNCTION
ViewCtorProp( type && arg ) : value( arg ) {}
type value ;
@ -128,6 +130,7 @@ struct ViewCtorProp< void , std::integral_constant<unsigned,I> >
ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
template< typename P >
KOKKOS_INLINE_FUNCTION
ViewCtorProp( const P & ) {}
};

View File

@ -369,9 +369,9 @@ private:
template< size_t ... DimArgs >
KOKKOS_FORCEINLINE_FUNCTION
bool set( unsigned domain_rank
, unsigned range_rank
, const ViewDimension< DimArgs ... > & dim )
bool set( unsigned
, unsigned
, const ViewDimension< DimArgs ... > & )
{ return true ; }
template< class T , size_t ... DimArgs , class ... Args >
@ -1047,7 +1047,7 @@ struct ViewOffset< Dimension , Kokkos::LayoutLeft
template< class DimRHS >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset(
const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs ,
const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & ,
const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub )
: m_dim( sub.range_extent(0), 0, 0, 0, 0, 0, 0, 0 )
{
@ -1252,7 +1252,7 @@ public:
template< unsigned TrivialScalarSize >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset
( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
( std::integral_constant<unsigned,TrivialScalarSize> const &
, Kokkos::LayoutLeft const & arg_layout
)
: m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
@ -1741,7 +1741,7 @@ public:
template< unsigned TrivialScalarSize >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset
( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
( std::integral_constant<unsigned,TrivialScalarSize> const &
, Kokkos::LayoutRight const & arg_layout
)
: m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
@ -2368,7 +2368,7 @@ struct ViewDataHandle< Traits ,
)>::type >
{
typedef typename Traits::value_type value_type ;
typedef typename Traits::value_type * KOKKOS_ALIGN_PTR(KOKKOS_ALIGN_SIZE) handle_type ;
typedef typename Traits::value_type * KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ;
typedef typename Traits::value_type & return_type ;
typedef Kokkos::Impl::SharedAllocationTracker track_type ;
@ -2376,7 +2376,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( value_type * arg_data_ptr
, track_type const & /*arg_tracker*/ )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr );
@ -2386,7 +2386,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( handle_type const arg_data_ptr
, size_t offset )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr + offset );
@ -2411,7 +2411,7 @@ struct ViewDataHandle< Traits ,
)>::type >
{
typedef typename Traits::value_type value_type ;
typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_ALIGN_PTR(KOKKOS_ALIGN_SIZE) handle_type ;
typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ;
typedef typename Traits::value_type & return_type ;
typedef Kokkos::Impl::SharedAllocationTracker track_type ;
@ -2419,7 +2419,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( value_type * arg_data_ptr
, track_type const & /*arg_tracker*/ )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr );
@ -2429,7 +2429,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( handle_type const arg_data_ptr
, size_t offset )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr + offset );
@ -2783,6 +2783,11 @@ public:
, m_offset( std::integral_constant< unsigned , 0 >() , arg_layout )
{}
/**\brief Assign data */
KOKKOS_INLINE_FUNCTION
void assign_data( pointer_type arg_ptr )
{ m_handle = handle_type( arg_ptr ); }
//----------------------------------------
/* Allocate and construct mapped array.
* Allocate via shared allocation record and

View File

@ -48,6 +48,7 @@
#include <algorithm>
#include <Kokkos_Macros.hpp>
#include <Kokkos_Core.hpp>
#include <Kokkos_hwloc.hpp>
#include <impl/Kokkos_Error.hpp>
@ -312,14 +313,18 @@ Sentinel::Sentinel()
hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS );
if ( hwloc_bitmap_iszero( s_process_binding ) ) {
if (Kokkos::show_warnings() ) {
std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units" << std::endl;
}
const int pu_depth = hwloc_get_type_depth( s_hwloc_topology, HWLOC_OBJ_PU );
int num_pu = 1;
if ( pu_depth != HWLOC_TYPE_DEPTH_UNKNOWN ) {
num_pu = hwloc_get_nbobjs_by_depth( s_hwloc_topology, pu_depth );
}
else {
if (Kokkos::show_warnings() ) {
std::cerr << "WARNING: Cannot detect number of processing units -- ASSUMING 1 (serial)." << std::endl;
}
num_pu = 1;
}
hwloc_bitmap_set_range( s_process_binding, 0, num_pu-1);
@ -349,7 +354,7 @@ Sentinel::Sentinel()
hwloc_bitmap_free( s_process_no_core_zero );
if ( ! ok ) {
if ( Kokkos::show_warnings() && ! ok ) {
std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ;
}
}
@ -503,8 +508,8 @@ Sentinel::Sentinel()
hwloc_bitmap_free( proc_cpuset_location );
if ( ! symmetric ) {
std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
if ( Kokkos::show_warnings() && ! symmetric ) {
std::cerr << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
<< std::endl ;
}
}

View File

@ -49,6 +49,7 @@ IF(Kokkos_ENABLE_Serial)
serial/TestSerial_SubView_c10.cpp
serial/TestSerial_SubView_c11.cpp
serial/TestSerial_SubView_c12.cpp
serial/TestSerial_SubView_c13.cpp
serial/TestSerial_Team.cpp
serial/TestSerial_TeamReductionScan.cpp
serial/TestSerial_TeamScratch.cpp
@ -96,6 +97,7 @@ IF(Kokkos_ENABLE_Pthread)
threads/TestThreads_SubView_c10.cpp
threads/TestThreads_SubView_c11.cpp
threads/TestThreads_SubView_c12.cpp
threads/TestThreads_SubView_c13.cpp
threads/TestThreads_Team.cpp
threads/TestThreads_TeamReductionScan.cpp
threads/TestThreads_TeamScratch.cpp
@ -143,6 +145,7 @@ IF(Kokkos_ENABLE_OpenMP)
openmp/TestOpenMP_SubView_c10.cpp
openmp/TestOpenMP_SubView_c11.cpp
openmp/TestOpenMP_SubView_c12.cpp
openmp/TestOpenMP_SubView_c13.cpp
openmp/TestOpenMP_Task.cpp
openmp/TestOpenMP_Team.cpp
openmp/TestOpenMP_TeamReductionScan.cpp
@ -184,6 +187,7 @@ IF(Kokkos_ENABLE_Qthreads)
qthreads/TestQthreads_SubView_c10.cpp
qthreads/TestQthreads_SubView_c11.cpp
qthreads/TestQthreads_SubView_c12.cpp
qthreads/TestQthreads_SubView_c13.cpp
qthreads/TestQthreads_Team.cpp
qthreads/TestQthreads_ViewAPI_a.cpp
qthreads/TestQthreads_ViewAPI_b.cpp
@ -235,6 +239,7 @@ IF(Kokkos_ENABLE_Cuda)
cuda/TestCuda_SubView_c10.cpp
cuda/TestCuda_SubView_c11.cpp
cuda/TestCuda_SubView_c12.cpp
cuda/TestCuda_SubView_c13.cpp
cuda/TestCuda_Task.cpp
cuda/TestCuda_Team.cpp
cuda/TestCuda_TeamReductionScan.cpp

View File

@ -56,6 +56,7 @@ else
OBJ_CUDA += TestCuda_SubView_c04.o TestCuda_SubView_c05.o TestCuda_SubView_c06.o
OBJ_CUDA += TestCuda_SubView_c07.o TestCuda_SubView_c08.o TestCuda_SubView_c09.o
OBJ_CUDA += TestCuda_SubView_c10.o TestCuda_SubView_c11.o TestCuda_SubView_c12.o
OBJ_CUDA += TestCuda_SubView_c13.o
endif
OBJ_CUDA += TestCuda_Reductions.o TestCuda_Scan.o
OBJ_CUDA += TestCuda_Complex.o
@ -169,6 +170,7 @@ else
OBJ_OPENMP += TestOpenMP_SubView_c04.o TestOpenMP_SubView_c05.o TestOpenMP_SubView_c06.o
OBJ_OPENMP += TestOpenMP_SubView_c07.o TestOpenMP_SubView_c08.o TestOpenMP_SubView_c09.o
OBJ_OPENMP += TestOpenMP_SubView_c10.o TestOpenMP_SubView_c11.o TestOpenMP_SubView_c12.o
OBJ_OPENMP += TestOpenMP_SubView_c13.o
endif
OBJ_OPENMP += TestOpenMP_Reductions.o TestOpenMP_Scan.o
OBJ_OPENMP += TestOpenMP_Complex.o
@ -258,6 +260,7 @@ else
OBJ_SERIAL += TestSerial_SubView_c04.o TestSerial_SubView_c05.o TestSerial_SubView_c06.o
OBJ_SERIAL += TestSerial_SubView_c07.o TestSerial_SubView_c08.o TestSerial_SubView_c09.o
OBJ_SERIAL += TestSerial_SubView_c10.o TestSerial_SubView_c11.o TestSerial_SubView_c12.o
OBJ_SERIAL += TestSerial_SubView_c13.o
endif
OBJ_SERIAL += TestSerial_Reductions.o TestSerial_Scan.o
OBJ_SERIAL += TestSerial_Complex.o

View File

@ -467,6 +467,7 @@ TEST_F( TEST_CATEGORY, atomics )
ASSERT_TRUE( ( TestAtomic::Loop< float, TEST_EXECSPACE >( 100, 3 ) ) );
#ifndef KOKKOS_ENABLE_OPENMPTARGET
#ifndef KOKKOS_ENABLE_ROCM
ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 100, 1 ) ) );
ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 100, 2 ) ) );
ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 100, 3 ) ) );
@ -475,6 +476,7 @@ TEST_F( TEST_CATEGORY, atomics )
ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, TEST_EXECSPACE >( 100, 2 ) ) );
ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, TEST_EXECSPACE >( 100, 3 ) ) );
#endif
#endif
}

View File

@ -71,11 +71,13 @@ struct TestComplexConstruction {
ASSERT_FLOAT_EQ(h_results(7).real(),7.5); ASSERT_FLOAT_EQ(h_results(7).imag(),0.0);
ASSERT_FLOAT_EQ(h_results(8).real(),double(8)); ASSERT_FLOAT_EQ(h_results(8).imag(),0.0);
#ifndef KOKKOS_ENABLE_ROCM
Kokkos::complex<double> a(1.5,2.5),b(3.25,5.25),r_kk;
std::complex<double> sa(a),sb(3.25,5.25),r;
r = a; r_kk = a; ASSERT_FLOAT_EQ(r.real(),r_kk.real()); ASSERT_FLOAT_EQ(r.imag(),r_kk.imag());
r = sb*a; r_kk = b*a; ASSERT_FLOAT_EQ(r.real(),r_kk.real()); ASSERT_FLOAT_EQ(r.imag(),r_kk.imag());
r = sa; r_kk = a; ASSERT_FLOAT_EQ(r.real(),r_kk.real()); ASSERT_FLOAT_EQ(r.imag(),r_kk.imag());
#endif
}

View File

@ -65,8 +65,8 @@ struct CountFillFunctor {
template< class ExecSpace >
void test_count_fill(std::int32_t nrows) {
Kokkos::Experimental::Crs<std::int32_t, ExecSpace, void, std::int32_t> graph;
Kokkos::Experimental::count_and_fill_crs(graph, nrows, CountFillFunctor<ExecSpace>());
Kokkos::Crs<std::int32_t, ExecSpace, void, std::int32_t> graph;
Kokkos::count_and_fill_crs(graph, nrows, CountFillFunctor<ExecSpace>());
ASSERT_EQ(graph.numRows(), nrows);
auto row_map = Kokkos::create_mirror_view(graph.row_map);
Kokkos::deep_copy(row_map, graph.row_map);

View File

@ -2489,6 +2489,303 @@ struct TestMDRange_6D {
}
};
template <typename ExecSpace >
struct TestMDRange_2D_NegIdx {
using value_type = double;
using DataType = int;
using ViewType = typename Kokkos::View< DataType**, ExecSpace >;
using HostViewType = typename ViewType::HostMirror;
ViewType input_view;
DataType lower_offset[2];
TestMDRange_2D_NegIdx( const DataType L0, const DataType L1, const DataType N0, const DataType N1 ) : input_view( "input_view", N0 - L0, N1 - L1 )
{
lower_offset[0] = L0;
lower_offset[1] = L1;
}
// When using negative indices, the View must be offset accordingly, since Views cannot take a negative index
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j ) const
{
input_view( i - lower_offset[0], j - lower_offset[1] ) = 1;
}
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, value_type &lsum ) const
{
lsum += input_view( i - lower_offset[0], j - lower_offset[1] ) * 2;
}
static void test_2D_negidx( const int N0, const int N1 )
{
using namespace Kokkos::Experimental;
{
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
const point_type lower{{-1, -1}};
const point_type upper{{N0, N1}};
const tile_type tile{{8,8}};
range_type range( point_type{{ lower[0], lower[1] }}, point_type{{ upper[0], upper[1] }}, tile_type{{ tile[0], tile[1] }} );
TestMDRange_2D_NegIdx functor( lower[0], lower[1], upper[0], upper[1] );
parallel_for( range, functor );
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) );
}
}
};
template <typename ExecSpace >
struct TestMDRange_3D_NegIdx {
using value_type = double;
using DataType = int;
using ViewType = typename Kokkos::View< DataType***, ExecSpace >;
using HostViewType = typename ViewType::HostMirror;
ViewType input_view;
DataType lower_offset[3];
TestMDRange_3D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType N0, const DataType N1, const DataType N2 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2 )
{
lower_offset[0] = L0;
lower_offset[1] = L1;
lower_offset[2] = L2;
}
// When using negative indices, the View must be offset accordingly, since Views cannot take a negative index
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k ) const
{
input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2] ) = 1;
}
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, value_type &lsum ) const
{
lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2] ) * 2;
}
static void test_3D_negidx( const int N0, const int N1, const int N2 )
{
using namespace Kokkos::Experimental;
{
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
const point_type lower{{-1, -1, -1}};
const point_type upper{{N0, N1, N2}};
const tile_type tile{{8,8,2}};
range_type range( point_type{{ lower[0], lower[1], lower[2] }}, point_type{{ upper[0], upper[1], upper[2] }}, tile_type{{ tile[0], tile[1], tile[2] }} );
TestMDRange_3D_NegIdx functor( lower[0], lower[1], lower[2], upper[0], upper[1], upper[2] );
parallel_for( range, functor );
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) );
}
}
};
template <typename ExecSpace >
struct TestMDRange_4D_NegIdx {
using value_type = double;
using DataType = int;
using ViewType = typename Kokkos::View< DataType****, ExecSpace >;
using HostViewType = typename ViewType::HostMirror;
ViewType input_view;
DataType lower_offset[4];
TestMDRange_4D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType L3, const DataType N0, const DataType N1, const DataType N2, const DataType N3 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3 )
{
lower_offset[0] = L0;
lower_offset[1] = L1;
lower_offset[2] = L2;
lower_offset[3] = L3;
}
// When using negative indices, the View must be offset accordingly, since Views cannot take a negative index
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, const int l ) const
{
input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3] ) = 1;
}
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, const int l, value_type &lsum ) const
{
lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3] ) * 2;
}
static void test_4D_negidx( const int N0, const int N1, const int N2, const int N3 )
{
using namespace Kokkos::Experimental;
{
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
const point_type lower{{-1, -1, -1, -1}};
const point_type upper{{N0, N1, N2, N3}};
const tile_type tile{{8,8,2,2}};
range_type range( point_type{{ lower[0], lower[1], lower[2], lower[3] }}, point_type{{ upper[0], upper[1], upper[2], upper[3] }}, tile_type{{ tile[0], tile[1], tile[2], tile[3] }} );
TestMDRange_4D_NegIdx functor( lower[0], lower[1], lower[2], lower[3], upper[0], upper[1], upper[2], upper[3] );
parallel_for( range, functor );
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) * (upper[3] - lower[3]) );
}
}
};
template <typename ExecSpace >
struct TestMDRange_5D_NegIdx {
using value_type = double;
using DataType = int;
using ViewType = typename Kokkos::View< DataType*****, ExecSpace >;
using HostViewType = typename ViewType::HostMirror;
ViewType input_view;
DataType lower_offset[5];
TestMDRange_5D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType L3, const DataType L4, const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3, N4 - L4 )
{
lower_offset[0] = L0;
lower_offset[1] = L1;
lower_offset[2] = L2;
lower_offset[3] = L3;
lower_offset[4] = L4;
}
// When using negative indices, the View must be offset accordingly, since Views cannot take a negative index
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, const int l, const int m ) const
{
input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4] ) = 1;
}
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, const int l, const int m, value_type &lsum ) const
{
lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4] ) * 2;
}
static void test_5D_negidx( const int N0, const int N1, const int N2, const int N3, const int N4 )
{
using namespace Kokkos::Experimental;
{
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
const point_type lower{{-1, -1, -1, -1, -1}};
const point_type upper{{N0, N1, N2, N3, N4}};
const tile_type tile{{8,4,2,2,2}};
range_type range( point_type{{ lower[0], lower[1], lower[2], lower[3], lower[4] }}, point_type{{ upper[0], upper[1], upper[2], upper[3], upper[4] }}, tile_type{{ tile[0], tile[1], tile[2], tile[3], tile[4] }} );
TestMDRange_5D_NegIdx functor( lower[0], lower[1], lower[2], lower[3], lower[4], upper[0], upper[1], upper[2], upper[3], upper[4] );
parallel_for( range, functor );
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) * (upper[3] - lower[3]) * (upper[4] - lower[4]) );
}
}
};
template <typename ExecSpace >
struct TestMDRange_6D_NegIdx {
using value_type = double;
using DataType = int;
using ViewType = typename Kokkos::View< DataType******, ExecSpace >;
using HostViewType = typename ViewType::HostMirror;
ViewType input_view;
DataType lower_offset[6];
TestMDRange_6D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType L3, const DataType L4, const DataType L5, const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4, const DataType N5 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3, N4 - L4, N5 - L5 )
{
lower_offset[0] = L0;
lower_offset[1] = L1;
lower_offset[2] = L2;
lower_offset[3] = L3;
lower_offset[4] = L4;
lower_offset[5] = L5;
}
// When using negative indices, the View must be offset accordingly, since Views cannot take a negative index
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, const int l, const int m, const int n ) const
{
input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4], n - lower_offset[5] ) = 1;
}
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, const int l, const int m, const int n, value_type &lsum ) const
{
lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4], n - lower_offset[5] ) * 2;
}
static void test_6D_negidx( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 )
{
using namespace Kokkos::Experimental;
{
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
const point_type lower{{-1, -1, -1, -1, -1, -1}};
const point_type upper{{N0, N1, N2, N3, N4, N5}};
const tile_type tile{{8,4,2,2,2,1}};
range_type range( point_type{{ lower[0], lower[1], lower[2], lower[3], lower[4], lower[5] }}, point_type{{ upper[0], upper[1], upper[2], upper[3], upper[4], upper[5] }}, tile_type{{ tile[0], tile[1], tile[2], tile[3], tile[4], tile[5] }} );
TestMDRange_6D_NegIdx functor( lower[0], lower[1], lower[2], lower[3], lower[4], lower[5], upper[0], upper[1], upper[2], upper[3], upper[4], upper[5] );
parallel_for( range, functor );
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) * (upper[3] - lower[3]) * (upper[4] - lower[4]) * (upper[5] - lower[5]) );
}
}
};
} // namespace
TEST_F( TEST_CATEGORY , mdrange_for ) {
@ -2512,6 +2809,14 @@ TEST_F( TEST_CATEGORY , mdrange_array_reduce ) {
TestMDRange_ReduceArray_2D< TEST_EXECSPACE >::test_arrayreduce2( 4, 5 );
TestMDRange_ReduceArray_3D< TEST_EXECSPACE >::test_arrayreduce3( 4, 5, 10 );
}
TEST_F( TEST_CATEGORY , mdrange_neg_idx ) {
TestMDRange_2D_NegIdx< TEST_EXECSPACE >::test_2D_negidx( 128, 32 );
TestMDRange_3D_NegIdx< TEST_EXECSPACE >::test_3D_negidx( 128, 32, 8 );
TestMDRange_4D_NegIdx< TEST_EXECSPACE >::test_4D_negidx( 128, 32, 8, 8 );
TestMDRange_5D_NegIdx< TEST_EXECSPACE >::test_5D_negidx( 128, 32, 8, 8, 4 );
TestMDRange_6D_NegIdx< TEST_EXECSPACE >::test_6D_negidx( 128, 32, 8, 8, 4, 2 );
}
//#endif
} // namespace Test
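All five NegIdx test structs above follow one pattern; condensed to its essentials, a rank-2 sketch (function and view names illustrative):
#include <Kokkos_Core.hpp>
void fill_with_halo( const int N0, const int N1 ) {
  using Kokkos::Experimental::MDRangePolicy;
  using Kokkos::Experimental::Rank;
  // One extra row and column absorb the -1 lower bound.
  Kokkos::View< int** > v( "v", N0 + 1, N1 + 1 );
  MDRangePolicy< Rank<2>, Kokkos::IndexType<int> >
    range( {{ -1, -1 }}, {{ N0, N1 }}, {{ 8, 8 }} );
  Kokkos::parallel_for( range, KOKKOS_LAMBDA( const int i, const int j ) {
    v( i + 1, j + 1 ) = 1;  // shift by the lower bound: Views cannot
                            // take negative indices
  });
}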

View File

@ -521,6 +521,101 @@ void test_memory_pool_corners( const bool print_statistics
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< class DeviceType , class Enable = void >
struct TestMemoryPoolHuge
{
TestMemoryPoolHuge() {}
enum : size_t { num_superblock = 0 };
using value_type = long ;
KOKKOS_INLINE_FUNCTION
void operator()( int i , long & err ) const noexcept {}
KOKKOS_INLINE_FUNCTION
void operator()( int i ) const noexcept {}
};
template< class DeviceType >
struct TestMemoryPoolHuge< DeviceType
, typename std::enable_if<
std::is_same< Kokkos::HostSpace
, typename DeviceType::memory_space >
::value >::type
>
{
typedef Kokkos::View< uintptr_t * , DeviceType > ptrs_type ;
typedef Kokkos::MemoryPool< DeviceType > pool_type ;
typedef typename DeviceType::memory_space memory_space ;
pool_type pool ;
ptrs_type ptrs ;
enum : size_t { min_block_size = 512
, max_block_size = 1lu << 31
, min_superblock_size = max_block_size
, num_superblock = 4
, total_alloc_size = num_superblock * max_block_size };
TestMemoryPoolHuge()
: pool( memory_space()
, total_alloc_size
, min_block_size
, max_block_size
, min_superblock_size )
, ptrs( "ptrs" , num_superblock )
{}
// Specify reduction argument value_type to
// avoid confusion with tag-dispatch.
using value_type = long ;
void operator()( int i , long & err ) const noexcept
{
if ( i < int(num_superblock) ) {
ptrs(i) = (uintptr_t) pool.allocate( max_block_size );
#if 0
printf("TestMemoryPoolHuge size(0x%lx) ptr(0x%lx)\n"
, max_block_size
, ptrs(i) );
#endif
if ( ! ptrs(i) ) {
Kokkos::abort("TestMemoryPoolHuge");
++err ;
}
}
}
void operator()( int i ) const noexcept
{
if ( i < int(num_superblock) ) {
pool.deallocate( (void*) ptrs(i) , max_block_size );
ptrs(i) = 0 ;
}
}
};
template< class DeviceType >
void test_memory_pool_huge()
{
typedef typename DeviceType::execution_space execution_space ;
typedef TestMemoryPoolHuge< DeviceType > functor_type ;
typedef Kokkos::RangePolicy< execution_space > policy_type ;
functor_type f ;
policy_type policy( 0 , functor_type::num_superblock );
long err = 0 ;
Kokkos::parallel_reduce( policy , f , err );
Kokkos::parallel_for( policy , f );
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
} // namespace TestMemoryPool
namespace Test {
@ -531,6 +626,7 @@ TEST_F( TEST_CATEGORY, memory_pool )
TestMemoryPool::test_host_memory_pool_stats<>();
TestMemoryPool::test_memory_pool_v2< TEST_EXECSPACE >(false,false);
TestMemoryPool::test_memory_pool_corners< TEST_EXECSPACE >(false,false);
TestMemoryPool::test_memory_pool_huge< TEST_EXECSPACE >();
}
}
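For reference, a host-side sketch of the pool the new huge-allocation test builds, using its parameters (four superblocks of 2^31 bytes, so one allocation can claim an entire 2 GiB superblock):
#include <Kokkos_Core.hpp>
void huge_pool_sketch() {
  using device_t = Kokkos::Device< Kokkos::DefaultHostExecutionSpace,
                                   Kokkos::HostSpace >;
  constexpr size_t max_block = size_t(1) << 31;       // 2 GiB block
  Kokkos::MemoryPool< device_t > pool( Kokkos::HostSpace()
                                     , 4 * max_block  // total capacity
                                     , 512            // min block size
                                     , max_block      // max block size
                                     , max_block );   // min superblock size
  void * p = pool.allocate( max_block );              // nullptr on exhaustion
  if ( p ) { pool.deallocate( p, max_block ); }
}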

View File

@ -141,13 +141,13 @@ struct TestFib
enum { MinBlockSize = 64 };
enum { MaxBlockSize = 1024 };
enum { SuperBlockSize = 1u << 12 };
enum { SuperBlockSize = 4096 };
sched_type root_sched( memory_space()
, MemoryCapacity
, MinBlockSize
, MaxBlockSize
, SuperBlockSize );
, std::min(size_t(MaxBlockSize),MemoryCapacity)
, std::min(size_t(SuperBlockSize),MemoryCapacity) );
future_type f = Kokkos::host_spawn( Kokkos::TaskSingle( root_sched )
, TestFib( root_sched, i ) );
@ -205,11 +205,10 @@ struct TestTaskSpawn {
{
typedef typename sched_type::memory_space memory_space;
// enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool.
enum { MemoryCapacity = 16000 };
enum { MinBlockSize = 64 };
enum { MaxBlockSize = 1024 };
enum { SuperBlockSize = 1u << 12 };
enum { SuperBlockSize = 4096 };
sched_type sched( memory_space()
, MemoryCapacity
@ -277,11 +276,10 @@ struct TestTaskDependence {
{
typedef typename sched_type::memory_space memory_space;
// enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool.
enum { MemoryCapacity = 16000 };
enum { MinBlockSize = 64 };
enum { MaxBlockSize = 1024 };
enum { SuperBlockSize = 1u << 12 };
enum { SuperBlockSize = 4096 };
sched_type sched( memory_space()
, MemoryCapacity
@ -471,13 +469,11 @@ struct TestTaskTeam {
static void run( long n )
{
//const unsigned memory_capacity = 10000; // Causes memory pool infinite loop.
//const unsigned memory_capacity = 100000; // Fails with SPAN=1 for serial and OMP.
const unsigned memory_capacity = 400000;
enum { MinBlockSize = 64 };
enum { MaxBlockSize = 1024 };
enum { SuperBlockSize = 1u << 12 };
enum { SuperBlockSize = 4096 };
sched_type root_sched( typename sched_type::memory_space()
, memory_capacity
@ -600,12 +596,11 @@ struct TestTaskTeamValue {
static void run( long n )
{
//const unsigned memory_capacity = 10000; // Causes memory pool infinite loop.
const unsigned memory_capacity = 100000;
enum { MinBlockSize = 64 };
enum { MaxBlockSize = 1024 };
enum { SuperBlockSize = 1u << 12 };
enum { SuperBlockSize = 4096 };
sched_type root_sched( typename sched_type::memory_space()
, memory_capacity
@ -655,7 +650,6 @@ TEST_F( TEST_CATEGORY, task_fib )
TEST_F( TEST_CATEGORY, task_depend )
{
for ( int i = 0; i < 25; ++i ) {
printf("\nTest::task_depend %d\n",i);
TestTaskScheduler::TestTaskDependence< TEST_EXECSPACE >::run( i );
}
}

View File

@ -786,11 +786,40 @@ struct TestViewMirror
ASSERT_EQ( a_h.dimension_0(), a_d .dimension_0() );
}
template< class MemoryTraits >
void static test_mirror_copy() {
Kokkos::View< double*, Layout, Kokkos::HostSpace > a_org( "A", 10 );
a_org(5) = 42.0;
Kokkos::View< double*, Layout, Kokkos::HostSpace, MemoryTraits > a_h = a_org;
auto a_h2 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_h );
auto a_d = Kokkos::create_mirror_view_and_copy( DeviceType(), a_h );
auto a_h3 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_d );
int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0;
int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0;
int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0;
int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0;
int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0;
ASSERT_EQ( equal_ptr_h_h2, 1 );
ASSERT_EQ( equal_ptr_h_d, is_same_memspace );
ASSERT_EQ( equal_ptr_h2_d, is_same_memspace );
ASSERT_EQ( equal_ptr_h3_d, is_same_memspace );
ASSERT_EQ( a_h.dimension_0(), a_h3.dimension_0() );
ASSERT_EQ( a_h.dimension_0(), a_h2.dimension_0() );
ASSERT_EQ( a_h.dimension_0(), a_d .dimension_0() );
ASSERT_EQ( a_org(5), a_h3(5) );
}
void static testit() {
test_mirror< Kokkos::MemoryTraits<0> >();
test_mirror< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
test_mirror_view< Kokkos::MemoryTraits<0> >();
test_mirror_view< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
test_mirror_copy< Kokkos::MemoryTraits<0> >();
test_mirror_copy< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
}
};
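In short, create_mirror_view_and_copy fuses create_mirror_view with a deep_copy, and degenerates to an aliasing no-op when source and destination memory spaces match, which is exactly what the pointer-equality assertions above verify. A minimal sketch:
Kokkos::View< double* > d( "d", 10 );  // default (device) memory space
// Allocates and deep-copies only when the memory spaces differ;
// otherwise h.data() == d.data().
auto h = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), d );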
@ -1312,10 +1341,12 @@ return;
}
};
#if !defined(KOKKOS_ENABLE_ROCM)
TEST_F( TEST_CATEGORY, view_api )
{
TestViewAPI< double, TEST_EXECSPACE >();
}
#endif
TEST_F( TEST_CATEGORY, view_remap )
{

View File

@ -1294,5 +1294,41 @@ void test_layoutright_to_layoutright() {
}
}
//----------------------------------------------------------------------------
template< class Space >
struct TestUnmanagedSubviewReset
{
Kokkos::View<int****,Space> a ;
KOKKOS_INLINE_FUNCTION
void operator()( int ) const noexcept
{
auto sub_a = Kokkos::subview(a,0,Kokkos::ALL,Kokkos::ALL,Kokkos::ALL);
for ( int i = 0 ; i < int(a.dimension(0)) ; ++i ) {
sub_a.assign_data( & a(i,0,0,0) );
if ( & sub_a(1,1,1) != & a(i,1,1,1) ) {
Kokkos::abort("TestUnmanagedSubviewReset");
}
}
}
TestUnmanagedSubviewReset()
: a( Kokkos::view_alloc() , 20 , 10 , 5 , 2 )
{}
};
template< class Space >
void test_unmanaged_subview_reset()
{
Kokkos::parallel_for
( Kokkos::RangePolicy< typename Space::execution_space >(0,1)
, TestUnmanagedSubviewReset<Space>()
);
}
} // namespace TestViewSubview
#endif

View File

@ -67,7 +67,7 @@ template< class ExecSpace >
struct TestWorkGraph {
using MemorySpace = typename ExecSpace::memory_space;
using Policy = Kokkos::Experimental::WorkGraphPolicy<std::int32_t, ExecSpace>;
using Policy = Kokkos::WorkGraphPolicy<std::int32_t, ExecSpace>;
using Graph = typename Policy::graph_type;
using RowMap = typename Graph::row_map_type;
using Entries = typename Graph::entries_type;
@ -117,6 +117,7 @@ struct TestWorkGraph {
m_graph.row_map = RowMap("row_map", hg.size() + 1); // row map always has one more
m_graph.entries = Entries("entries", hg.size() - 1); // all but the first have a parent
m_values = Values("values", hg.size());
//printf("%zu work items\n", hg.size());
auto h_row_map = Kokkos::create_mirror_view(m_graph.row_map);
auto h_entries = Kokkos::create_mirror_view(m_graph.entries);
auto h_values = Kokkos::create_mirror_view(m_values);
@ -156,17 +157,15 @@ struct TestWorkGraph {
} // anonymous namespace
TEST_F( TEST_CATEGORY, DISABLED_workgraph_fib )
TEST_F( TEST_CATEGORY, workgraph_fib )
{
#ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
int limit = 15;
#else
int limit = 27;
#endif
for ( int i = 0; i < limit; ++i) {
TestWorkGraph< TEST_EXECSPACE > f(i);
f.test_for();
}
//TestWorkGraph< TEST_EXECSPACE > f(2);
//f.test_for();
}
} // namespace Test
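With WorkGraphPolicy promoted out of the Experimental namespace (changelog issue #1126), the Fibonacci test now runs by default. A condensed sketch of the dispatch pattern, assuming the policy is constructed directly from a CRS dependence graph as the test does (the empty graph and no-op functor are illustrative):
#include <Kokkos_Core.hpp>
#include <cstdint>
struct NoOpWork {
  KOKKOS_INLINE_FUNCTION void operator()( std::int32_t /*item*/ ) const {}
};
void workgraph_sketch() {
  using policy_t = Kokkos::WorkGraphPolicy< std::int32_t,
                                            Kokkos::DefaultExecutionSpace >;
  using graph_t  = policy_t::graph_type;
  graph_t graph;
  // row_map has one entry per work item plus one; entries list, for each
  // item, the items that must complete before it may run (none here).
  graph.row_map = graph_t::row_map_type( "row_map", 1 );  // zero work items
  graph.entries = graph_t::entries_type( "entries", 0 );
  Kokkos::parallel_for( policy_t( graph ), NoOpWork() );
}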

View File

@ -0,0 +1,54 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <cuda/TestCudaUVM_Category.hpp>
#include <TestViewSubview.hpp>
namespace Test {
TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset )
{
TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
}
} // namespace Test

View File

@ -10,3 +10,4 @@
#include <cuda/TestCuda_SubView_c10.cpp>
#include <cuda/TestCuda_SubView_c11.cpp>
#include <cuda/TestCuda_SubView_c12.cpp>
#include <cuda/TestCuda_SubView_c13.cpp>

View File

@ -0,0 +1,54 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <openmp/TestOpenMP_Category.hpp>
#include <TestViewSubview.hpp>
namespace Test {
TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset )
{
TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
}
} // namespace Test

View File

@ -10,3 +10,4 @@
#include <openmp/TestOpenMP_SubView_c10.cpp>
#include <openmp/TestOpenMP_SubView_c11.cpp>
#include <openmp/TestOpenMP_SubView_c12.cpp>
#include <openmp/TestOpenMP_SubView_c13.cpp>

View File

@ -0,0 +1,55 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <qthreads/TestQthreads.hpp>
namespace Test {
TEST_F( qthreads, view_test_unmanaged_subview_reset )
{
#if 0
TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
#endif
}
} // namespace Test

View File

@ -10,3 +10,4 @@
#include <qthreads/TestQthreads_SubView_c10.cpp>
#include <qthreads/TestQthreads_SubView_c11.cpp>
#include <qthreads/TestQthreads_SubView_c12.cpp>
#include <qthreads/TestQthreads_SubView_c13.cpp>

View File

@ -0,0 +1,54 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <serial/TestSerial_Category.hpp>
#include <TestViewSubview.hpp>
namespace Test {
TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset )
{
TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
}
} // namespace Test

View File

@ -10,3 +10,4 @@
#include <serial/TestSerial_SubView_c10.cpp>
#include <serial/TestSerial_SubView_c11.cpp>
#include <serial/TestSerial_SubView_c12.cpp>
#include <serial/TestSerial_SubView_c13.cpp>

View File

@ -0,0 +1,54 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <threads/TestThreads_Category.hpp>
#include <TestViewSubview.hpp>
namespace Test {
TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset )
{
TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
}
} // namespace Test

Binary file not shown.

View File

@ -31,6 +31,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
OBJ_EXAMPLE_FEINT += feint_cuda.o
endif
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_EXAMPLE_FEINT += feint_rocm.o
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
OBJ_EXAMPLE_FEINT += feint_threads.o
endif

View File

@ -0,0 +1,67 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_ENABLE_ROCM )
#include <feint.hpp>
namespace Kokkos {
namespace Example {
template void feint<Kokkos::Experimental::ROCm,false>(
const unsigned global_elem_nx ,
const unsigned global_elem_ny ,
const unsigned global_elem_nz );
template void feint<Kokkos::Experimental::ROCm,true>(
const unsigned global_elem_nx ,
const unsigned global_elem_ny ,
const unsigned global_elem_nz );
} /* namespace Example */
} /* namespace Kokkos */
#endif

View File

@ -110,6 +110,27 @@ Perf fenl< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemQuadratic >(
#endif
#if defined( KOKKOS_ENABLE_ROCM )
template
Perf fenl< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemLinear >(
MPI_Comm comm ,
const int use_print ,
const int use_trials ,
const int use_atomic ,
const int global_elems[] );
template
Perf fenl< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemQuadratic >(
MPI_Comm comm ,
const int use_print ,
const int use_trials ,
const int use_atomic ,
const int global_elems[] );
#endif
} /* namespace FENL */
} /* namespace Example */

View File

@ -67,6 +67,7 @@ enum { CMD_USE_THREADS = 0
, CMD_USE_NUMA
, CMD_USE_CORE_PER_NUMA
, CMD_USE_CUDA
, CMD_USE_ROCM
, CMD_USE_OPENMP
, CMD_USE_CUDA_DEV
, CMD_USE_FIXTURE_X
@ -114,6 +115,9 @@ void print_cmdline( std::ostream & s , const int cmd[] )
if ( cmd[ CMD_USE_CUDA ] ) {
s << " CUDA(" << cmd[ CMD_USE_CUDA_DEV ] << ")" ;
}
if ( cmd[ CMD_USE_ROCM ] ) {
s << " ROCM" ;
}
if ( cmd[ CMD_USE_ATOMIC ] ) {
s << " ATOMIC" ;
}
@ -167,6 +171,7 @@ void run( MPI_Comm comm , const int cmd[] )
if ( cmd[ CMD_USE_THREADS ] ) { std::cout << "THREADS , " << cmd[ CMD_USE_THREADS ] ; }
else if ( cmd[ CMD_USE_OPENMP ] ) { std::cout << "OPENMP , " << cmd[ CMD_USE_OPENMP ] ; }
else if ( cmd[ CMD_USE_CUDA ] ) { std::cout << "CUDA" ; }
else if ( cmd[ CMD_USE_ROCM ] ) { std::cout << "ROCM" ; }
if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { std::cout << " , QUADRATIC-ELEMENT" ; }
else { std::cout << " , LINEAR-ELEMENT" ; }
@ -288,6 +293,9 @@ int main( int argc , char ** argv )
cmdline[ CMD_USE_CUDA ] = 1 ;
cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ;
}
else if ( 0 == strcasecmp( argv[i] , "rocm" ) ) {
cmdline[ CMD_USE_ROCM ] = 1 ;
}
else if ( 0 == strcasecmp( argv[i] , "fixture" ) ) {
sscanf( argv[++i] , "%dx%dx%d" ,
cmdline + CMD_USE_FIXTURE_X ,
@ -410,6 +418,21 @@ int main( int argc , char ** argv )
Kokkos::HostSpace::execution_space::finalize();
}
#endif
#if defined( KOKKOS_ENABLE_ROCM )
if ( cmdline[ CMD_USE_ROCM ] ) {
// Use the last device:
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice( cmdline[ CMD_USE_ROCM ] ) );
run< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
Kokkos::Experimental::ROCm::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
#endif
}

View File

@ -300,5 +300,16 @@ int main()
Kokkos::HostSpace::execution_space::finalize();
}
#endif
#if defined( KOKKOS_ENABLE_ROCM )
{
std::cout << "test_fixture< ROCm >" << std::endl ;
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
Kokkos::Example::test_fixture< Kokkos::Experimental::ROCm >();
Kokkos::Experimental::ROCm::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
#endif
}

View File

@ -53,6 +53,10 @@ template void test_fixture< Kokkos::HostSpace::execution_space >();
template void test_fixture<Kokkos::Cuda>();
#endif
#if defined( KOKKOS_ENABLE_ROCM )
template void test_fixture<Kokkos::Experimental::ROCm>();
#endif
} /* namespace Example */
} /* namespace Kokkos */

View File

@ -23,6 +23,15 @@ EXE = 02_simple_reduce.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 02_simple_reduce.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -40,7 +49,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

View File

@ -24,6 +24,16 @@ EXE = 02_simple_reduce_lambda.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 02_simple_reduce_lambda.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -41,7 +51,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

View File

@ -23,6 +23,16 @@ EXE = 03_simple_view.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 03_simple_view.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -41,7 +51,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

View File

@ -24,6 +24,16 @@ EXE = 03_simple_view_lambda.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 03_simple_view_lambda.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -41,7 +51,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

View File

@ -23,6 +23,16 @@ EXE = 04_simple_memoryspaces.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 04_simple_memoryspaces.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -40,7 +50,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

View File

@ -23,6 +23,16 @@ EXE = 05_simple_atomics.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 05_simple_atomics.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -40,7 +50,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

View File

@ -0,0 +1,10 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
tutorial_06_simple_mdrangepolicy
SOURCES simple_mdrangepolicy.cpp
COMM serial mpi
)

View File

@ -0,0 +1,48 @@
KOKKOS_PATH = ../../..
KOKKOS_SRC_PATH = ${KOKKOS_PATH}
SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/06_simple_mdrangepolicy/*.cpp)
vpath %.cpp $(sort $(dir $(SRC)))
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 06_simple_mdrangepolicy.cuda
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
else
CXX = g++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 06_simple_mdrangepolicy.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
DEPFLAGS = -M
OBJ = $(notdir $(SRC:.cpp=.o))
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
test: $(EXE)
./$(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)

View File

@ -0,0 +1,201 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <cstdio>
//
// MDRangePolicy example with parallel_for and parallel_reduce:
// 1. Start up Kokkos
// 2. Execute a parallel_for loop in the default execution space,
// using a functor to define the loop body
// 3. Shut down Kokkos
//
// Two examples are provided:
// Example 1: Rank 2 case with minimal default parameters and arguments used
// in the MDRangePolicy
//
// Example 2: Rank 3 case with additional outer/inner iterate pattern parameters
// and tile dims passed to the ctor
// Simple functor for computing/storing the product of indices in a View v
template < class ViewType >
struct MDFunctor {
typedef long value_type;
ViewType v;
size_t size;
MDFunctor( const ViewType & v_, const size_t size_ )
: v(v_), size(size_) {}
// 2D case - used by parallel_for
KOKKOS_INLINE_FUNCTION
void operator () (const int i, const int j) const {
v(i,j) = i*j; // compute the product of indices
}
// 3D case - used by parallel_for
KOKKOS_INLINE_FUNCTION
void operator () (const int i, const int j, const int k) const {
v(i,j,k) = i*j*k; // compute the product of indices
}
// 2D case - reduction
KOKKOS_INLINE_FUNCTION
void operator () (const int i, const int j, value_type & incorrect_count) const {
if ( v(i,j) != i*j ) {
incorrect_count += 1;
}
}
// 3D case - reduction
KOKKOS_INLINE_FUNCTION
void operator () (const int i, const int j, const int k, value_type & incorrect_count) const {
if ( v(i,j,k) != i*j*k ) {
incorrect_count += 1;
}
}
};
int main (int argc, char* argv[]) {
Kokkos::initialize (argc, argv);
// Bound(s) for MDRangePolicy
const int n = 100;
// ViewType typedefs for Rank<2>, Rank<3> for example usage
typedef double ScalarType;
typedef typename Kokkos::View<ScalarType**> ViewType_2D;
typedef typename Kokkos::View<ScalarType***> ViewType_3D;
/////////////////////////////////////////////////////////////////////////////
// Explanation of MDRangePolicy usage, template parameters, constructor arguments
//
// MDRangePolicy typedefs for Rank<2>, Rank<3> cases
// Required template parameters:
// Kokkos::Rank<N>: where N=rank
//
// Optional template parameters to Rank<...>:
// Kokkos::Iterate::{Default,Left,Right}: Outer iteration pattern across tiles;
// defaults based on the execution space similar to Kokkos::Layout
// Kokkos::Iterate::{Default,Left,Right}: Inner iteration pattern within tiles;
// defaults based on the execution space similar to Kokkos::Layout
//
// e.g. typedef Rank<2, Iterate::Left, Iterate::Left> rank2ll;
//
//
// Optional template parameters to MDRangePolicy:
// ExecutionSpace: Kokkos::Serial, Kokkos::OpenMP, Kokkos::Cuda, etc.
//
// Kokkos::IndexType< T >: where T = int, long, unsigned int, etc.
//
// struct Tag{}: A user-provided tag for tagging functor operators
//
// e.g. 1: MDRangePolicy< Kokkos::Serial, Rank<2, Iterate::Left, Iterate::Left>, IndexType<int>, Tag > mdpolicy;
// e.g. 2: MDRangePolicy< Kokkos::Serial, rank2ll, IndexType<int>, Tag > mdpolicy;
//
//
// Required arguments to ctor:
// {{ l0, l1, ... }}: Lower bounds, provided as Kokkos::Array or std::initializer_list
// {{ u0, u1, ... }}: Upper bounds, provided as Kokkos::Array or std::initializer_list
//
// Optional arguments to ctor:
// {{ t0, t1, ... }}: Tile dimensions, provided as Kokkos::Array or std::initializer_list
// defaults based on the execution space
//
// e.g. mdpolicy( {{0,0}}, {{u0,u1}}, {{t0,t1}} );
//
/////////////////////////////////////////////////////////////////////////////
// Example 1:
long incorrect_count_2d = 0;
{
// Rank<2> Case: Rank is provided, all other parameters are default
typedef typename Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> > MDPolicyType_2D;
// Construct 2D MDRangePolicy: lower and upper bounds provided, tile dims defaulted
MDPolicyType_2D mdpolicy_2d( {{0,0}}, {{n,n}} );
// Construct a 2D view to store result of product of indices
ViewType_2D v2("v2", n, n);
// Execute parallel_for with rank 2 MDRangePolicy
Kokkos::parallel_for( "md2d", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n) );
// Check results with a parallel_reduce using the MDRangePolicy
Kokkos::parallel_reduce( "md2dredux", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n), incorrect_count_2d );
printf("Rank 2 MDRangePolicy incorrect count: %ld\n", incorrect_count_2d); // should be 0
}
// Example 2:
long incorrect_count_3d = 0;
{
// Rank<3> Case: Rank, inner iterate pattern, outer iterate pattern provided
typedef typename Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<3, Kokkos::Experimental::Iterate::Left, Kokkos::Experimental::Iterate::Left> > MDPolicyType_3D;
// Construct 3D MDRangePolicy: lower, upper bounds, tile dims provided
MDPolicyType_3D mdpolicy_3d( {{0,0,0}}, {{n,n,n}}, {{4,4,4}} );
// Construct a 3D view to store result of product of indices
ViewType_3D v3("v3", n, n, n);
// Execute parallel_for with rank 3 MDRangePolicy
Kokkos::parallel_for( "md3d", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n) );
// Check results with a parallel_reduce using the MDRangePolicy
Kokkos::parallel_reduce( "md3dredux", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n), incorrect_count_3d );
printf("Rank 3 MDRangePolicy incorrect count: %ld\n", incorrect_count_3d); // should be 0
}
Kokkos::finalize ();
return (incorrect_count_2d == long(0) && incorrect_count_3d == long(0)) ? 0 : -1;
}

View File

@ -23,6 +23,16 @@ EXE = 01_data_layouts.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 01_data_layouts.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -40,7 +50,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

Some files were not shown because too many files have changed in this diff.