Update Kokkos to v2.04.11

Stan Moore
2017-11-06 13:47:33 -07:00
parent 39df9f5d94
commit 16b5315845
117 changed files with 33239 additions and 1093 deletions

View File

@ -63,7 +63,7 @@
#include <typeinfo>
#endif
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {
// ------------------------------------------------------------------ //
@ -110,21 +110,12 @@ struct apply_impl<2,RP,Functor,void >
{
// LL
if (RP::inner_direction == RP::Left) {
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
m_func(i, j);
} }
*/
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1);
}
@ -134,21 +125,12 @@ struct apply_impl<2,RP,Functor,void >
}
// LR
else {
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
m_func(i, j);
} }
*/
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
m_func(offset_0 , offset_1);
}
@ -182,21 +164,12 @@ struct apply_impl<2,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
// Loop over size maxnumblocks until full range covered
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
m_func(Tag(), i, j);
} }
*/
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1);
}
@ -205,21 +178,12 @@ struct apply_impl<2,RP,Functor,Tag>
}
}
else {
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
m_func(Tag(), i, j);
} }
*/
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
m_func(Tag(), offset_0 , offset_1);
}
@ -255,15 +219,15 @@ struct apply_impl<3,RP,Functor,void >
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2);
}
@ -276,15 +240,15 @@ struct apply_impl<3,RP,Functor,void >
// LR
else {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
m_func(offset_0 , offset_1 , offset_2);
}
@ -319,15 +283,15 @@ struct apply_impl<3,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
@ -339,15 +303,15 @@ struct apply_impl<3,RP,Functor,Tag>
}
else {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
@ -398,19 +362,19 @@ struct apply_impl<4,RP,Functor,void >
const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
@ -436,19 +400,19 @@ struct apply_impl<4,RP,Functor,void >
const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
@ -498,19 +462,19 @@ struct apply_impl<4,RP,Functor,Tag>
const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
}
@ -535,19 +499,19 @@ struct apply_impl<4,RP,Functor,Tag>
const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
}
@ -612,23 +576,23 @@ struct apply_impl<5,RP,Functor,void >
const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
@ -667,23 +631,23 @@ struct apply_impl<5,RP,Functor,void >
const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
@ -747,23 +711,23 @@ struct apply_impl<5,RP,Functor,Tag>
const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
@ -802,23 +766,23 @@ struct apply_impl<5,RP,Functor,Tag>
const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
@ -895,27 +859,27 @@ struct apply_impl<6,RP,Functor,void >
const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
@ -967,27 +931,27 @@ struct apply_impl<6,RP,Functor,void >
const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
@ -1064,27 +1028,27 @@ struct apply_impl<6,RP,Functor,Tag>
const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
@ -1136,27 +1100,27 @@ struct apply_impl<6,RP,Functor,Tag>
const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
@ -1292,7 +1256,7 @@ protected:
const Functor m_func;
};
} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl
#endif
#endif
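
The (index_type)m_rp.m_lower[*] terms added throughout this file make the device-side tile iteration honor a non-zero lower bound of the MDRangePolicy rather than silently assuming zero. A minimal user-side sketch of what that enables (names are illustrative; assumes a build with lambda support on the Cuda backend):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int N0 = 100, N1 = 80;
    Kokkos::View<double**> a("a", N0, N1);
    // 2-D range with lower bounds {2,3}: with the offsets above, i runs over
    // [2,N0) and j over [3,N1) on the device, matching the host backends.
    Kokkos::parallel_for(
      Kokkos::MDRangePolicy<Kokkos::Rank<2>>({{2, 3}}, {{N0, N1}}),
      KOKKOS_LAMBDA(const int i, const int j) { a(i, j) = 1.0 * i + j; });
  }
  Kokkos::finalize();
  return 0;
}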

View File

@ -63,7 +63,7 @@
#include <typeinfo>
#endif
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {
namespace Refactor {
@ -2709,7 +2709,7 @@ private:
// ----------------------------------------------------------------------------------
} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl
#endif
#endif

View File

@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory()
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
//__launch_bounds__(maxTperB, minBperSM)
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver )
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
//__launch_bounds__(maxTperB, minBperSM)
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
@ -193,9 +193,14 @@ template < class DriverType
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
@ -216,21 +221,28 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>();
cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
@ -240,9 +252,11 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
}
};
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
@ -252,20 +266,136 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
// Fence before changing settings and copying closure
Kokkos::Cuda::fence();
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType >
<<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver );
// Invoke the driver function on the device
cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_local_memory< DriverType >
<<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
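
A standalone CUDA sketch (names are illustrative, not Kokkos code) of the pattern the hunks above implement: the policy's LaunchBounds values become compile-time template parameters of the launch kernel, so each instantiation carries its own __launch_bounds__ annotation instead of a commented-out one:

#include <cuda_runtime.h>

// Occupancy hints as template parameters, mirroring
// cuda_parallel_launch_local_memory<DriverType,maxTperB,minBperSM>.
template <class Driver, unsigned int MaxTperB, unsigned int MinBperSM>
__global__
__launch_bounds__(MaxTperB, MinBperSM)
void launch_driver(const Driver driver) { driver(); }

struct SaxpyDriver {
  float a; const float* x; float* y; int n;
  __device__ void operator()() const {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
  }
};

void run_saxpy(float a, const float* x, float* y, int n) {
  const dim3 block(256), grid((n + 255) / 256);
  // At most 256 threads per block, at least 4 resident blocks per SM.
  launch_driver<SaxpyDriver, 256, 4><<<grid, block>>>(SaxpyDriver{a, x, y, n});
}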

View File

@ -713,7 +713,7 @@ SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void *
// Iterate records to print orphaned memory ...
void
SharedAllocationRecord< Kokkos::CudaSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::CudaSpace & , bool detail )
{
SharedAllocationRecord< void , void > * r = & s_root_record ;
@ -751,7 +751,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, head.m_label
);
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
@ -781,7 +781,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
else {
snprintf( buffer , 256 , "Cuda [ 0 + 0 ]\n" );
}
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
@ -789,14 +789,14 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
void
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaUVMSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::CudaUVMSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaUVM" , & s_root_record , detail );
}
void
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
}
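
A usage sketch of the repaired routine (hedged: assumes a build where allocation tracking is active and the SharedAllocationRecord declarations are visible through the Cuda headers). With the s << buffer fixes above, the report actually lands in the caller-supplied stream instead of leaking to std::cout:

#include <Kokkos_Core.hpp>
#include <sstream>
#include <iostream>

void dump_cuda_allocations() {
  std::ostringstream report;
  Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>::
    print_records(report, Kokkos::CudaSpace(), /*detail=*/true);
  std::cout << report.str();   // caller decides where the text ultimately goes
}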

View File

@ -421,7 +421,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
std::string msg = ss.str();
Kokkos::abort( msg.c_str() );
}
if ( compiled_major != cudaProp.major || compiled_minor != cudaProp.minor ) {
if ( Kokkos::show_warnings() && (compiled_major != cudaProp.major || compiled_minor != cudaProp.minor) ) {
std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
<< compiled_major << "." << compiled_minor
<< " on device with compute capability "
@ -467,7 +467,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
if ( ! m_scratchUnifiedSupported ) {
if ( Kokkos::show_warnings() && ! m_scratchUnifiedSupported ) {
std::cout << "Kokkos::Cuda device "
<< cudaProp.name << " capability "
<< cudaProp.major << "." << cudaProp.minor
@ -545,7 +545,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
}
#ifdef KOKKOS_ENABLE_CUDA_UVM
if(!cuda_launch_blocking()) {
if( Kokkos::show_warnings() && !cuda_launch_blocking() ) {
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cout << " without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
std::cout << " The code must call Cuda::fence() after each kernel" << std::endl;
@ -561,7 +561,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
bool visible_devices_one=true;
if (env_visible_devices == 0) visible_devices_one=false;
if(!visible_devices_one && !force_device_alloc) {
if( Kokkos::show_warnings() && (!visible_devices_one && !force_device_alloc) ) {
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cout << " without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " << std::endl;
std::cout << " setting CUDA_VISIBLE_DEVICES." << std::endl;

View File

@ -381,12 +381,12 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
typedef Kokkos::MDRangePolicy< Traits ... > Policy ;
using RP = Policy;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
@ -402,7 +402,7 @@ public:
__device__
void operator()(void) const
{
Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
Kokkos::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
}
@ -858,14 +858,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
typedef Kokkos::MDRangePolicy< Traits ... > Policy ;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
@ -898,7 +898,7 @@ public:
size_type * m_scratch_flags ;
size_type * m_unified_space ;
typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
typedef typename Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
@ -913,7 +913,7 @@ public:
void
exec_range( reference_type update ) const
{
Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
}
inline
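
The specialization above now matches the non-Experimental policy spelling; a minimal user-side sketch (names are illustrative; assumes lambda support on the Cuda backend):

#include <Kokkos_Core.hpp>

double sum_entries(const Kokkos::View<double**>& a, const int N0, const int N1) {
  double total = 0.0;
  // Kokkos::MDRangePolicy (no Experimental namespace) now dispatches to the
  // ParallelReduce specialization shown above on Cuda.
  Kokkos::parallel_reduce(
    Kokkos::MDRangePolicy<Kokkos::Rank<2>>({{0, 0}}, {{N0, N1}}),
    KOKKOS_LAMBDA(const int i, const int j, double& lsum) { lsum += a(i, j); },
    total);
  return total;
}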

View File

@ -127,11 +127,11 @@ struct CudaTextureFetch {
template< class CudaMemorySpace >
inline explicit
CudaTextureFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
, Kokkos::Impl::SharedAllocationRecord< CudaMemorySpace , void > * record
)
: m_obj( record.template attach_texture_object< AliasType >() )
: m_obj( record->template attach_texture_object< AliasType >() )
, m_ptr( arg_ptr )
, m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
, m_offset( record->attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
{}
// Texture object spans the entire allocation.
@ -199,8 +199,8 @@ struct CudaLDGFetch {
template< class CudaMemorySpace >
inline explicit
CudaLDGFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
)
, Kokkos::Impl::SharedAllocationRecord<CudaMemorySpace,void>*
)
: m_ptr( arg_ptr )
{}
@ -285,7 +285,21 @@ public:
// Assignment of texture = non-texture requires creation of a texture object
// which can only occur on the host. In addition, 'get_record' is only valid
// if called in a host execution space
return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
typedef typename Traits::memory_space memory_space ;
typedef typename Impl::SharedAllocationRecord<memory_space,void> record ;
record * const r = arg_tracker.template get_record< memory_space >();
#if ! defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC )
if ( 0 == r ) {
Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
}
#endif
return handle_type( arg_data_ptr , r );
#else
Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
return handle_type();
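
The new abort documents a user-side requirement of the texture path: the handle is built from the allocation's tracking record, so the source View must be Kokkos-allocated. A minimal sketch (assumes KOKKOS_ENABLE_CUDA_LDG_INTRINSIC is not defined, i.e. the texture-object branch above is taken):

#include <Kokkos_Core.hpp>

void make_random_access_copy() {
  Kokkos::View<const double*, Kokkos::CudaSpace,
               Kokkos::MemoryTraits<Kokkos::RandomAccess>> cv;
  Kokkos::View<double*, Kokkos::CudaSpace> v("v", 1000);  // Kokkos-allocated
  cv = v;  // OK: a tracking record exists, so a texture object can be attached
  // An unmanaged View wrapping a raw device pointer has no record and would
  // hit the Kokkos::abort path above instead.
}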

View File

@ -48,50 +48,52 @@ namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::Cuda
class ParallelFor< FunctorType
, Kokkos::WorkGraphPolicy< Traits ... >
, Kokkos::Cuda
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Cuda,
Traits ...
>
{
public:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Cuda, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
typedef ParallelFor<FunctorType, Policy, Kokkos::Cuda> Self ;
private:
template< class TagType >
__device__
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
__device__
__device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
__device__ inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{} ; m_functor( t , w ); }
public:
__device__
inline
void operator()() const {
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
__device__ inline
void operator()() const noexcept
{
if ( 0 == ( threadIdx.y % 16 ) ) {
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
}
}
inline
void execute()
@ -108,9 +110,9 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl
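
A hedged usage sketch of the policy this specialization serves; the graph_type alias and the graph-taking constructor are assumptions inferred from the pop_work()/completed_work() calls above, not confirmed by this diff:

#include <Kokkos_Core.hpp>
#include <cstdint>

struct ProcessItem {
  KOKKOS_INLINE_FUNCTION void operator()(const std::int32_t w) const {
    // ... process work item w once its dependences have completed ...
  }
};

void run_work_graph() {
  using policy_type = Kokkos::WorkGraphPolicy<Kokkos::Cuda>;
  // graph: a Kokkos::Crs describing the dependences between work items
  // (see Kokkos_WorkGraphPolicy.hpp for the exact edge convention).
  policy_type::graph_type graph /* = ... build the dependence graph ... */;
  Kokkos::parallel_for(policy_type(graph), ProcessItem());
}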

View File

@ -55,7 +55,7 @@
#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
#endif
namespace Kokkos { namespace Experimental {
namespace Kokkos {
// ------------------------------------------------------------------ //
@ -331,11 +331,23 @@ struct MDRangePolicy
}
};
} // namespace Kokkos
// For backward compatibility
namespace Kokkos { namespace Experimental {
using Kokkos::MDRangePolicy;
using Kokkos::Rank;
using Kokkos::Iterate;
} } // end Kokkos::Experimental
// ------------------------------------------------------------------ //
// ------------------------------------------------------------------ //
//md_parallel_for - deprecated use parallel_for
// ------------------------------------------------------------------ //
namespace Kokkos { namespace Experimental {
template <typename MDRange, typename Functor, typename Enable = void>
void md_parallel_for( MDRange const& range
, Functor const& f
@ -347,7 +359,7 @@ void md_parallel_for( MDRange const& range
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
@ -365,7 +377,7 @@ void md_parallel_for( const std::string& str
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
@ -385,7 +397,7 @@ void md_parallel_for( const std::string& str
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
@ -400,7 +412,7 @@ void md_parallel_for( MDRange const& range
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
#endif
@ -421,7 +433,7 @@ void md_parallel_reduce( MDRange const& range
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
@ -439,7 +451,7 @@ void md_parallel_reduce( const std::string& str
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
@ -448,7 +460,7 @@ void md_parallel_reduce( const std::string& str
// Cuda - md_parallel_reduce not implemented - use parallel_reduce
}} // namespace Kokkos::Experimental
} } // namespace Kokkos::Experimental
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
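
Because the using-declarations above turn the Experimental names into aliases, both spellings refer to the same type: existing code keeps compiling while new code drops the namespace (and calls parallel_for/parallel_reduce instead of the deprecated md_parallel_* wrappers). A compile-time check illustrating this:

#include <Kokkos_Core.hpp>
#include <type_traits>

static_assert(
  std::is_same< Kokkos::MDRangePolicy< Kokkos::Rank<2> >
              , Kokkos::Experimental::MDRangePolicy< Kokkos::Rank<2> > >::value,
  "Kokkos::Experimental::MDRangePolicy is an alias of Kokkos::MDRangePolicy");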

View File

@ -81,10 +81,10 @@ struct IndexType
/**\brief Specify Launch Bounds for CUDA execution.
*
* The "best" defaults may be architecture specific.
* If no launch bounds specified then do not set launch bounds.
*/
template< unsigned int maxT = 1024 /* Max threads per block */
, unsigned int minB = 1 /* Min blocks per SM */
template< unsigned int maxT = 0 /* Max threads per block */
, unsigned int minB = 0 /* Min blocks per SM */
>
struct LaunchBounds
{
@ -280,6 +280,9 @@ struct MemorySpaceAccess {
enum { deepcopy = assignable };
};
}} // namespace Kokkos::Impl
namespace Kokkos {
/**\brief Can AccessSpace access MemorySpace ?
*
@ -358,6 +361,13 @@ public:
>::type space ;
};
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
using Kokkos::SpaceAccessibility ; // For backward compatibility
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
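
With the default changed from LaunchBounds<1024,1> to LaunchBounds<0,0>, a policy that does not name launch bounds no longer forces a __launch_bounds__ annotation onto the generated kernel. An illustrative sketch of both spellings (names are examples; assumes lambda support):

#include <Kokkos_Core.hpp>

void axpy(const double alpha, Kokkos::View<double*> x, Kokkos::View<double*> y) {
  const int n = static_cast<int>(x.dimension_0());

  // No LaunchBounds trait: with the new <0,0> default, nothing is imposed.
  Kokkos::parallel_for(Kokkos::RangePolicy<>(0, n),
    KOKKOS_LAMBDA(const int i) { y(i) += alpha * x(i); });

  // Explicit occupancy hint: at most 256 threads/block, at least 4 blocks/SM.
  Kokkos::parallel_for(
    Kokkos::RangePolicy< Kokkos::LaunchBounds<256, 4> >(0, n),
    KOKKOS_LAMBDA(const int i) { y(i) += alpha * x(i); });
}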

View File

@ -99,13 +99,17 @@ struct InitArguments {
int num_threads;
int num_numa;
int device_id;
bool disable_warnings;
InitArguments( int nt = -1
, int nn = -1
, int dv = -1)
: num_threads( nt )
, num_numa( nn )
, device_id( dv )
, int dv = -1
, bool dw = false
)
: num_threads{ nt }
, num_numa{ nn }
, device_id{ dv }
, disable_warnings{ dw }
{}
};
@ -113,6 +117,10 @@ void initialize(int& narg, char* arg[]);
void initialize(const InitArguments& args = InitArguments());
bool is_initialized() noexcept;
bool show_warnings() noexcept;
/** \brief Finalize the spaces that were initialized via Kokkos::initialize */
void finalize();
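
A minimal sketch of the new knob, using only the declarations added above:

#include <Kokkos_Core.hpp>

int main() {
  Kokkos::InitArguments args;
  args.disable_warnings = true;   // suppress output gated by Kokkos::show_warnings()
  Kokkos::initialize(args);
  // ... application ...
  Kokkos::finalize();
  return 0;
}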

View File

@ -45,7 +45,6 @@
#define KOKKOS_CRS_HPP
namespace Kokkos {
namespace Experimental {
/// \class Crs
/// \brief Compressed row storage array.
@ -164,7 +163,7 @@ void transpose_crs(
Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in);
}} // namespace Kokkos::Experimental
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
@ -172,7 +171,6 @@ void transpose_crs(
namespace Kokkos {
namespace Impl {
namespace Experimental {
template <class InCrs, class OutCounts>
class GetCrsTransposeCounts {
@ -277,14 +275,13 @@ class FillCrsTransposeEntries {
}
};
}}} // namespace Kokkos::Impl::Experimental
}} // namespace Kokkos::Impl
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
template< class OutCounts,
class DataType,
@ -297,8 +294,7 @@ void get_crs_transpose_counts(
std::string const& name) {
using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>;
out = OutCounts(name, in.numRows());
Kokkos::Impl::Experimental::
GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
Kokkos::Impl::GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
}
template< class OutRowMap,
@ -308,8 +304,7 @@ typename OutRowMap::value_type get_crs_row_map_from_counts(
InCounts const& in,
std::string const& name) {
out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
Kokkos::Impl::Experimental::
CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
Kokkos::Impl::CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
return functor.execute();
}
@ -326,32 +321,37 @@ void transpose_crs(
typedef View<SizeType*, memory_space> counts_type ;
{
counts_type counts;
Kokkos::Experimental::get_crs_transpose_counts(counts, in);
Kokkos::Experimental::get_crs_row_map_from_counts(out.row_map, counts,
Kokkos::get_crs_transpose_counts(counts, in);
Kokkos::get_crs_row_map_from_counts(out.row_map, counts,
"tranpose_row_map");
}
out.entries = decltype(out.entries)("transpose_entries", in.entries.size());
Kokkos::Impl::Experimental::
Kokkos::Impl::
FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
}
template< class CrsType,
class Functor>
struct CountAndFill {
class Functor,
class ExecutionSpace = typename CrsType::execution_space>
struct CountAndFillBase;
template< class CrsType,
class Functor,
class ExecutionSpace>
struct CountAndFillBase {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using entries_type = typename CrsType::entries_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
KOKKOS_INLINE_FUNCTION void operator()(Count, size_type i) const {
inline void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
inline void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
/* we don't want to access entries(entries.size()), even if its just to get its
address and never use it.
@ -363,13 +363,63 @@ struct CountAndFill {
nullptr : (&(m_crs.entries(j)));
m_functor(i, fill);
}
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
CountAndFillBase(CrsType& crs, Functor const& f):
m_crs(crs),
m_functor(f)
{}
};
#if defined( KOKKOS_ENABLE_CUDA )
template< class CrsType,
class Functor>
struct CountAndFillBase<CrsType, Functor, Kokkos::Cuda> {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
__device__ inline void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
__device__ inline void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
/* we don't want to access entries(entries.size()), even if its just to get its
address and never use it.
this can happen when row (i) is empty and all rows after it are also empty.
we could compare to row_map(i + 1), but that is a read from global memory,
whereas dimension_0() should be part of the View in registers (or constant memory) */
data_type* fill =
(j == static_cast<decltype(j)>(m_crs.entries.dimension_0())) ?
nullptr : (&(m_crs.entries(j)));
m_functor(i, fill);
}
CountAndFillBase(CrsType& crs, Functor const& f):
m_crs(crs),
m_functor(f)
{}
};
#endif
template< class CrsType,
class Functor>
struct CountAndFill : public CountAndFillBase<CrsType, Functor> {
using base_type = CountAndFillBase<CrsType, Functor>;
using typename base_type::data_type;
using typename base_type::size_type;
using typename base_type::counts_type;
using typename base_type::Count;
using typename base_type::Fill;
using entries_type = typename CrsType::entries_type;
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
base_type(crs, f)
{
using execution_space = typename CrsType::execution_space;
m_counts = counts_type("counts", nrows);
this->m_counts = counts_type("counts", nrows);
{
using count_policy_type = RangePolicy<size_type, execution_space, Count>;
using count_closure_type =
@ -377,10 +427,10 @@ struct CountAndFill {
const count_closure_type closure(*this, count_policy_type(0, nrows));
closure.execute();
}
auto nentries = Kokkos::Experimental::
get_crs_row_map_from_counts(m_crs.row_map, m_counts);
m_counts = counts_type();
m_crs.entries = entries_type("entries", nentries);
auto nentries = Kokkos::
get_crs_row_map_from_counts(this->m_crs.row_map, this->m_counts);
this->m_counts = counts_type();
this->m_crs.entries = entries_type("entries", nentries);
{
using fill_policy_type = RangePolicy<size_type, execution_space, Fill>;
using fill_closure_type =
@ -388,7 +438,7 @@ struct CountAndFill {
const fill_closure_type closure(*this, fill_policy_type(0, nrows));
closure.execute();
}
crs = m_crs;
crs = this->m_crs;
}
};
@ -398,9 +448,9 @@ void count_and_fill_crs(
CrsType& crs,
typename CrsType::size_type nrows,
Functor const& f) {
Kokkos::Experimental::CountAndFill<CrsType, Functor>(crs, nrows, f);
Kokkos::CountAndFill<CrsType, Functor>(crs, nrows, f);
}
}} // namespace Kokkos::Experimental
} // namespace Kokkos
#endif /* #define KOKKOS_CRS_HPP */
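A minimal usage sketch of the count-and-fill interface defined above (illustrative, not part of the diff; the functor, graph type, and row count are made up). The functor is called once per row with a null pointer to report its entry count, then again with a pointer into the entries array to write the entries.
// Hypothetical functor: row i receives the two entries {i, i+1}.
struct TwoPerRow {
  KOKKOS_INLINE_FUNCTION
  std::int32_t operator()( std::int32_t row , std::int32_t * fill ) const {
    if ( fill != nullptr ) { fill[0] = row ; fill[1] = row + 1 ; }
    return 2 ;
  }
};
void build_example_graph() {
  Kokkos::Crs< std::int32_t , Kokkos::DefaultExecutionSpace , void , std::int32_t > graph ;
  Kokkos::count_and_fill_crs( graph , /* nrows = */ 100 , TwoPerRow() ) ;
}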
View File
@ -379,12 +379,13 @@ Impl::PerThreadValue PerThread(const int& arg);
* uses variadic templates. Any of the template arguments can
* be omitted.
*
* Possible Template arguments and there default values:
* Possible Template arguments and their default values:
* ExecutionSpace (DefaultExecutionSpace): where to execute code. Must be enabled.
* WorkTag (none): Tag which is used as the first argument for the functor operator.
* Schedule<Type> (Schedule<Static>): Scheduling Policy (Dynamic, or Static).
* IndexType<Type> (IndexType<ExecutionSpace::size_type>): Integer Index type used to iterate over the Index space.
* LaunchBounds<int,int> (LaunchBounds<1024,1>: Launch Bounds for CUDA compilation.
* LaunchBounds<unsigned,unsigned> Launch Bounds for CUDA compilation,
* default of LaunchBounds<0,0> indicates no launch bounds specified.
*/
template< class ... Properties>
class TeamPolicy: public
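To illustrate the documented properties, a fully specified policy could be written as in the sketch below (hedged, not taken from this commit; the tag, league size, and chosen properties are made up, and any subset of them may be omitted or reordered).
struct MyTag {};   // hypothetical work tag
void make_policy_example() {
  Kokkos::TeamPolicy< Kokkos::DefaultExecutionSpace
                    , Kokkos::Schedule< Kokkos::Dynamic >
                    , Kokkos::IndexType< int >
                    , MyTag
                    > policy( /* league_size = */ 64 , Kokkos::AUTO ) ;
  (void) policy ;
}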
View File
@ -274,18 +274,14 @@
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
#endif
#if ! defined( KOKKOS_MEMORY_ALIGNMENT )
#define KOKKOS_MEMORY_ALIGNMENT 64
#endif
#define KOKKOS_RESTRICT __restrict__
#ifndef KOKKOS_ALIGN
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#endif
#ifndef KOKKOS_ALIGN_PTR
#define KOKKOS_ALIGN_PTR(size) __attribute__((align_value(size)))
#endif
#ifndef KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 64
#ifndef KOKKOS_IMPL_ALIGN_PTR
#define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((align_value(size)))
#endif
#if ( 1400 > KOKKOS_COMPILER_INTEL )
@ -351,6 +347,11 @@
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#if !defined( KOKKOS_IMPL_ALIGN_PTR )
#define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((aligned(size)))
#endif
#endif
//----------------------------------------------------------------------------
@ -426,16 +427,16 @@
//----------------------------------------------------------------------------
// Define Macro for alignment:
#if !defined KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 16
#if ! defined( KOKKOS_MEMORY_ALIGNMENT )
#define KOKKOS_MEMORY_ALIGNMENT 16
#endif
#if !defined( KOKKOS_ALIGN )
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#if ! defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
#define KOKKOS_MEMORY_ALIGNMENT_THRESHOLD 4
#endif
#if !defined( KOKKOS_ALIGN_PTR )
#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
#if !defined( KOKKOS_IMPL_ALIGN_PTR )
#define KOKKOS_IMPL_ALIGN_PTR(size) /* */
#endif
//----------------------------------------------------------------------------
View File
@ -111,6 +111,10 @@ private:
public:
/**\brief The maximum size of a superblock and block */
enum : uint32_t { max_superblock_size = 1LU << 31 /* 2 gigabytes */ };
enum : uint32_t { max_block_per_superblock = max_bit_count };
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
@ -206,7 +210,7 @@ public:
const uint32_t * sb_state_ptr = sb_state_array ;
s << "pool_size(" << ( size_t(m_sb_count) << m_sb_size_lg2 ) << ")"
<< " superblock_size(" << ( 1 << m_sb_size_lg2 ) << ")" << std::endl ;
<< " superblock_size(" << ( 1LU << m_sb_size_lg2 ) << ")" << std::endl ;
for ( int32_t i = 0 ; i < m_sb_count
; ++i , sb_state_ptr += m_sb_state_size ) {
@ -215,7 +219,7 @@ public:
const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
const uint32_t block_count = 1 << block_count_lg2 ;
const uint32_t block_count = 1u << block_count_lg2 ;
const uint32_t block_used = (*sb_state_ptr) & state_used_mask ;
s << "Superblock[ " << i << " / " << m_sb_count << " ] {"
@ -284,43 +288,87 @@ public:
{
const uint32_t int_align_lg2 = 3 ; /* align as int[8] */
const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
const uint32_t default_min_block_size = 1u << 6 ; /* 64 bytes */
const uint32_t default_max_block_size = 1u << 12 ;/* 4k bytes */
const uint32_t default_min_superblock_size = 1u << 20 ;/* 1M bytes */
// Constraints and defaults:
// min_block_alloc_size <= max_block_alloc_size
// max_block_alloc_size <= min_superblock_size
// min_superblock_size <= min_total_alloc_size
//--------------------------------------------------
// Default block and superblock sizes:
const uint32_t MIN_BLOCK_SIZE = 1u << 6 /* 64 bytes */ ;
const uint32_t MAX_BLOCK_SIZE = 1u << 12 /* 4k bytes */ ;
if ( 0 == min_block_alloc_size ) {
// Default all sizes:
if ( 0 == min_block_alloc_size ) min_block_alloc_size = MIN_BLOCK_SIZE ;
min_superblock_size =
std::min( size_t(default_min_superblock_size)
, min_total_alloc_size );
min_block_alloc_size =
std::min( size_t(default_min_block_size)
, min_superblock_size );
max_block_alloc_size =
std::min( size_t(default_max_block_size)
, min_superblock_size );
}
else if ( 0 == min_superblock_size ) {
// Choose superblock size as minimum of:
// max_block_per_superblock * min_block_size
// max_superblock_size
// min_total_alloc_size
const size_t max_superblock =
min_block_alloc_size * max_block_per_superblock ;
min_superblock_size =
std::min( max_superblock ,
std::min( size_t(max_superblock_size)
, min_total_alloc_size ) );
}
if ( 0 == max_block_alloc_size ) {
max_block_alloc_size = MAX_BLOCK_SIZE ;
// Upper bound of total allocation size
max_block_alloc_size = std::min( size_t(max_block_alloc_size)
, min_total_alloc_size );
// Lower bound of minimum block size
max_block_alloc_size = std::max( max_block_alloc_size
, min_block_alloc_size );
max_block_alloc_size = min_superblock_size ;
}
if ( 0 == min_superblock_size ) {
min_superblock_size = max_block_alloc_size ;
//--------------------------------------------------
// Upper bound of total allocation size
min_superblock_size = std::min( size_t(min_superblock_size)
, min_total_alloc_size );
{
/* Enforce size constraints:
* min_block_alloc_size <= max_block_alloc_size
* max_block_alloc_size <= min_superblock_size
* min_superblock_size <= max_superblock_size
* min_superblock_size <= min_total_alloc_size
* min_superblock_size <= min_block_alloc_size *
* max_block_per_superblock
*/
// Lower bound of maximum block size
min_superblock_size = std::max( min_superblock_size
, max_block_alloc_size );
const size_t max_superblock =
min_block_alloc_size * max_block_per_superblock ;
if ( ( size_t(max_superblock_size) < min_superblock_size ) ||
( min_total_alloc_size < min_superblock_size ) ||
( max_superblock < min_superblock_size ) ||
( min_superblock_size < max_block_alloc_size ) ||
( max_block_alloc_size < min_block_alloc_size ) ) {
#if 1
printf( " MemoryPool min_block_alloc_size(%ld) max_block_alloc_size(%ld) min_superblock_size(%ld) min_total_alloc_size(%ld) ; max_superblock_size(%ld) max_block_per_superblock(%ld)\n"
, min_block_alloc_size
, max_block_alloc_size
, min_superblock_size
, min_total_alloc_size
, size_t(max_superblock_size)
, size_t(max_block_per_superblock)
);
#endif
Kokkos::abort("Kokkos MemoryPool size constraint violation");
}
}
//--------------------------------------------------
// Block and superblock size is power of two:
// Maximum value is 'max_superblock_size'
m_min_block_size_lg2 =
Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
@ -331,45 +379,26 @@ public:
m_sb_size_lg2 =
Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);
// Constraints:
// m_min_block_size_lg2 <= m_max_block_size_lg2 <= m_sb_size_lg2
// m_sb_size_lg2 <= m_min_block_size + max_bit_count_lg2
{
// number of superblocks is multiple of superblock size that
// can hold min_total_alloc_size.
if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_sb_size_lg2 ) {
m_min_block_size_lg2 = m_sb_size_lg2 - max_bit_count_lg2 ;
}
if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_max_block_size_lg2 ) {
m_min_block_size_lg2 = m_max_block_size_lg2 - max_bit_count_lg2 ;
}
if ( m_max_block_size_lg2 < m_min_block_size_lg2 ) {
m_max_block_size_lg2 = m_min_block_size_lg2 ;
}
if ( m_sb_size_lg2 < m_max_block_size_lg2 ) {
m_sb_size_lg2 = m_max_block_size_lg2 ;
const uint64_t sb_size_mask = ( 1LU << m_sb_size_lg2 ) - 1 ;
m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;
}
// At least 32 minimum size blocks in a superblock
{
// Any superblock can be assigned to the smallest size block
// Size the block bitset to maximum number of blocks
if ( m_sb_size_lg2 < m_min_block_size_lg2 + 5 ) {
m_sb_size_lg2 = m_min_block_size_lg2 + 5 ;
const uint32_t max_block_count_lg2 =
m_sb_size_lg2 - m_min_block_size_lg2 ;
m_sb_state_size =
( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;
}
// number of superblocks is multiple of superblock size that
// can hold min_total_alloc_size.
const uint32_t sb_size_mask = ( 1u << m_sb_size_lg2 ) - 1 ;
m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;
// Any superblock can be assigned to the smallest size block
// Size the block bitset to maximum number of blocks
const uint32_t max_block_count_lg2 =
m_sb_size_lg2 - m_min_block_size_lg2 ;
m_sb_state_size =
( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;
// Array of all superblock states
const size_t all_sb_state_size =
@ -454,7 +483,7 @@ private:
* Restrict lower bound to minimum block size.
*/
KOKKOS_FORCEINLINE_FUNCTION
unsigned get_block_size_lg2( unsigned n ) const noexcept
uint32_t get_block_size_lg2( uint32_t n ) const noexcept
{
const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains( n );
@ -463,11 +492,12 @@ private:
public:
/* Return 0 for invalid block size */
KOKKOS_INLINE_FUNCTION
uint32_t allocate_block_size( uint32_t alloc_size ) const noexcept
uint32_t allocate_block_size( uint64_t alloc_size ) const noexcept
{
return alloc_size <= (1UL << m_max_block_size_lg2)
? ( 1u << get_block_size_lg2( alloc_size ) )
? ( 1UL << get_block_size_lg2( uint32_t(alloc_size) ) )
: 0 ;
}
@ -485,246 +515,253 @@ public:
void * allocate( size_t alloc_size
, int32_t attempt_limit = 1 ) const noexcept
{
if ( size_t(1LU << m_max_block_size_lg2) < alloc_size ) {
Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
}
if ( 0 == alloc_size ) return (void*) 0 ;
void * p = 0 ;
const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
if ( block_size_lg2 <= m_max_block_size_lg2 ) {
// Allocation will fit within a superblock
// that has block sizes ( 1 << block_size_lg2 )
// Allocation will fit within a superblock
// that has block sizes ( 1 << block_size_lg2 )
const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
const uint32_t block_state = block_count_lg2 << state_shift ;
const uint32_t block_count = 1u << block_count_lg2 ;
const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
const uint32_t block_state = block_count_lg2 << state_shift ;
const uint32_t block_count = 1u << block_count_lg2 ;
// Superblock hints for this block size:
// hint_sb_id_ptr[0] is the dynamically changing hint
// hint_sb_id_ptr[1] is the static start point
// Superblock hints for this block size:
// hint_sb_id_ptr[0] is the dynamically changing hint
// hint_sb_id_ptr[1] is the static start point
volatile uint32_t * const hint_sb_id_ptr
= m_sb_state_array /* memory pool state array */
+ m_hint_offset /* offset to hint portion of array */
+ HINT_PER_BLOCK_SIZE /* number of hints per block size */
* ( block_size_lg2 - m_min_block_size_lg2 ); /* block size id */
volatile uint32_t * const hint_sb_id_ptr
= m_sb_state_array /* memory pool state array */
+ m_hint_offset /* offset to hint portion of array */
+ HINT_PER_BLOCK_SIZE /* number of hints per block size */
* ( block_size_lg2 - m_min_block_size_lg2 ); /* block size id */
const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] );
const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] );
// Fast query clock register 'tic' to pseudo-randomize
// the guess for which block within a superblock should
// be claimed. If not available then a search occurs.
// Fast query clock register 'tic' to pseudo-randomize
// the guess for which block within a superblock should
// be claimed. If not available then a search occurs.
const uint32_t block_id_hint =
(uint32_t)( Kokkos::Impl::clock_tic()
const uint32_t block_id_hint =
(uint32_t)( Kokkos::Impl::clock_tic()
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
// Spread out potentially concurrent access
// by threads within a warp or thread block.
+ ( threadIdx.x + blockDim.x * threadIdx.y )
// Spread out potentially concurrent access
// by threads within a warp or thread block.
+ ( threadIdx.x + blockDim.x * threadIdx.y )
#endif
);
);
// expected state of superblock for allocation
uint32_t sb_state = block_state ;
// expected state of superblock for allocation
uint32_t sb_state = block_state ;
int32_t sb_id = -1 ;
int32_t sb_id = -1 ;
volatile uint32_t * sb_state_array = 0 ;
volatile uint32_t * sb_state_array = 0 ;
while ( attempt_limit ) {
while ( attempt_limit ) {
int32_t hint_sb_id = -1 ;
int32_t hint_sb_id = -1 ;
if ( sb_id < 0 ) {
if ( sb_id < 0 ) {
// No superblock specified, try the hint for this block size
// No superblock specified, try the hint for this block size
sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
}
// Require:
// 0 <= sb_id
// sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
// This superblock state is as expected, for the moment.
// Attempt to claim a bit. The attempt updates the state
// so have already made sure the state header is as expected.
const uint32_t count_lg2 = sb_state >> state_shift ;
const uint32_t mask = ( 1u << count_lg2 ) - 1 ;
const Kokkos::pair<int,int> result =
CB::acquire_bounded_lg2( sb_state_array
, count_lg2
, block_id_hint & mask
, sb_state
);
// If result.first < 0 then failed to acquire
// due to either full or buffer was wrong state.
// Could be wrong state if a deallocation raced the
// superblock to empty before the acquire could succeed.
if ( 0 <= result.first ) { // acquired a bit
const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;
// Set the allocated block pointer
p = ((char*)( m_sb_state_array + m_data_offset ))
+ ( uint64_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+ ( uint64_t(result.first) << size_lg2 ); // block memory
#if 0
printf( " MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, alloc_size
, sb_id
, sb_state
, (1u << size_lg2)
, (1u << count_lg2)
, result.first
, result.second );
#endif
break ; // Success
}
}
//------------------------------------------------------------------
// Arrive here if failed to acquire a block.
// Must find a new superblock.
// Start searching at designated index for this block size.
// Look for superblock that, in preferential order,
// 1) part-full superblock of this block size
// 2) empty superblock to claim for this block size
// 3) part-full superblock of the next larger block size
sb_state = block_state ; // Expect to find the desired state
sb_id = -1 ;
bool update_hint = false ;
int32_t sb_id_empty = -1 ;
int32_t sb_id_large = -1 ;
uint32_t sb_state_large = 0 ;
sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
for ( int32_t i = 0 , id = sb_id_begin ; i < m_sb_count ; ++i ) {
// Query state of the candidate superblock.
// Note that the state may change at any moment
// as concurrent allocations and deallocations occur.
const uint32_t full_state = *sb_state_array ;
const uint32_t used = full_state & state_used_mask ;
const uint32_t state = full_state & state_header_mask ;
if ( state == block_state ) {
// Superblock is assigned to this block size
if ( used < block_count ) {
// There is room to allocate one block
sb_id = id ;
// Is there room to allocate more than one block?
update_hint = used + 1 < block_count ;
break ;
}
}
else if ( 0 == used ) {
// Superblock is empty
if ( -1 == sb_id_empty ) {
// Superblock is not assigned to this block size
// and is the first empty superblock encountered.
// Save this id to use if a partfull superblock is not found.
sb_id_empty = id ;
}
}
else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
( -1 == sb_id_large /* have not found a larger */ ) &&
( state < block_state /* a larger block */ ) &&
// is not full:
( used < ( 1u << ( state >> state_shift ) ) ) ) {
// First superblock encountered that is
// larger than this block size and
// has room for an allocation.
// Save this id to use if a partfull or empty superblock is not found
sb_id_large = id ;
sb_state_large = state ;
}
// Iterate around the superblock array:
if ( ++id < m_sb_count ) {
sb_state_array += m_sb_state_size ;
}
else {
id = 0 ;
sb_state_array = m_sb_state_array ;
}
}
// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
if ( sb_id < 0 ) {
// Did not find a partfull superblock for this block size.
if ( 0 <= sb_id_empty ) {
// Found first empty superblock following designated superblock
// Attempt to claim it for this block size.
// If the claim fails assume that another thread claimed it
// for this block size and try to use it anyway,
// but do not update hint.
sb_id = sb_id_empty ;
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
// If successfully changed assignment of empty superblock 'sb_id'
// to this block_size then update the hint.
const uint32_t state_empty = state_header_mask & *sb_state_array ;
// If this thread claims the empty block then update the hint
update_hint =
state_empty ==
Kokkos::atomic_compare_exchange
(sb_state_array,state_empty,block_state);
}
else if ( 0 <= sb_id_large ) {
// Found a larger superblock with space available
sb_id = sb_id_large ;
sb_state = sb_state_large ;
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
}
// Require:
// 0 <= sb_id
// sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
// This superblock state is as expected, for the moment.
// Attempt to claim a bit. The attempt updates the state
// so have already made sure the state header is as expected.
const uint32_t count_lg2 = sb_state >> state_shift ;
const uint32_t mask = ( 1u << count_lg2 ) - 1 ;
const Kokkos::pair<int,int> result =
CB::acquire_bounded_lg2( sb_state_array
, count_lg2
, block_id_hint & mask
, sb_state
);
// If result.first < 0 then failed to acquire
// due to either full or buffer was wrong state.
// Could be wrong state if a deallocation raced the
// superblock to empty before the acquire could succeed.
if ( 0 <= result.first ) { // acquired a bit
const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;
// Set the allocated block pointer
p = ((char*)( m_sb_state_array + m_data_offset ))
+ ( uint32_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+ ( result.first << size_lg2 ); // block memory
break ; // Success
}
// printf(" acquire count_lg2(%d) sb_state(0x%x) sb_id(%d) result(%d,%d)\n" , count_lg2 , sb_state , sb_id , result.first , result.second );
else {
// Did not find a potentially usable superblock
--attempt_limit ;
}
//------------------------------------------------------------------
// Arrive here if failed to acquire a block.
// Must find a new superblock.
}
// Start searching at designated index for this block size.
// Look for superblock that, in preferential order,
// 1) part-full superblock of this block size
// 2) empty superblock to claim for this block size
// 3) part-full superblock of the next larger block size
sb_state = block_state ; // Expect to find the desired state
sb_id = -1 ;
bool update_hint = false ;
int32_t sb_id_empty = -1 ;
int32_t sb_id_large = -1 ;
uint32_t sb_state_large = 0 ;
sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
for ( int32_t i = 0 , id = sb_id_begin ; i < m_sb_count ; ++i ) {
// Query state of the candidate superblock.
// Note that the state may change at any moment
// as concurrent allocations and deallocations occur.
const uint32_t full_state = *sb_state_array ;
const uint32_t used = full_state & state_used_mask ;
const uint32_t state = full_state & state_header_mask ;
if ( state == block_state ) {
// Superblock is assigned to this block size
if ( used < block_count ) {
// There is room to allocate one block
sb_id = id ;
// Is there room to allocate more than one block?
update_hint = used + 1 < block_count ;
break ;
}
}
else if ( 0 == used ) {
// Superblock is empty
if ( -1 == sb_id_empty ) {
// Superblock is not assigned to this block size
// and is the first empty superblock encountered.
// Save this id to use if a partfull superblock is not found.
sb_id_empty = id ;
}
}
else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
( -1 == sb_id_large /* have not found a larger */ ) &&
( state < block_state /* a larger block */ ) &&
// is not full:
( used < ( 1u << ( state >> state_shift ) ) ) ) {
// First superblock encountered that is
// larger than this block size and
// has room for an allocation.
// Save this id to use if a partfull or empty superblock is not found
sb_id_large = id ;
sb_state_large = state ;
}
// Iterate around the superblock array:
if ( ++id < m_sb_count ) {
sb_state_array += m_sb_state_size ;
}
else {
id = 0 ;
sb_state_array = m_sb_state_array ;
}
}
// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
if ( sb_id < 0 ) {
// Did not find a partfull superblock for this block size.
if ( 0 <= sb_id_empty ) {
// Found first empty superblock following designated superblock
// Attempt to claim it for this block size.
// If the claim fails assume that another thread claimed it
// for this block size and try to use it anyway,
// but do not update hint.
sb_id = sb_id_empty ;
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
// If successfully changed assignment of empty superblock 'sb_id'
// to this block_size then update the hint.
const uint32_t state_empty = state_header_mask & *sb_state_array ;
// If this thread claims the empty block then update the hint
update_hint =
state_empty ==
Kokkos::atomic_compare_exchange
(sb_state_array,state_empty,block_state);
}
else if ( 0 <= sb_id_large ) {
// Found a larger superblock with space available
sb_id = sb_id_large ;
sb_state = sb_state_large ;
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
}
else {
// Did not find a potentially usable superblock
--attempt_limit ;
}
}
if ( update_hint ) {
Kokkos::atomic_compare_exchange
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
} // end allocation attempt loop
//--------------------------------------------------------------------
}
else {
Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
}
if ( update_hint ) {
Kokkos::atomic_compare_exchange
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
} // end allocation attempt loop
//--------------------------------------------------------------------
return p ;
}
@ -765,7 +802,7 @@ public:
const uint32_t block_size_lg2 =
m_sb_size_lg2 - ( block_state >> state_shift );
ok_block_aligned = 0 == ( d & ( ( 1 << block_size_lg2 ) - 1 ) );
ok_block_aligned = 0 == ( d & ( ( 1UL << block_size_lg2 ) - 1 ) );
if ( ok_block_aligned ) {
@ -773,31 +810,70 @@ public:
// mask into superblock and then shift down for block index
const uint32_t bit =
( d & ( ptrdiff_t( 1 << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
( d & ( ptrdiff_t( 1LU << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
const int result =
CB::release( sb_state_array , bit , block_state );
ok_dealloc_once = 0 <= result ;
// printf(" deallocate from sb_id(%d) result(%d) bit(%d) state(0x%x)\n"
// , sb_id
// , result
// , uint32_t(d >> block_size_lg2)
// , *sb_state_array );
#if 0
printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, sb_id
, (1u << block_size_lg2)
, (1u << (m_sb_size_lg2 - block_size_lg2))
, bit
, result );
#endif
}
}
if ( ! ok_contains || ! ok_block_aligned || ! ok_dealloc_once ) {
#if 0
printf("Kokkos MemoryPool deallocate(0x%lx) contains(%d) block_aligned(%d) dealloc_once(%d)\n",(uintptr_t)p,ok_contains,ok_block_aligned,ok_dealloc_once);
printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, int(ok_contains)
, int(ok_block_aligned)
, int(ok_dealloc_once) );
#endif
Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
}
}
// end deallocate
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
int number_of_superblocks() const noexcept { return m_sb_count ; }
KOKKOS_INLINE_FUNCTION
void superblock_state( int sb_id
, int & block_size
, int & block_count_capacity
, int & block_count_used ) const noexcept
{
block_size = 0 ;
block_count_capacity = 0 ;
block_count_used = 0 ;
if ( Kokkos::Impl::MemorySpaceAccess
< Kokkos::Impl::ActiveExecutionMemorySpace
, base_memory_space >::accessible ) {
// Can access the state array
const uint32_t state =
((uint32_t volatile *)m_sb_state_array)[sb_id*m_sb_state_size];
const uint32_t block_count_lg2 = state >> state_shift ;
const uint32_t block_used = state & state_used_mask ;
block_size = 1LU << ( m_sb_size_lg2 - block_count_lg2 );
block_count_capacity = 1LU << block_count_lg2 ;
block_count_used = block_used ;
}
}
};
} // namespace Kokkos
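A hedged host-side sketch of the pool interface touched above, including the newly added number_of_superblocks() and superblock_state() queries. The constructor argument order (memory space, total size, then optional block/superblock sizes) and the deallocate(pointer, size) form are assumptions, as is using a host-accessible memory space so that allocate() can be called outside a kernel.
// Illustrative only: create a 4 MB host pool, allocate, inspect, release.
void pool_example() {
  using pool_type = Kokkos::MemoryPool< Kokkos::DefaultHostExecutionSpace > ;
  pool_type pool( Kokkos::DefaultHostExecutionSpace::memory_space() , size_t(1) << 22 );
  void * p = pool.allocate( 256 );
  for ( int i = 0 ; i < pool.number_of_superblocks() ; ++i ) {
    int block_size , capacity , used ;
    pool.superblock_state( i , block_size , capacity , used );
    // block_size / capacity / used describe superblock i's current assignment
  }
  pool.deallocate( p , 256 );
}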
View File
@ -97,26 +97,22 @@ typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryR
namespace Kokkos {
namespace Impl {
static_assert(
( 0 < int(KOKKOS_MEMORY_ALIGNMENT) ) &&
( 0 == ( int(KOKKOS_MEMORY_ALIGNMENT) & (int(KOKKOS_MEMORY_ALIGNMENT)-1))) ,
"KOKKOS_MEMORY_ALIGNMENT must be a power of two" );
/** \brief Memory alignment settings
*
* Sets global value for memory alignment. Must be a power of two!
* Enable compatibility of views from different devices with static stride.
* Use compiler flag to enable overwrites.
*/
enum { MEMORY_ALIGNMENT =
#if defined( KOKKOS_MEMORY_ALIGNMENT )
( 1 << Kokkos::Impl::integral_power_of_two( KOKKOS_MEMORY_ALIGNMENT ) )
#else
( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
#endif
#if defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
enum : unsigned
{ MEMORY_ALIGNMENT = KOKKOS_MEMORY_ALIGNMENT
, MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
#else
, MEMORY_ALIGNMENT_THRESHOLD = 4
#endif
};
} //namespace Impl
} // namespace Kokkos
View File
@ -204,8 +204,8 @@ struct VerifyExecutionCanAccessMemorySpace
>
{
enum { value = false };
inline static void verify( void ) { Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Experimental::ROCmSpace::access_error(p); }
inline static void verify( void ) { Kokkos::Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Kokkos::Experimental::ROCmSpace::access_error(p); }
};
} // namespace Experimental
} // namespace Kokkos
View File
@ -619,16 +619,16 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::MDRangePolicy< Traits ... > ,
Kokkos::MDRangePolicy< Traits ... > ,
Kokkos::Serial
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
@ -661,14 +661,14 @@ public:
template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Serial
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -686,7 +686,7 @@ private:
typedef typename Analysis::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType
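Since MDRangePolicy moves out of the Experimental namespace in this commit, a call site now reads as in the hedged sketch below (illustrative only; M and N are placeholders for the iteration extents).
// Hedged sketch: 2-D tiled iteration with the promoted Kokkos::MDRangePolicy.
void mdrange_example( const int M , const int N ) {
  Kokkos::parallel_for(
    Kokkos::MDRangePolicy< Kokkos::Rank<2> >( { 0 , 0 } , { M , N } ) ,
    KOKKOS_LAMBDA( const int i , const int j ) { /* body for element (i,j) */ } );
}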
View File
@ -408,7 +408,7 @@ view_alloc( Args const & ... args )
}
template< class ... Args >
inline
KOKKOS_INLINE_FUNCTION
Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
view_wrap( Args const & ... args )
{
@ -1216,6 +1216,13 @@ public:
m_track.assign_allocated_record_to_uninitialized( record );
}
KOKKOS_INLINE_FUNCTION
void assign_data( pointer_type arg_data )
{
m_track.clear();
m_map.assign_data( arg_data );
}
// Wrap memory according to properties and array layout
template< class ... P >
explicit KOKKOS_INLINE_FUNCTION
@ -2235,6 +2242,29 @@ create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout());
}
// Create a mirror view and deep_copy in a new space (specialization for same space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
(void)name;
return src;
}
// Create a mirror view and deep_copy in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
using Mirror = typename Impl::MirrorViewType<Space,T,P ...>::view_type;
std::string label = name.empty() ? src.label() : name;
auto mirror = Mirror(ViewAllocateWithoutInitializing(label), src.layout());
deep_copy(mirror, src);
return mirror;
}
} /* namespace Kokkos */
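A brief hedged illustration of the create_mirror_view_and_copy overloads added above (the view name is made up): when the requested space matches the source's memory space the source view is returned as-is, otherwise a new view is allocated and deep-copied.
// Illustrative only: obtain host-accessible data for a (possibly device) view.
void mirror_example() {
  Kokkos::View<double*> d_data( "d_data" , 1000 );
  auto h_data = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace() , d_data );
  (void) h_data ;
}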
//----------------------------------------------------------------------------
@ -2432,6 +2462,7 @@ struct CommonViewAllocProp< void, ValueType >
using scalar_array_type = ValueType;
template < class ... Views >
KOKKOS_INLINE_FUNCTION
CommonViewAllocProp( const Views & ... ) {}
};
@ -2499,6 +2530,7 @@ using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp<Views...
// User function
template < class ... Views >
KOKKOS_INLINE_FUNCTION
DeducedCommonPropsType<Views...>
common_view_alloc_prop( Views const & ... views )
{
View File
@ -46,205 +46,198 @@
namespace Kokkos {
namespace Impl {
namespace Experimental {
template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec;
}}} // namespace Kokkos::Impl::Experimental
}} // namespace Kokkos::Impl
namespace Kokkos {
namespace Experimental {
template< class ... Properties >
class WorkGraphPolicy
{
public:
using self_type = WorkGraphPolicy<Properties ... >;
using traits = Kokkos::Impl::PolicyTraits<Properties ... >;
using index_type = typename traits::index_type;
using self_type = WorkGraphPolicy<Properties ... >;
using traits = Kokkos::Impl::PolicyTraits<Properties ... >;
using index_type = typename traits::index_type;
using member_type = index_type;
using work_tag = typename traits::work_tag;
using execution_space = typename traits::execution_space;
using work_tag = typename traits::work_tag;
using memory_space = typename execution_space::memory_space;
using graph_type = Kokkos::Experimental::Crs<index_type, execution_space, void, index_type>;
using member_type = index_type;
using memory_space = typename execution_space::memory_space;
using graph_type = Kokkos::Crs<index_type,execution_space,void,index_type>;
enum : std::int32_t {
END_TOKEN = -1 ,
BEGIN_TOKEN = -2 ,
COMPLETED_TOKEN = -3 };
private:
graph_type m_graph;
using ints_type = Kokkos::View<std::int32_t*, memory_space>;
using range_type = Kokkos::pair<std::int32_t, std::int32_t>;
using ranges_type = Kokkos::View<range_type*, memory_space>;
const std::int32_t m_total_work;
ints_type m_counts;
ints_type m_queue;
ranges_type m_ranges;
// Let N = m_graph.numRows(), the total work
// m_queue[ 0 .. N-1] = the ready queue
// m_queue[ N .. 2*N-1] = the waiting queue counts
// m_queue[2*N .. 2*N+1] = the ready queue hints
graph_type const m_graph;
ints_type m_queue ;
KOKKOS_INLINE_FUNCTION
void push_work( const std::int32_t w ) const noexcept
{
const std::int32_t N = m_graph.numRows();
std::int32_t volatile * const ready_queue = & m_queue[0] ;
std::int32_t volatile * const end_hint = & m_queue[2*N+1] ;
// Push work to end of queue
const std::int32_t j = atomic_fetch_add( end_hint , 1 );
if ( ( N <= j ) ||
( END_TOKEN != atomic_exchange(ready_queue+j,w) ) ) {
// ERROR: past the end of queue or did not replace END_TOKEN
Kokkos::abort("WorkGraphPolicy push_work error");
}
memory_fence();
}
public:
struct TagZeroRanges {};
/**\brief Attempt to pop the work item at the head of the queue.
*
* Find entry 'i' such that
* ( m_queue[i] != BEGIN_TOKEN ) AND
* ( i == 0 OR m_queue[i-1] == BEGIN_TOKEN )
* if found then
* increment begin hint
* return atomic_exchange( m_queue[i] , BEGIN_TOKEN )
* else if i < total work
* return END_TOKEN
* else
* return COMPLETED_TOKEN
*
*/
KOKKOS_INLINE_FUNCTION
void operator()(TagZeroRanges, std::int32_t i) const {
m_ranges[i] = range_type(0, 0);
}
void zero_ranges() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagZeroRanges>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, 1));
closure.execute();
execution_space::fence();
}
std::int32_t pop_work() const noexcept
{
const std::int32_t N = m_graph.numRows();
struct TagFillQueue {};
KOKKOS_INLINE_FUNCTION
void operator()(TagFillQueue, std::int32_t i) const {
if (*((volatile std::int32_t*)(&m_counts(i))) == 0) push_work(i);
}
void fill_queue() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagFillQueue>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, m_total_work));
closure.execute();
execution_space::fence();
}
std::int32_t volatile * const ready_queue = & m_queue[0] ;
std::int32_t volatile * const begin_hint = & m_queue[2*N] ;
private:
// begin hint is guaranteed to be less than or equal to
// actual begin location in the queue.
inline
void setup() {
if (m_graph.numRows() > std::numeric_limits<std::int32_t>::max()) {
Kokkos::abort("WorkGraphPolicy work must be indexable using int32_t");
}
get_crs_transpose_counts(m_counts, m_graph);
m_queue = ints_type(ViewAllocateWithoutInitializing("queue"), m_total_work);
deep_copy(m_queue, std::int32_t(-1));
m_ranges = ranges_type("ranges", 1);
fill_queue();
}
for ( std::int32_t i = *begin_hint ; i < N ; ++i ) {
KOKKOS_INLINE_FUNCTION
std::int32_t pop_work() const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first + 1 , w.second );
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
if ( w.first < w.second ) { // there was work in the queue
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
// we got a work item
std::int32_t i;
// the push_work function may have incremented the end counter
// but not yet written the work index into the queue.
// wait until the entry is valid.
while ( -1 == ( i = *((volatile std::int32_t*)(&m_queue( w.first ))) ) );
return i;
} // we got a work item
} else { // there was no work in the queue
#ifdef KOKKOS_DEBUG
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
Kokkos::abort("bug in pop_work");
const std::int32_t w = ready_queue[i] ;
if ( w == END_TOKEN ) { return END_TOKEN ; }
if ( ( w != BEGIN_TOKEN ) &&
( w == atomic_compare_exchange(ready_queue+i,w,BEGIN_TOKEN) ) ) {
// Attempt to claim ready work index succeeded,
// update the hint and return work index
atomic_increment( begin_hint );
return w ;
}
#endif
if (w.first == m_total_work) { // all work is done
return -1;
} else { // need to wait for more work to be pushed
// take a guess that one work item will be pushed
// the key thing is we can't leave (w) alone, because
// otherwise the next compare_exchange may succeed in
// popping work from an empty queue
w.second++;
}
} // there was no work in the queue
} // while (true)
}
// arrive here when ready_queue[i] == BEGIN_TOKEN
}
return COMPLETED_TOKEN ;
}
KOKKOS_INLINE_FUNCTION
void push_work(std::int32_t i) const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first , w.second + 1 );
// try to increment the end counter
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
// stop trying if the increment was successful
if ( w.first == w_new.first && w.second + 1 == w_new.second ) break;
void completed_work( std::int32_t w ) const noexcept
{
Kokkos::memory_fence();
// Make sure the completed work function's memory accesses are flushed.
const std::int32_t N = m_graph.numRows();
std::int32_t volatile * const count_queue = & m_queue[N] ;
const std::int32_t B = m_graph.row_map(w);
const std::int32_t E = m_graph.row_map(w+1);
for ( std::int32_t i = B ; i < E ; ++i ) {
const std::int32_t j = m_graph.entries(i);
if ( 1 == atomic_fetch_add(count_queue+j,-1) ) {
push_work(j);
}
}
}
// write the work index into the claimed spot in the queue
*((volatile std::int32_t*)(&m_queue( w.second ))) = i;
// push this write out into the memory system
memory_fence();
}
template< class functor_type , class execution_space, class ... policy_args >
friend class Kokkos::Impl::Experimental::WorkGraphExec;
struct TagInit {};
struct TagCount {};
struct TagReady {};
public:
/**\brief Initialize queue
*
* m_queue[0..N-1] = END_TOKEN, the ready queue
* m_queue[N..2*N-1] = 0, the waiting count queue
* m_queue[2*N..2*N+1] = 0, begin/end hints for ready queue
*/
KOKKOS_INLINE_FUNCTION
void operator()( const TagInit , int i ) const noexcept
{ m_queue[i] = i < m_graph.numRows() ? END_TOKEN : 0 ; }
WorkGraphPolicy(graph_type arg_graph)
KOKKOS_INLINE_FUNCTION
void operator()( const TagCount , int i ) const noexcept
{
std::int32_t volatile * const count_queue =
& m_queue[ m_graph.numRows() ] ;
atomic_increment( count_queue + m_graph.entries[i] );
}
KOKKOS_INLINE_FUNCTION
void operator()( const TagReady , int w ) const noexcept
{
std::int32_t const * const count_queue =
& m_queue[ m_graph.numRows() ] ;
if ( 0 == count_queue[w] ) push_work(w);
}
WorkGraphPolicy( const graph_type & arg_graph )
: m_graph(arg_graph)
, m_total_work( arg_graph.numRows() )
, m_queue( view_alloc( "queue" , WithoutInitializing )
, arg_graph.numRows() * 2 + 2 )
{
setup();
}
{ // Initialize
using policy_type = RangePolicy<std::int32_t, execution_space, TagInit>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, m_queue.size()));
closure.execute();
execution_space::fence();
}
};
{ // execute-after counts
using policy_type = RangePolicy<std::int32_t, execution_space, TagCount>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this,policy_type(0,m_graph.entries.size()));
closure.execute();
execution_space::fence();
}
}} // namespace Kokkos::Experimental
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace Experimental {
template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec
{
public:
using self_type = WorkGraphExec< functor_type, execution_space, policy_args ... >;
using policy_type = Kokkos::Experimental::WorkGraphPolicy< policy_args ... >;
using member_type = typename policy_type::member_type;
using memory_space = typename execution_space::memory_space;
protected:
const functor_type m_functor;
const policy_type m_policy;
protected:
KOKKOS_INLINE_FUNCTION
std::int32_t before_work() const {
return m_policy.pop_work();
}
KOKKOS_INLINE_FUNCTION
void after_work(std::int32_t i) const {
/* fence any writes that were done by the work item itself
(usually writing its result to global memory) */
memory_fence();
const std::int32_t begin = m_policy.m_graph.row_map( i );
const std::int32_t end = m_policy.m_graph.row_map( i + 1 );
for (std::int32_t j = begin; j < end; ++j) {
const std::int32_t next = m_policy.m_graph.entries( j );
const std::int32_t old_count = atomic_fetch_add( &(m_policy.m_counts(next)), -1 );
if ( old_count == 1 ) m_policy.push_work( next );
{ // Scheduling ready tasks
using policy_type = RangePolicy<std::int32_t, execution_space, TagReady>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this,policy_type(0,m_graph.numRows()));
closure.execute();
execution_space::fence();
}
}
inline
WorkGraphExec( const functor_type & arg_functor
, const policy_type & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{
}
};
}}} // namespace Kokkos::Impl::Experimental
} // namespace Kokkos
#ifdef KOKKOS_ENABLE_SERIAL
#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp"
View File
@ -294,7 +294,7 @@ void OpenMP::initialize( int thread_count )
}
{
if (nullptr == std::getenv("OMP_PROC_BIND") ) {
if ( Kokkos::show_warnings() && nullptr == std::getenv("OMP_PROC_BIND") ) {
printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n");
printf(" In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n");
printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n");
@ -327,7 +327,7 @@ void OpenMP::initialize( int thread_count )
omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
}
else {
if( thread_count > process_num_threads ) {
if( Kokkos::show_warnings() && thread_count > process_num_threads ) {
printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n");
printf( " process threads available : %3d, requested thread : %3d\n", process_num_threads, thread_count );
}
@ -364,12 +364,12 @@ void OpenMP::initialize( int thread_count )
// Check for over-subscription
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
// std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
// std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
// std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
// std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
//}
if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
}
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
View File
@ -170,20 +170,20 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, Kokkos::OpenMP
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
OpenMPExec * m_instance ;
const FunctorType m_functor ;
@ -445,14 +445,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ...>
, Kokkos::MDRangePolicy< Traits ...>
, ReducerType
, Kokkos::OpenMP
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -472,7 +472,7 @@ private:
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType
View File
@ -49,33 +49,26 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::WorkGraphPolicy< Traits ... > ,
Kokkos::OpenMP
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::OpenMP,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::OpenMP, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{} ; m_functor( t , w ); }
public:
@ -86,9 +79,15 @@ public:
#pragma omp parallel num_threads(pool_size)
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
}
@ -96,12 +95,13 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */
View File
@ -125,7 +125,7 @@ namespace Kokkos {
oldval.t = *dest ;
assume.i = oldval.i ;
newval.t = val ;
atomic_compare_exchange( reinterpret_cast<int*>(dest) , assume.i, newval.i );
atomic_compare_exchange( (int*)(dest) , assume.i, newval.i );
return oldval.t ;
}
View File
@ -608,6 +608,7 @@ ROCmInternal::scratch_space( const Kokkos::Experimental::ROCm::size_type size )
void ROCmInternal::finalize()
{
Kokkos::Impl::rocm_device_synchronize();
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
View File
@ -277,7 +277,7 @@ public:
this->team_barrier();
value = local_value;
}
// Reduce accross a team of threads.
// Reduce across a team of threads.
//
// Each thread has vector_length elements.
// This reduction is for TeamThreadRange operations, where the range
@ -354,6 +354,80 @@ public:
return buffer[0];
}
// Reduce across a team of threads, with a reducer data type
//
// Each thread has vector_length elements.
// This reduction is for TeamThreadRange operations, where the range
// is spread across threads. Effectively, there are vector_length
// independent reduction operations.
// This is different from a reduction across the elements of a thread,
// which reduces every vector element.
template< class ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< is_reducer< ReducerType >::value >::type
team_reduce( const ReducerType & reducer) const
{
typedef typename ReducerType::value_type value_type ;
tile_static value_type buffer[512];
const auto local = lindex();
const auto team = team_rank();
auto vector_rank = local%m_vector_length;
auto thread_base = team*m_vector_length;
const std::size_t size = next_pow_2(m_team_size+1)/2;
#if defined(ROCM15)
buffer[local] = reducer.reference();
#else
// ROCM 1.5 handles address spaces better, previous version didn't
lds_for(buffer[local], [&](value_type& x)
{
x = reducer.reference();
});
#endif
m_idx.barrier.wait();
for(std::size_t s = 1; s < size; s *= 2)
{
const std::size_t index = 2 * s * team;
if (index < size)
{
#if defined(ROCM15)
reducer.join(buffer[vector_rank+index*m_vector_length],
buffer[vector_rank+(index+s)*m_vector_length]);
#else
lds_for(buffer[vector_rank+index*m_vector_length], [&](value_type& x)
{
lds_for(buffer[vector_rank+(index+s)*m_vector_length],
[&](value_type& y)
{
reducer.join(x, y);
});
});
#endif
}
m_idx.barrier.wait();
}
if (local == 0)
{
for(int i=size*m_vector_length; i<m_team_size*m_vector_length; i+=m_vector_length)
#if defined(ROCM15)
reducer.join(buffer[vector_rank], buffer[vector_rank+i]);
#else
lds_for(buffer[vector_rank], [&](value_type& x)
{
lds_for(buffer[vector_rank+i],
[&](value_type& y)
{
reducer.join(x, y);
});
});
#endif
}
m_idx.barrier.wait();
}
/** \brief Intra-team vector reduce
* with intra-team non-deterministic ordering accumulation.
@ -406,6 +480,33 @@ public:
return buffer[thread_base];
}
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< is_reducer< ReducerType >::value >::type
vector_reduce( ReducerType const & reducer )
{
#ifdef __HCC_ACCELERATOR__
if(blockDim_x == 1) return;
// Intra vector lane shuffle reduction:
typename ReducerType::value_type tmp ( reducer.reference() );
for ( int i = blockDim_x ; ( i >>= 1 ) ; ) {
shfl_down( reducer.reference() , i , blockDim_x );
if ( (int)threadIdx_x < i ) { reducer.join( tmp , reducer.reference() ); }
}
// Broadcast from root lane to all other lanes.
// Cannot use "butterfly" algorithm to avoid the broadcast
// because floating point summation is not associative
// and thus different threads could have different results.
shfl( reducer.reference() , 0 , blockDim_x );
#endif
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
@ -1075,6 +1176,22 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROC
// Impl::JoinAdd<ValueType>());
}
/** \brief Inter-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROCmTeamMember>& loop_boundaries,
const Lambda & lambda, ReducerType const & reducer) {
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,reducer.reference());
}
loop_boundaries.thread.team_reduce(reducer);
}
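A hedged call-site sketch for the reducer-based team reduction above (the team-member type, extent N, and the per-element value are placeholders, and Kokkos::Max is just one example of a reducer; this would be called from inside a team-parallel kernel).
// Illustrative only: maximum over a TeamThreadRange, via a reducer.
template< class TeamMember >
KOKKOS_INLINE_FUNCTION
double team_max_example( const TeamMember & team_member , const int N ) {
  double team_max = 0 ;
  Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team_member , N ) ,
    [&] ( const int i , double & lmax ) { const double v = double(i) /* stand-in for f(i) */ ; if ( lmax < v ) lmax = v ; } ,
    Kokkos::Max<double>( team_max ) ) ;
  return team_max ;
}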
/** \brief Intra-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
@ -1161,6 +1278,41 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::R
result = loop_boundaries.thread.thread_reduce(result,join);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
loop_boundaries, const Lambda & lambda, ReducerType const & reducer) {
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,reducer.reference());
}
loop_boundaries.thread.vector_reduce(reducer);
}
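And the analogous vector-level form (again a hedged sketch with placeholder names), using the ThreadVectorRange reducer overload added above.
// Illustrative only: sum over the vector lanes of the calling thread.
template< class TeamMember >
KOKKOS_INLINE_FUNCTION
double lane_sum_example( const TeamMember & team_member , const int V ) {
  double lane_sum = 0 ;
  Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team_member , V ) ,
    [&] ( const int i , double & lsum ) { lsum += double(i) /* stand-in for f(i) */ ; } ,
    Kokkos::Sum<double>( lane_sum ) ) ;
  return lane_sum ;
}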
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ReducerType const & reducer) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,reducer.reference());
loop_boundaries.thread.team_barrier();
}
reducer.reference() = loop_boundaries.thread.thread_reduce(reducer.reference(),join);
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
View File
@ -266,7 +266,7 @@ void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
for ( int i = 0 ; i < n ; ++i ) {
Impl::spinwait_while_equal( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
exec.m_pool_state = ThreadsExec::Inactive ;
@ -310,7 +310,7 @@ void ThreadsExec::fence()
{
if ( s_thread_pool_size[0] ) {
// Wait for the root thread to complete:
Impl::spinwait_while_equal( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
}
s_current_function = 0 ;
@ -716,12 +716,12 @@ void ThreadsExec::initialize( unsigned thread_count ,
}
// Check for over-subscription
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
// std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
// std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
// std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
// std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
//}
if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
}
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_host_space();

View File

@ -50,6 +50,7 @@
#include <cstdio>
#include <utility>
#include <cstdalign>
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
@ -107,7 +108,7 @@ private:
// Which thread am I stealing from currently
int m_current_steal_target;
// This thread's owned work_range
Kokkos::pair<long,long> m_work_range KOKKOS_ALIGN(16);
Kokkos::pair<long,long> m_work_range __attribute__((aligned(16))) ;
// Team Offset if one thread determines work_range for others
long m_team_work_index;
@ -191,13 +192,13 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
@ -233,13 +234,13 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
@ -268,7 +269,7 @@ public:
ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , reduce_memory() , fan.reduce_memory() );
}
@ -295,7 +296,7 @@ public:
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait_while_equal( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
}
}
@ -327,7 +328,7 @@ public:
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: Active -> ReductionAvailable (or ScanAvailable)
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , work_value , fan.reduce_memory() );
}
@ -345,8 +346,8 @@ public:
// Wait: Active -> ReductionAvailable
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::ReductionAvailable );
Impl::spinwait_while_equal<int>( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( th.m_pool_state , ThreadsExec::ReductionAvailable );
Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
}
@ -357,7 +358,7 @@ public:
// Wait for all threads to complete inclusive scan
// Wait: ScanAvailable -> Rendezvous
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanAvailable );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::ScanAvailable );
}
//--------------------------------
@ -365,7 +366,7 @@ public:
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::ReductionAvailable );
Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::ReductionAvailable );
// Set: ScanAvailable -> Rendezvous
fan.m_pool_state = ThreadsExec::Rendezvous ;
}
@ -392,13 +393,13 @@ public:
// Wait for all threads to copy previous thread's inclusive scan value
// Wait for all threads: Rendezvous -> ScanCompleted
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
}
if ( rev_rank ) {
// Set: ScanAvailable -> ScanCompleted
m_pool_state = ThreadsExec::ScanCompleted ;
// Wait: ScanCompleted -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanCompleted );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::ScanCompleted );
}
// Set: ScanCompleted -> Active
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
@ -425,7 +426,7 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
@ -433,7 +434,7 @@ public:
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the thread-scan before releasing threads

View File

@ -107,13 +107,13 @@ public:
// Wait for fan-in threads
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
Impl::spinwait_while_equal( m_team_base[j]->state() , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_team_base[j]->state() , ThreadsExec::Active );
}
// If not root then wait for release
if ( m_team_rank_rev ) {
m_exec->state() = ThreadsExec::Rendezvous ;
Impl::spinwait_while_equal( m_exec->state() , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_exec->state() , ThreadsExec::Rendezvous );
}
return ! m_team_rank_rev ;

View File

@ -180,12 +180,12 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, Kokkos::Threads
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -193,7 +193,7 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
@ -548,14 +548,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Threads
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -573,7 +573,7 @@ private:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType

View File

@ -49,48 +49,50 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::WorkGraphPolicy< Traits ... > ,
Kokkos::Threads
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Threads,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Threads, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
typedef ParallelFor<FunctorType,
Kokkos::Experimental::WorkGraphPolicy<Traits ...>,
Kokkos::WorkGraphPolicy<Traits ...>,
Kokkos::Threads> Self ;
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{}; m_functor( t , w ); }
inline void exec_one_thread() const {
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
inline void exec_one_thread() const noexcept
{
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
}
static inline void thread_main( ThreadsExec&, const void* arg ) {
const Self& self = *(static_cast<const Self*>(arg));
self.exec_one_thread();
}
static inline void thread_main( ThreadsExec&, const void* arg ) noexcept
{
const Self& self = *(static_cast<const Self*>(arg));
self.exec_one_thread();
}
public:
@ -104,9 +106,9 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl

View File

@ -59,7 +59,7 @@
#include <algorithm>
#include <cstdio>
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {
// Temporary, for testing new loop macros
#define KOKKOS_ENABLE_NEW_LOOP_MACROS 1
@ -1274,7 +1274,7 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i
template <typename T>
using is_void = std::is_same< T , void >;
using is_void_type = std::is_same< T , void >;
template <typename T>
struct is_type_array : std::false_type
@ -1303,7 +1303,7 @@ template < typename RP
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void<ValueType >::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void_type<ValueType >::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -1781,7 +1781,7 @@ template < typename RP
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && !is_type_array<ValueType>::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type<ValueType >::value && !is_type_array<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -2268,7 +2268,7 @@ template < typename RP
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && is_type_array<ValueType>::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type<ValueType >::value && is_type_array<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -2750,6 +2750,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
// Cuda uses DeviceIterateTile directly within md_parallel_for
// TODO Once md_parallel_{for,reduce} removed, this can be removed
namespace Experimental {
// ParallelReduce - scalar reductions
template < typename MDRange, typename Functor, typename ValueType = void >
struct MDFunctor
@ -2759,11 +2761,11 @@ struct MDFunctor
using value_type = ValueType;
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
>;
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
>;
inline
@ -2804,11 +2806,11 @@ struct MDFunctor< MDRange, Functor, ValueType[] >
using value_type = ValueType[];
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
>;
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
>;
inline
@ -2852,11 +2854,11 @@ struct MDFunctor< MDRange, Functor, void >
using functor_type = Functor;
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, void
>;
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, void
>;
inline
@ -2887,8 +2889,9 @@ struct MDFunctor< MDRange, Functor, void >
Functor m_func;
};
} // end namespace Experimental
#undef KOKKOS_ENABLE_NEW_LOOP_MACROS
} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl
#endif

View File

@ -51,9 +51,12 @@
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
bool g_is_initialized = false;
bool g_show_warnings = true;
}
namespace Kokkos { namespace Impl { namespace {
bool is_unsigned_int(const char* str)
{
@ -75,6 +78,10 @@ void initialize_internal(const InitArguments& args)
setenv("MEMKIND_HBW_NODES", "1", 0);
#endif
if (args.disable_warnings) {
g_show_warnings = false;
}
// Protect declarations, to prevent "unused variable" warnings.
#if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMPTARGET )
const int num_threads = args.num_threads;
@ -177,6 +184,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
g_is_initialized = true;
}
void finalize_internal( const bool all_spaces = false )
@ -233,6 +241,9 @@ void finalize_internal( const bool all_spaces = false )
Kokkos::Serial::finalize();
}
#endif
g_is_initialized = false;
g_show_warnings = true;
}
void fence_internal()
@ -306,9 +317,7 @@ bool check_int_arg(char const* arg, char const* expected, int* value) {
return true;
}
} // namespace
} // namespace Impl
} // namespace Kokkos
}}} // namespace Kokkos::Impl::{unnamed}
//----------------------------------------------------------------------------
@ -319,6 +328,7 @@ void initialize(int& narg, char* arg[])
int num_threads = -1;
int numa = -1;
int device = -1;
bool disable_warnings = false;
int kokkos_threads_found = 0;
int kokkos_numa_found = 0;
@ -415,6 +425,12 @@ void initialize(int& narg, char* arg[])
} else {
iarg++;
}
} else if ( strcmp(arg[iarg],"--kokkos-disable-warnings") == 0) {
disable_warnings = true;
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
narg--;
} else if ((strcmp(arg[iarg],"--kokkos-help") == 0) || (strcmp(arg[iarg],"--help") == 0)) {
std::cout << std::endl;
std::cout << "--------------------------------------------------------------------------------" << std::endl;
@ -427,6 +443,7 @@ void initialize(int& narg, char* arg[])
std::cout << "settings." << std::endl;
std::cout << std::endl;
std::cout << "--kokkos-help : print this message" << std::endl;
std::cout << "--kokkos-disable-warnings : disable kokkos warning messages" << std::endl;
std::cout << "--kokkos-threads=INT : specify total number of threads or" << std::endl;
std::cout << " number of threads per NUMA region if " << std::endl;
std::cout << " used in conjunction with '--numa' option. " << std::endl;
@ -457,7 +474,7 @@ void initialize(int& narg, char* arg[])
iarg++;
}
InitArguments arguments{num_threads, numa, device};
InitArguments arguments{num_threads, numa, device, disable_warnings};
Impl::initialize_internal(arguments);
}
@ -787,5 +804,9 @@ void print_configuration( std::ostream & out , const bool detail )
out << msg.str() << std::endl;
}
bool is_initialized() noexcept { return g_is_initialized; }
bool show_warnings() noexcept { return g_show_warnings; }
} // namespace Kokkos
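The new switch is stripped from the argument list before the remaining options are parsed, and the resulting state is queried through show_warnings(). A minimal sketch (the binary name is hypothetical):
// Launch example (hypothetical binary): ./my_app --kokkos-threads=16 --kokkos-disable-warnings
// Backend code then gates its diagnostics on the stored flag:
if ( Kokkos::show_warnings() ) {
  std::cerr << "WARNING: backend-specific diagnostic" << std::endl ;
}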

View File

@ -476,7 +476,7 @@ template< class FunctorType , class ArgTag , class T , class Enable >
struct FunctorValueInit< FunctorType , ArgTag , T & , Enable >
{
KOKKOS_FORCEINLINE_FUNCTION static
T & init( const FunctorType & f , void * p )
T & init( const FunctorType & , void * p )
{ return *( new(p) T() ); };
};

View File

@ -254,7 +254,12 @@ void * HostSpace::allocate( const size_t arg_alloc_size ) const
}
void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
void HostSpace::deallocate( void * const arg_alloc_ptr
, const size_t
#if defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
arg_alloc_size
#endif
) const
{
if ( arg_alloc_ptr ) {
@ -409,7 +414,7 @@ SharedAllocationRecord< Kokkos::HostSpace , void >::get_record( void * alloc_ptr
// Iterate records to print orphaned memory ...
void SharedAllocationRecord< Kokkos::HostSpace , void >::
print_records( std::ostream & s , const Kokkos::HostSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::HostSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HostSpace" , & s_root_record , detail );
}

View File

@ -44,6 +44,9 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE_HPP )
#define KOKKOS_MEMORY_FENCE_HPP
#include <atomic>
namespace Kokkos {
//----------------------------------------------------------------------------
@ -53,23 +56,8 @@ void memory_fence()
{
#if defined( __CUDA_ARCH__ )
__threadfence();
#elif defined( KOKKOS_ENABLE_ROCM_ATOMICS )
amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"mfence" ::: "memory"
);
#elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \
( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) )
__sync_synchronize();
#elif defined( KOKKOS_ENABLE_INTEL_ATOMICS )
_mm_mfence();
#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS )
#pragma omp flush
#elif defined( KOKKOS_ENABLE_WINDOWS_ATOMICS )
MemoryBarrier();
#else
#error "Error: memory_fence() not defined"
std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}
@ -81,12 +69,10 @@ void memory_fence()
KOKKOS_FORCEINLINE_FUNCTION
void store_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"sfence" ::: "memory"
);
#if defined( __CUDA_ARCH__ )
__threadfence();
#else
memory_fence();
std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}
@ -98,12 +84,10 @@ void store_fence()
KOKKOS_FORCEINLINE_FUNCTION
void load_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"lfence" ::: "memory"
);
#if defined( __CUDA_ARCH__ )
__threadfence();
#else
memory_fence();
std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}
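With the fences now lowered to std::atomic_thread_fence on host (and __threadfence on device), a typical producer-side handshake looks like the sketch below; data and flag stand for variables shared with another thread and compute_value() is hypothetical:
data = compute_value();   // 1. publish the payload
Kokkos::memory_fence();   // 2. order the store to 'data' before the flag update
flag = 1 ;                // 3. a consumer that observes flag == 1 also observes 'data'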

View File

@ -49,51 +49,50 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::WorkGraphPolicy< Traits ... > ,
Kokkos::Serial
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Serial,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Serial, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{}; m_functor( t , w ); }
public:
inline
void execute()
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
void execute() const noexcept
{
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl

View File

@ -306,7 +306,7 @@ print_host_accessible_records( std::ostream & s
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, r->m_alloc_ptr->m_label
);
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != root );
}
@ -334,7 +334,7 @@ print_host_accessible_records( std::ostream & s
else {
snprintf( buffer , 256 , "%s [ 0 + 0 ]\n" , space_name );
}
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != root );
}

View File

@ -294,9 +294,13 @@ public:
template< class MemorySpace >
constexpr
SharedAllocationRecord< MemorySpace , void > &
get_record() const
{ return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); }
SharedAllocationRecord< MemorySpace , void > *
get_record() const noexcept
{
return ( m_record_bits & DO_NOT_DEREF_FLAG )
? (SharedAllocationRecord< MemorySpace,void>*) 0
: static_cast<SharedAllocationRecord<MemorySpace,void>*>(m_record);
}
template< class MemorySpace >
std::string get_label() const
@ -323,6 +327,16 @@ public:
return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0;
}
KOKKOS_FORCEINLINE_FUNCTION
void clear()
{
// If this is tracking then must decrement
KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT
// Reset to default constructed value.
m_record_bits = DO_NOT_DEREF_FLAG ;
}
// Copy:
KOKKOS_FORCEINLINE_FUNCTION
~SharedAllocationTracker()
{ KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT }

View File

@ -48,7 +48,7 @@
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_BitOps.hpp>
#if defined( KOKKOS_ENABLE_STDTHREAD )
#if defined( KOKKOS_ENABLE_STDTHREAD) || defined( _WIN32 )
#include <thread>
#elif !defined( _WIN32 )
#include <sched.h>
@ -63,9 +63,8 @@
namespace Kokkos {
namespace Impl {
namespace {
void host_thread_yield( const uint32_t i , const int force_yield )
void host_thread_yield( const uint32_t i , const WaitMode mode )
{
static constexpr uint32_t sleep_limit = 1 << 13 ;
static constexpr uint32_t yield_limit = 1 << 12 ;
@ -76,28 +75,26 @@ void host_thread_yield( const uint32_t i , const int force_yield )
// Attempt to put the thread to sleep for 'c' milliseconds
#if defined( KOKKOS_ENABLE_STDTHREAD )
std::this_thread::sleep_for( std::chrono::nanoseconds( c * 1000 ) )
#elif !defined( _WIN32 )
#if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
auto start = std::chrono::high_resolution_clock::now();
std::this_thread::yield();
std::this_thread::sleep_until( start + std::chrono::nanoseconds( c * 1000 ) );
#else
timespec req ;
req.tv_sec = 0 ;
req.tv_nsec = 1000 * c ;
nanosleep( &req, nullptr );
#else /* defined( _WIN32 ) IS Microsoft Windows */
Sleep(c);
#endif
}
else if ( force_yield || yield_limit < i ) {
else if ( mode == WaitMode::PASSIVE || yield_limit < i ) {
// Attempt to yield thread resources to runtime
#if defined( KOKKOS_ENABLE_STDTHREAD )
#if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
std::this_thread::yield();
#elif !defined( _WIN32 )
#else
sched_yield();
#else /* defined( _WIN32 ) IS Microsoft Windows */
YieldProcessor();
#endif
}
@ -110,9 +107,9 @@ void host_thread_yield( const uint32_t i , const int force_yield )
for ( int k = 0 ; k < c ; ++k ) {
#if defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ )
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
asm volatile( "nop\n" );
#else
#else
__asm__ __volatile__( "nop\n" );
#endif
#elif defined(__PPC64__)
@ -123,86 +120,22 @@ void host_thread_yield( const uint32_t i , const int force_yield )
{
// Insert memory pause
#if defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ )
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
#if defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ )
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
asm volatile( "pause\n":::"memory" );
#else
#else
__asm__ __volatile__( "pause\n":::"memory" );
#endif
#elif defined(__PPC64__)
asm volatile( "or 27, 27, 27" ::: "memory" );
asm volatile( "or 27, 27, 27" ::: "memory" );
#endif
}
#endif /* defined( KOKKOS_ENABLE_ASM ) */
}
}}} // namespace Kokkos::Impl::{anonymous}
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void yield_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
} /* namespace Impl */
} /* namespace Kokkos */
}} // namespace Kokkos::Impl
#else
void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {}

View File

@ -46,47 +46,95 @@
#define KOKKOS_SPINWAIT_HPP
#include <Kokkos_Macros.hpp>
#include <Kokkos_Atomic.hpp>
#include <cstdint>
#include <type_traits>
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void spinwait_while_equal( volatile int32_t & flag , const int32_t value );
void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
enum class WaitMode : int {
ACTIVE // Used for tight loops to keep threads active longest
, PASSIVE // Used to quickly yield the thread to quiet down the system
};
void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
void yield_while_equal( volatile int32_t & flag , const int32_t value );
void yield_until_equal( volatile int32_t & flag , const int32_t value );
void host_thread_yield( const uint32_t i , const WaitMode mode );
void yield_while_equal( volatile int64_t & flag , const int64_t value );
void yield_until_equal( volatile int64_t & flag , const int64_t value );
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_while_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value == flag ) {
host_thread_yield(++i, WaitMode::ACTIVE);
}
Kokkos::load_fence();
}
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_while_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value == flag ) {
host_thread_yield(++i, WaitMode::PASSIVE);
}
Kokkos::load_fence();
}
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_until_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value != flag ) {
host_thread_yield(++i, WaitMode::ACTIVE);
}
Kokkos::load_fence();
}
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_until_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value != flag ) {
host_thread_yield(++i, WaitMode::PASSIVE);
}
Kokkos::load_fence();
}
#else
template <typename T>
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int32_t & , const int32_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_while_equal( T const volatile & flag, const T value ) {}
template <typename T>
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_while_equal( T const volatile & flag, const T value ) {}
template <typename T>
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int32_t & , const int32_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_until_equal( T const volatile & flag, const T value ) {}
template <typename T>
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int64_t & , const int64_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_until_equal( T const volatile & flag, const T value ) {}
#endif
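A host-side wait on a shared state flag then reads as follows. This is a sketch only; m_pool_state stands for a flag that another thread will eventually change, as in the ThreadsExec code earlier in this commit:
// Stay hot (ACTIVE mode) while the peer still reports Active ...
Kokkos::Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Active );
// ... or yield the core (PASSIVE mode) until a specific state is reached.
Kokkos::Impl::yield_until_equal<int>( m_pool_state , ThreadsExec::ScanCompleted );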

View File

@ -111,7 +111,9 @@ struct ViewCtorProp< void , CommonViewAllocProp<Specialize,T> >
using type = CommonViewAllocProp<Specialize,T> ;
KOKKOS_INLINE_FUNCTION
ViewCtorProp( const type & arg ) : value( arg ) {}
KOKKOS_INLINE_FUNCTION
ViewCtorProp( type && arg ) : value( arg ) {}
type value ;
@ -128,6 +130,7 @@ struct ViewCtorProp< void , std::integral_constant<unsigned,I> >
ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
template< typename P >
KOKKOS_INLINE_FUNCTION
ViewCtorProp( const P & ) {}
};

View File

@ -369,9 +369,9 @@ private:
template< size_t ... DimArgs >
KOKKOS_FORCEINLINE_FUNCTION
bool set( unsigned domain_rank
, unsigned range_rank
, const ViewDimension< DimArgs ... > & dim )
bool set( unsigned
, unsigned
, const ViewDimension< DimArgs ... > & )
{ return true ; }
template< class T , size_t ... DimArgs , class ... Args >
@ -1047,7 +1047,7 @@ struct ViewOffset< Dimension , Kokkos::LayoutLeft
template< class DimRHS >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset(
const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs ,
const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & ,
const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub )
: m_dim( sub.range_extent(0), 0, 0, 0, 0, 0, 0, 0 )
{
@ -1252,7 +1252,7 @@ public:
template< unsigned TrivialScalarSize >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset
( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
( std::integral_constant<unsigned,TrivialScalarSize> const &
, Kokkos::LayoutLeft const & arg_layout
)
: m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
@ -1741,7 +1741,7 @@ public:
template< unsigned TrivialScalarSize >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset
( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
( std::integral_constant<unsigned,TrivialScalarSize> const &
, Kokkos::LayoutRight const & arg_layout
)
: m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
@ -2368,7 +2368,7 @@ struct ViewDataHandle< Traits ,
)>::type >
{
typedef typename Traits::value_type value_type ;
typedef typename Traits::value_type * KOKKOS_ALIGN_PTR(KOKKOS_ALIGN_SIZE) handle_type ;
typedef typename Traits::value_type * KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ;
typedef typename Traits::value_type & return_type ;
typedef Kokkos::Impl::SharedAllocationTracker track_type ;
@ -2376,7 +2376,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( value_type * arg_data_ptr
, track_type const & /*arg_tracker*/ )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr );
@ -2386,7 +2386,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( handle_type const arg_data_ptr
, size_t offset )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr + offset );
@ -2411,7 +2411,7 @@ struct ViewDataHandle< Traits ,
)>::type >
{
typedef typename Traits::value_type value_type ;
typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_ALIGN_PTR(KOKKOS_ALIGN_SIZE) handle_type ;
typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ;
typedef typename Traits::value_type & return_type ;
typedef Kokkos::Impl::SharedAllocationTracker track_type ;
@ -2419,7 +2419,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( value_type * arg_data_ptr
, track_type const & /*arg_tracker*/ )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr );
@ -2429,7 +2429,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( handle_type const arg_data_ptr
, size_t offset )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr + offset );
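These handles back views declared with the Aligned memory trait; the runtime check aborts when a pointer that is not aligned to KOKKOS_MEMORY_ALIGNMENT is assigned. Roughly (a sketch; n is an assumed extent and the assignment path is the one named in the abort message):
typedef Kokkos::View< double* , Kokkos::MemoryTraits< Kokkos::Aligned > > aligned_view ;
Kokkos::View< double* > plain( "plain" , n );
aligned_view a = plain ;   // aborts at runtime if plain.data() is not suitably aligned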
@ -2783,6 +2783,11 @@ public:
, m_offset( std::integral_constant< unsigned , 0 >() , arg_layout )
{}
/**\brief Assign data */
KOKKOS_INLINE_FUNCTION
void assign_data( pointer_type arg_ptr )
{ m_handle = handle_type( arg_ptr ); }
//----------------------------------------
/* Allocate and construct mapped array.
* Allocate via shared allocation record and

View File

@ -48,6 +48,7 @@
#include <algorithm>
#include <Kokkos_Macros.hpp>
#include <Kokkos_Core.hpp>
#include <Kokkos_hwloc.hpp>
#include <impl/Kokkos_Error.hpp>
@ -312,14 +313,18 @@ Sentinel::Sentinel()
hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS );
if ( hwloc_bitmap_iszero( s_process_binding ) ) {
std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units" << std::endl;
if (Kokkos::show_warnings() ) {
std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units" << std::endl;
}
const int pu_depth = hwloc_get_type_depth( s_hwloc_topology, HWLOC_OBJ_PU );
int num_pu = 1;
if ( pu_depth != HWLOC_TYPE_DEPTH_UNKNOWN ) {
num_pu = hwloc_get_nbobjs_by_depth( s_hwloc_topology, pu_depth );
}
else {
std::cerr << "WARNING: Cannot detect number of processing units -- ASSUMING 1 (serial)." << std::endl;
if (Kokkos::show_warnings() ) {
std::cerr << "WARNING: Cannot detect number of processing units -- ASSUMING 1 (serial)." << std::endl;
}
num_pu = 1;
}
hwloc_bitmap_set_range( s_process_binding, 0, num_pu-1);
@ -349,7 +354,7 @@ Sentinel::Sentinel()
hwloc_bitmap_free( s_process_no_core_zero );
if ( ! ok ) {
if ( Kokkos::show_warnings() && ! ok ) {
std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ;
}
}
@ -503,8 +508,8 @@ Sentinel::Sentinel()
hwloc_bitmap_free( proc_cpuset_location );
if ( ! symmetric ) {
std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
if ( Kokkos::show_warnings() && ! symmetric ) {
std::cerr << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
<< std::endl ;
}
}