From eea14c55a9bbbe33423f2160f6949fc41ecb6a95 Mon Sep 17 00:00:00 2001
From: Stan Gerald Moore
Date: Tue, 22 Dec 2020 08:52:37 -0700
Subject: [PATCH] Update Kokkos library in LAMMPS to v3.3.0

---
 lib/kokkos/BUILD.md | 13 +-
 lib/kokkos/CHANGELOG.md | 113 +-
 lib/kokkos/CMakeLists.txt | 19 +-
 lib/kokkos/Makefile.kokkos | 118 +-
 lib/kokkos/Makefile.targets | 13 -
 lib/kokkos/README.md | 16 +-
 lib/kokkos/algorithms/src/Kokkos_Random.hpp | 4 +-
 lib/kokkos/algorithms/src/Kokkos_Sort.hpp | 12 +-
 .../algorithms/unit_tests/CMakeLists.txt | 2 +-
 lib/kokkos/algorithms/unit_tests/Makefile | 16 +-
 lib/kokkos/benchmarks/atomic/Makefile | 43 +-
 .../benchmark_suite/scripts/run_tests.bash | 2 +-
 .../benchmarks/bytes_and_flops/Makefile | 4 +-
 lib/kokkos/benchmarks/gather/Makefile | 33 +-
 lib/kokkos/benchmarks/gups/Makefile | 50 +-
 .../gups/{gups-kokkos.cc => gups-kokkos.cpp} | 0
 .../benchmarks/policy_performance/Makefile | 49 +-
 .../benchmarks/policy_performance/main.cpp | 6 +-
 lib/kokkos/benchmarks/stream/Makefile | 50 +-
 .../{stream-kokkos.cc => stream-kokkos.cpp} | 0
 lib/kokkos/bin/kokkos_launch_compiler | 87 +
 lib/kokkos/bin/nvcc_wrapper | 9 +-
 lib/kokkos/cmake/KokkosConfig.cmake.in | 21 +
 lib/kokkos/cmake/KokkosConfigCommon.cmake.in | 70 +
 .../KokkosCore_Config_HeaderSet.in} | 8 +-
 lib/kokkos/cmake/KokkosCore_config.h.in | 4 +-
 lib/kokkos/cmake/README.md | 10 +-
 lib/kokkos/cmake/deps/CUDA.cmake | 35 +-
 lib/kokkos/cmake/deps/CUSPARSE.cmake | 35 +-
 lib/kokkos/cmake/deps/HWLOC.cmake | 35 +-
 lib/kokkos/cmake/deps/Pthread.cmake | 35 +-
 lib/kokkos/cmake/fake_tribits.cmake | 82 +-
 lib/kokkos/cmake/intel.cmake | 20 +-
 lib/kokkos/cmake/kokkos_arch.cmake | 79 +-
 lib/kokkos/cmake/kokkos_compiler_id.cmake | 98 +-
 lib/kokkos/cmake/kokkos_corner_cases.cmake | 2 +-
 lib/kokkos/cmake/kokkos_enable_devices.cmake | 38 +-
 lib/kokkos/cmake/kokkos_functions.cmake | 120 +-
 lib/kokkos/cmake/kokkos_pick_cxx_std.cmake | 12 +-
 lib/kokkos/cmake/kokkos_test_cxx_std.cmake | 29 +-
 lib/kokkos/cmake/kokkos_tpls.cmake | 4 +
 lib/kokkos/cmake/kokkos_tribits.cmake | 103 +-
 lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake | 35 +-
 lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake | 35 +-
 lib/kokkos/cmake/tpls/FindTPLPthread.cmake | 35 +-
 .../performance_tests/CMakeLists.txt | 58 +-
 .../containers/performance_tests/Makefile | 8 +-
 .../containers/performance_tests/TestCuda.cpp | 22 +-
 .../{TestROCm.cpp => TestHIP.cpp} | 45 +-
 .../containers/performance_tests/TestHPX.cpp | 26 +-
 .../containers/performance_tests/TestMain.cpp | 8 +-
 .../performance_tests/TestOpenMP.cpp | 27 +-
 .../performance_tests/TestThreads.cpp | 35 +-
 lib/kokkos/containers/src/Kokkos_Bitset.hpp | 4 +-
 lib/kokkos/containers/src/Kokkos_DualView.hpp | 58 +-
 .../containers/src/Kokkos_DynRankView.hpp | 34 +-
 .../containers/src/Kokkos_OffsetView.hpp | 16 +-
 .../containers/src/Kokkos_ScatterView.hpp | 67 +-
 .../containers/src/Kokkos_UnorderedMap.hpp | 15 +-
 .../containers/unit_tests/CMakeLists.txt | 12 +-
 lib/kokkos/containers/unit_tests/Makefile | 2 +-
 .../containers/unit_tests/TestDualView.hpp | 8 +-
 .../containers/unit_tests/TestDynViewAPI.hpp | 6 +-
 .../containers/unit_tests/TestDynamicView.hpp | 3 +
 .../containers/unit_tests/TestOffsetView.hpp | 27 +-
 .../unit_tests/TestSYCL_Category.hpp | 51 +
 .../containers/unit_tests/TestScatterView.hpp | 57 +-
 .../unit_tests/TestStaticCrsGraph.hpp | 8 +-
 .../containers/unit_tests/TestVector.hpp | 6 +-
 lib/kokkos/core/cmake/KokkosCore_config.h.in | 1 -
 lib/kokkos/core/perf_test/CMakeLists.txt | 33 +-
 lib/kokkos/core/perf_test/Makefile | 12 +
 .../core/perf_test/PerfTest_ViewResize.hpp | 10 +-
 .../perf_test/test_atomic_minmax_simple.cpp | 244 ++
 lib/kokkos/core/src/CMakeLists.txt | 9 +-
 .../src/Cuda/KokkosExp_Cuda_IterateTile.hpp | 1397 --------
 .../KokkosExp_Cuda_IterateTile_Refactor.hpp | 3063 ----------------
 lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp | 94 +-
 .../Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp | 33 +
 .../src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp | 210 ++
 .../Kokkos_Cuda_GraphNode_Impl.hpp} | 105 +-
 .../core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp | 219 ++
 lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp | 710 ++++
 .../core/src/Cuda/Kokkos_Cuda_Instance.cpp | 92 +-
 .../core/src/Cuda/Kokkos_Cuda_Instance.hpp | 4 +-
 .../src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 842 +++--
 .../core/src/Cuda/Kokkos_Cuda_Locks.cpp | 5 +-
 .../core/src/Cuda/Kokkos_Cuda_Locks.hpp | 4 -
 .../core/src/Cuda/Kokkos_Cuda_Parallel.hpp | 303 +-
 .../core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp | 4 +-
 lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp | 3 +-
 lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp | 54 +-
 .../src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp | 2 +
 .../core/src/Cuda/Kokkos_Cuda_abort.hpp | 4 +-
 lib/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp | 88 +-
 .../HIP/Kokkos_HIP_BlockSize_Deduction.hpp | 94 +-
 lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp | 4 +-
 .../core/src/HIP/Kokkos_HIP_Instance.cpp | 90 +-
 .../core/src/HIP/Kokkos_HIP_Instance.hpp | 26 +-
 .../core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 49 +-
 lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp | 20 +-
 lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp | 29 +-
 .../src/HIP/Kokkos_HIP_Parallel_MDRange.hpp | 45 +-
 .../src/HIP/Kokkos_HIP_Parallel_Range.hpp | 12 +-
 .../core/src/HIP/Kokkos_HIP_Parallel_Team.hpp | 402 +--
 .../src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp | 6 -
 lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp | 165 +-
 lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp | 48 +-
 .../core/src/HIP/Kokkos_HIP_Vectorization.hpp | 24 +-
 lib/kokkos/core/src/HPX/Kokkos_HPX.cpp | 56 +-
 .../core/src/KokkosExp_MDRangePolicy.hpp | 461 +--
 lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp | 18 +-
 lib/kokkos/core/src/Kokkos_Atomic.hpp | 27 +-
 lib/kokkos/core/src/Kokkos_Complex.hpp | 264 +-
 lib/kokkos/core/src/Kokkos_Concepts.hpp | 1 +
 lib/kokkos/core/src/Kokkos_CopyViews.hpp | 18 +-
 lib/kokkos/core/src/Kokkos_Core.hpp | 82 +-
 lib/kokkos/core/src/Kokkos_Core_fwd.hpp | 71 +-
 lib/kokkos/core/src/Kokkos_Crs.hpp | 2 +-
 lib/kokkos/core/src/Kokkos_Cuda.hpp | 31 +-
 lib/kokkos/core/src/Kokkos_CudaSpace.hpp | 130 +-
 lib/kokkos/core/src/Kokkos_ExecPolicy.hpp | 162 +-
 lib/kokkos/core/src/Kokkos_Future.hpp | 2 +-
 lib/kokkos/core/src/Kokkos_Graph.hpp | 191 +
 lib/kokkos/core/src/Kokkos_GraphNode.hpp | 462 +++
 .../Kokkos_Graph_fwd.hpp} | 26 +-
 lib/kokkos/core/src/Kokkos_HBWSpace.hpp | 48 +-
 lib/kokkos/core/src/Kokkos_HIP_Space.hpp | 103 +-
 lib/kokkos/core/src/Kokkos_HPX.hpp | 87 +-
 lib/kokkos/core/src/Kokkos_Half.hpp | 119 +
 lib/kokkos/core/src/Kokkos_HostSpace.hpp | 19 +-
 lib/kokkos/core/src/Kokkos_Layout.hpp | 8 +-
 lib/kokkos/core/src/Kokkos_LogicalSpaces.hpp | 428 +++
 lib/kokkos/core/src/Kokkos_Macros.hpp | 190 +-
 lib/kokkos/core/src/Kokkos_OpenMP.hpp | 23 +-
 lib/kokkos/core/src/Kokkos_OpenMPTarget.hpp | 17 +-
 .../core/src/Kokkos_OpenMPTargetSpace.hpp | 2 +-
 lib/kokkos/core/src/Kokkos_Parallel.hpp | 105 +-
 .../core/src/Kokkos_Parallel_Reduce.hpp | 76 +-
 lib/kokkos/core/src/Kokkos_ROCmSpace.hpp | 637 ----
 .../src/{Kokkos_ROCm.hpp => Kokkos_SYCL.hpp} | 204 +-
 lib/kokkos/core/src/Kokkos_SYCL_Space.hpp | 181 +
 lib/kokkos/core/src/Kokkos_ScratchSpace.hpp | 36 +-
 lib/kokkos/core/src/Kokkos_Serial.hpp | 104 +-
 lib/kokkos/core/src/Kokkos_Threads.hpp | 23 +-
 lib/kokkos/core/src/Kokkos_Tuners.hpp | 477 +++
 lib/kokkos/core/src/Kokkos_View.hpp | 39 +-
 .../core/src/OpenMP/Kokkos_OpenMP_Exec.cpp | 46 +
 .../core/src/OpenMP/Kokkos_OpenMP_Exec.hpp | 3 +-
 .../core/src/OpenMP/Kokkos_OpenMP_Team.hpp | 79 +-
 .../OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp | 18 +-
 .../Kokkos_OpenMPTarget_Abort.hpp} | 22 +-
 .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp | 5 +-
 .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp | 729 +++-
 .../Kokkos_OpenMPTarget_Instance.cpp | 104 +-
 .../Kokkos_OpenMPTarget_Instance.hpp | 51 +-
 .../Kokkos_OpenMPTarget_Parallel.hpp | 722 +---
 .../Kokkos_OpenMPTarget_Parallel_MDRange.hpp | 137 +-
 .../OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp | 1 -
 .../KokkosExp_ROCm_IterateTile_Refactor.hpp | 3130 -----------------
 .../core/src/ROCm/Kokkos_ROCm_Atomic.hpp | 534 ---
 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.cpp | 129 -
 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.hpp | 265 --
 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp | 723 ----
 .../core/src/ROCm/Kokkos_ROCm_Parallel.hpp | 1714 ---------
 .../core/src/ROCm/Kokkos_ROCm_Reduce.hpp | 182 -
 .../core/src/ROCm/Kokkos_ROCm_ReduceScan.hpp | 690 ----
 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp | 250 --
 .../core/src/ROCm/Kokkos_ROCm_Space.cpp | 639 ----
 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Task.cpp | 168 -
 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Task.hpp | 448 ---
 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Tile.hpp | 452 ---
 .../src/ROCm/Kokkos_ROCm_Vectorization.hpp | 350 --
 lib/kokkos/core/src/ROCm/hc_math_std.hpp | 371 --
 lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp | 274 ++
 .../core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp | 137 +
 .../core/src/SYCL/Kokkos_SYCL_Instance.cpp | 127 +
 .../SYCL/Kokkos_SYCL_Instance.hpp} | 82 +-
 .../src/SYCL/Kokkos_SYCL_Parallel_Range.hpp | 133 +
 .../src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp | 300 ++
 .../src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp | 324 ++
 .../core/src/SYCL/Kokkos_SYCL_Space.cpp | 438 +++
 .../core/src/Threads/Kokkos_ThreadsExec.cpp | 53 +-
 .../core/src/Threads/Kokkos_ThreadsTeam.hpp | 143 +-
 .../src/Threads/Kokkos_Threads_Parallel.hpp | 41 +-
 .../core/src/decl/Kokkos_Declare_CUDA.hpp | 52 +
 .../core/src/decl/Kokkos_Declare_HBWSpace.hpp | 52 +
 .../decl/Kokkos_Declare_HIP.hpp} | 13 +-
 .../core/src/decl/Kokkos_Declare_HPX.hpp | 52 +
 .../core/src/decl/Kokkos_Declare_OPENMP.hpp | 52 +
 .../decl/Kokkos_Declare_OPENMPTARGET.hpp} | 15 +-
 .../core/src/decl/Kokkos_Declare_SERIAL.hpp | 52 +
 .../core/src/decl/Kokkos_Declare_SYCL.hpp | 56 +
 .../core/src/decl/Kokkos_Declare_THREADS.hpp | 52 +
 .../Kokkos_Fwd_CUDA.hpp} | 0
 .../core/src/fwd/Kokkos_Fwd_HBWSpace.hpp | 57 +
 lib/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp | 56 +
 lib/kokkos/core/src/fwd/Kokkos_Fwd_HPX.hpp | 55 +
 .../Kokkos_Fwd_OPENMP.hpp} | 11 +-
 .../core/src/fwd/Kokkos_Fwd_OPENMPTARGET.hpp | 56 +
 lib/kokkos/core/src/fwd/Kokkos_Fwd_SERIAL.hpp | 53 +
 lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp | 56 +
 .../core/src/fwd/Kokkos_Fwd_THREADS.hpp | 53 +
 .../KokkosExp_IterateTileGPU.hpp} | 215 +-
 .../core/src/impl/Kokkos_AnalyzePolicy.hpp | 213 +-
 .../Kokkos_Atomic_Compare_Exchange_Strong.hpp | 18 +-
 .../Kokkos_Atomic_Compare_Exchange_Weak.hpp | 8 +-
 .../core/src/impl/Kokkos_Atomic_Fetch_Add.hpp | 11 +-
 .../core/src/impl/Kokkos_Atomic_Fetch_And.hpp | 26 +
 .../core/src/impl/Kokkos_Atomic_Fetch_Or.hpp | 27 +
 .../core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp | 11 +-
 .../core/src/impl/Kokkos_Atomic_Generic.hpp | 69 +-
 .../core/src/impl/Kokkos_Atomic_Windows.hpp | 5 +-
 lib/kokkos/core/src/impl/Kokkos_BitOps.hpp | 8 -
 lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp | 3 -
 .../core/src/impl/Kokkos_Combined_Reducer.hpp | 57 +-
 .../core/src/impl/Kokkos_ConcurrentBitset.hpp | 7 +
 lib/kokkos/core/src/impl/Kokkos_Core.cpp | 464 +--
 .../impl/Kokkos_Default_GraphNodeKernel.hpp | 125 +
 .../impl/Kokkos_Default_GraphNode_Impl.hpp | 170 +
 .../src/impl/Kokkos_Default_Graph_Impl.hpp | 197 ++
 .../impl/Kokkos_Default_Graph_fwd.hpp} | 23 +-
 lib/kokkos/core/src/impl/Kokkos_EBO.hpp | 72 +
 lib/kokkos/core/src/impl/Kokkos_Error.cpp | 3 +
 lib/kokkos/core/src/impl/Kokkos_Error.hpp | 11 +-
 .../Kokkos_ExecSpaceInitializer.hpp} | 30 +-
 .../core/src/impl/Kokkos_FunctorAdapter.hpp | 70 +-
 .../core/src/impl/Kokkos_FunctorAnalysis.hpp | 26 +-
 lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp | 156 +
 .../src/impl/Kokkos_GraphImpl_Utilities.hpp | 119 +
 .../impl/Kokkos_GraphImpl_fwd.hpp} | 57 +-
 .../impl/Kokkos_GraphNodeCustomization.hpp} | 76 +-
 .../core/src/impl/Kokkos_GraphNodeImpl.hpp | 298 ++
 lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp | 39 +-
 lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp | 29 +-
 .../core/src/impl/Kokkos_HostThreadTeam.hpp | 13 -
 .../core/src/impl/Kokkos_Memory_Fence.hpp | 2 -
 lib/kokkos/core/src/impl/Kokkos_Profiling.cpp | 176 +-
 lib/kokkos/core/src/impl/Kokkos_Profiling.hpp | 357 +-
 .../src/impl/Kokkos_Profiling_C_Interface.h | 17 +-
 .../src/impl/Kokkos_Profiling_Interface.hpp | 15 +-
 lib/kokkos/core/src/impl/Kokkos_Serial.cpp | 43 +-
 .../core/src/impl/Kokkos_SharedAlloc.cpp | 44 +-
 .../core/src/impl/Kokkos_SharedAlloc.hpp | 6 +-
 .../src/impl/Kokkos_SimpleTaskScheduler.hpp | 74 +-
 .../core/src/impl/Kokkos_TaskQueueCommon.hpp | 4 +-
 .../src/impl/Kokkos_TaskQueueMultiple.hpp | 2 +-
 .../core/src/impl/Kokkos_TaskQueue_impl.hpp | 4 +-
 lib/kokkos/core/src/impl/Kokkos_Traits.hpp | 72 +-
 lib/kokkos/core/src/impl/Kokkos_Utilities.hpp | 160 +-
 .../core/src/impl/Kokkos_VLAEmulation.hpp | 3 +-
 lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp | 29 +-
 lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp | 56 +-
 .../core/src/impl/Kokkos_ViewLayoutTiled.hpp | 121 +-
 .../core/src/impl/Kokkos_ViewMapping.hpp | 44 +-
 .../core/src/setup/Kokkos_Setup_Cuda.hpp | 138 +
 .../core/src/setup/Kokkos_Setup_HIP.hpp | 71 +
 lib/kokkos/core/unit_test/CMakeLists.txt | 374 +-
 lib/kokkos/core/unit_test/Makefile | 126 +-
 lib/kokkos/core/unit_test/TestAtomicViews.hpp | 3 +
 .../{TestAtomic.hpp => TestAtomics.hpp} | 0
 .../TestBlockSizeDeduction.hpp} | 52 +-
 lib/kokkos/core/unit_test/TestCXX11.hpp | 14 +-
 .../core/unit_test/TestCompilerMacros.hpp | 2 +-
 lib/kokkos/core/unit_test/TestComplex.hpp | 86 +-
 lib/kokkos/core/unit_test/TestConcepts.hpp | 6 +
 .../core/unit_test/TestCudaUVM_Category.hpp | 54 +
 ..._SubView_c01.cpp => TestCuda_Category.hpp} | 15 +-
 ...DeepCopy.hpp => TestDeepCopyAlignment.hpp} | 0
 .../unit_test/TestDefaultDeviceTypeInit.hpp | 150 +-
 lib/kokkos/core/unit_test/TestGraph.hpp | 253 ++
 ...ory.hpp => TestHIPHostPinned_Category.hpp} | 9 +-
 .../core/unit_test/TestHIP_Category.hpp | 54 +
 ...nned_Category.hpp => TestHPX_Category.hpp} | 11 +-
 ...TeamScratch.cpp => TestHalfConversion.hpp} | 69 +-
 .../core/unit_test/TestHalfOperators.hpp | 361 ++
 lib/kokkos/core/unit_test/TestInit.hpp | 3 +
 .../core/unit_test/TestIrregularLayout.hpp | 2 +-
 lib/kokkos/core/unit_test/TestMDRange.hpp | 239 +-
 lib/kokkos/core/unit_test/TestMDRange_a.hpp | 3 -
 lib/kokkos/core/unit_test/TestMDRange_b.hpp | 3 -
 lib/kokkos/core/unit_test/TestMDRange_c.hpp | 3 -
 lib/kokkos/core/unit_test/TestMDRange_d.hpp | 3 -
 lib/kokkos/core/unit_test/TestMDRange_e.hpp | 3 -
 ..._MDRangeReduce_c.cpp => TestMDRange_f.hpp} | 7 +-
 .../unit_test/TestOpenMPTarget_Category.hpp | 55 +
 .../core/unit_test/TestOpenMP_Category.hpp | 56 +
 .../TestCuda_Other.cpp => TestOther.hpp} | 5 +-
 .../core/unit_test/TestPolicyConstruction.hpp | 196 +-
 lib/kokkos/core/unit_test/TestRange.hpp | 34 +-
 ...IP_RangePolicy.cpp => TestRangePolicy.hpp} | 5 +-
 ...Require.hpp => TestRangePolicyRequire.hpp} | 20 +-
 lib/kokkos/core/unit_test/TestReduce.hpp | 30 +
 .../unit_test/TestReduceCombinatorical.hpp | 191 +-
 lib/kokkos/core/unit_test/TestReducers.hpp | 94 +-
 ...Cuda_Reductions.cpp => TestReductions.hpp} | 7 +-
 ...View.hpp => TestReductions_DeviceView.hpp} | 0
 lib/kokkos/core/unit_test/TestResize.hpp | 6 +
 .../core/unit_test/TestSYCL_Category.hpp | 53 +
 lib/kokkos/core/unit_test/TestScan.hpp | 33 +-
 .../core/unit_test/TestSerial_Category.hpp | 56 +
 lib/kokkos/core/unit_test/TestSharedAlloc.hpp | 26 +-
 ...estHIP_SubView_a.cpp => TestSubView_a.hpp} | 6 +-
 ...stCuda_SubView_b.cpp => TestSubView_b.hpp} | 4 +-
 ...IP_SubView_c01.cpp => TestSubView_c01.hpp} | 4 +-
 ...da_SubView_c02.cpp => TestSubView_c02.hpp} | 4 +-
 ...IP_SubView_c03.cpp => TestSubView_c03.hpp} | 4 +-
 ...da_SubView_c04.cpp => TestSubView_c04.hpp} | 4 +-
 ...da_SubView_c05.cpp => TestSubView_c05.hpp} | 4 +-
 ...da_SubView_c06.cpp => TestSubView_c06.hpp} | 4 +-
 ...da_SubView_c07.cpp => TestSubView_c07.hpp} | 4 +-
 ...da_SubView_c08.cpp => TestSubView_c08.hpp} | 4 +-
 ...IP_SubView_c09.cpp => TestSubView_c09.hpp} | 4 +-
 ...IP_SubView_c10.cpp => TestSubView_c10.hpp} | 4 +-
 ...PX_SubView_c11.cpp => TestSubView_c11.hpp} | 4 +-
 ...IP_SubView_c12.cpp => TestSubView_c12.hpp} | 4 +-
 ...da_SubView_c13.cpp => TestSubView_c13.hpp} | 4 +-
 .../core/unit_test/TestTaskScheduler.hpp | 4 +-
 lib/kokkos/core/unit_test/TestTeam.hpp | 44 +-
 lib/kokkos/core/unit_test/TestTeamBasic.hpp | 225 ++
 ...tionScan.cpp => TestTeamReductionScan.hpp} | 68 +-
 lib/kokkos/core/unit_test/TestTeamScan.hpp | 182 +
 ...ds_TeamScratch.cpp => TestTeamScratch.hpp} | 31 +-
 .../core/unit_test/TestTeamTeamSize.hpp | 13 +-
 lib/kokkos/core/unit_test/TestTeamVector.hpp | 12 +-
 .../core/unit_test/TestThreads_Category.hpp | 55 +
 lib/kokkos/core/unit_test/TestUniqueToken.hpp | 33 +
 lib/kokkos/core/unit_test/TestUtilities.hpp | 36 +
 lib/kokkos/core/unit_test/TestViewAPI.hpp | 9 +
 lib/kokkos/core/unit_test/TestViewAPI_c.hpp | 3 +
 lib/kokkos/core/unit_test/TestViewAPI_e.hpp | 5 +-
 .../{TestViewCopy.hpp => TestViewCopy_a.hpp} | 200 +-
 lib/kokkos/core/unit_test/TestViewCopy_b.hpp | 268 ++
 .../unit_test/TestViewCtorPropEmbeddedDim.hpp | 6 +
 .../TestViewLayoutStrideAssignment.hpp | 14 +-
 .../core/unit_test/TestViewMapping_a.hpp | 5 +-
 .../core/unit_test/TestViewMapping_b.hpp | 10 +
 .../unit_test/TestViewMapping_subview.hpp | 6 +-
 lib/kokkos/core/unit_test/TestViewSubview.hpp | 8 +-
 lib/kokkos/core/unit_test/TestWorkGraph.hpp | 5 +
 .../UnitTest_CMakePassCmdLineArgs.cpp | 11 +
 .../configuration/test-code/CMakeLists.txt | 4 +-
 .../cuda/TestCudaHostPinned_SharedAlloc.cpp | 8 -
 ....cpp => TestCudaHostPinned_ViewCopy_a.cpp} | 2 +-
 ....cpp => TestCudaHostPinned_ViewCopy_b.cpp} | 4 +-
 .../cuda/TestCudaUVM_SharedAlloc.cpp | 8 -
 ...iewCopy.cpp => TestCudaUVM_ViewCopy_a.cpp} | 2 +-
 ...Complex.cpp => TestCudaUVM_ViewCopy_b.cpp} | 5 +-
 ...estCuda_AtomicOperations_complexdouble.cpp | 46 -
 ...TestCuda_AtomicOperations_complexfloat.cpp | 46 -
 .../cuda/TestCuda_AtomicOperations_double.cpp | 46 -
 .../cuda/TestCuda_AtomicOperations_float.cpp | 46 -
 .../TestCuda_AtomicOperations_longint.cpp | 46 -
 .../TestCuda_AtomicOperations_longlongint.cpp | 46 -
 .../TestCuda_AtomicOperations_unsignedint.cpp | 46 -
 ...tCuda_AtomicOperations_unsignedlongint.cpp | 46 -
 .../core/unit_test/cuda/TestCuda_Category.hpp | 1 +
 .../core/unit_test/cuda/TestCuda_Concepts.cpp | 46 -
 .../core/unit_test/cuda/TestCuda_Crs.cpp | 46 -
 .../cuda/TestCuda_DeepCopyAlignment.cpp | 46 -
 ...uda_RangePolicy.cpp => TestCuda_Graph.cpp} | 2 +-
 .../unit_test/cuda/TestCuda_LocalDeepCopy.cpp | 46 -
 .../unit_test/cuda/TestCuda_MDRange_a.cpp | 47 -
 .../unit_test/cuda/TestCuda_MDRange_b.cpp | 47 -
 .../unit_test/cuda/TestCuda_MDRange_c.cpp | 47 -
 .../unit_test/cuda/TestCuda_MDRange_d.cpp | 47 -
 .../unit_test/cuda/TestCuda_MDRange_e.cpp | 47 -
 .../cuda/TestCuda_RangePolicyRequire.cpp | 47 -
 .../unit_test/cuda/TestCuda_Reducers_a.cpp | 46 -
 .../unit_test/cuda/TestCuda_Reducers_b.cpp | 46 -
 .../unit_test/cuda/TestCuda_Reducers_c.cpp | 46 -
 .../unit_test/cuda/TestCuda_Reducers_d.cpp | 46 -
 .../cuda/TestCuda_Reductions_DeviceView.cpp | 46 -
 .../core/unit_test/cuda/TestCuda_Scan.cpp | 47 -
 .../unit_test/cuda/TestCuda_SharedAlloc.cpp | 54 -
 .../unit_test/cuda/TestCuda_SubView_a.cpp | 102 -
 .../unit_test/cuda/TestCuda_SubView_c03.cpp | 55 -
 .../unit_test/cuda/TestCuda_SubView_c10.cpp | 54 -
 .../unit_test/cuda/TestCuda_SubView_c11.cpp | 55 -
 .../unit_test/cuda/TestCuda_SubView_c12.cpp | 55 -
 .../unit_test/cuda/TestCuda_SubView_c_all.cpp | 13 -
 .../core/unit_test/cuda/TestCuda_Team.cpp | 180 -
 .../cuda/TestCuda_TeamScratchStreams.cpp | 147 +
 .../unit_test/cuda/TestCuda_TeamTeamSize.cpp | 46 -
 .../cuda/TestCuda_TeamVectorRange.cpp | 47 -
 .../unit_test/cuda/TestCuda_UniqueToken.cpp | 46 -
 .../unit_test/cuda/TestCuda_ViewAPI_a.cpp | 46 -
 .../unit_test/cuda/TestCuda_ViewAPI_b.cpp | 46 -
 .../unit_test/cuda/TestCuda_ViewAPI_c.cpp | 46 -
 .../unit_test/cuda/TestCuda_ViewAPI_d.cpp | 46 -
 .../unit_test/cuda/TestCuda_ViewAPI_e.cpp | 47 -
 .../TestCuda_ViewLayoutStrideAssignment.cpp | 47 -
 .../unit_test/cuda/TestCuda_ViewMapping_a.cpp | 46 -
 .../unit_test/cuda/TestCuda_ViewMapping_b.cpp | 46 -
 .../cuda/TestCuda_ViewMapping_subview.cpp | 46 -
 .../unit_test/cuda/TestCuda_ViewOfClass.cpp | 46 -
 .../unit_test/cuda/TestCuda_ViewResize.cpp | 46 -
 .../unit_test/cuda/TestCuda_View_64bit.cpp | 46 -
 .../unit_test/cuda/TestCuda_WorkGraph.cpp | 46 -
 .../default/TestDefaultDeviceType.cpp | 2 +
 .../default/TestDefaultDeviceTypeInit_17.cpp | 2 +
 .../default/TestDefaultDeviceTypeInit_18.cpp | 2 +
 .../default/TestDefaultDeviceTypeResize.cpp | 3 +
 .../default/TestDefaultDeviceType_a1.cpp | 9 +-
 .../default/TestDefaultDeviceType_a2.cpp | 8 +
 .../default/TestDefaultDeviceType_a3.cpp | 3 +
 .../default/TestDefaultDeviceType_b1.cpp | 9 +-
 .../default/TestDefaultDeviceType_b2.cpp | 8 +
 .../default/TestDefaultDeviceType_b3.cpp | 3 +
 .../default/TestDefaultDeviceType_c1.cpp | 9 +-
 .../default/TestDefaultDeviceType_c2.cpp | 8 +-
 .../default/TestDefaultDeviceType_c3.cpp | 3 +-
 .../headers_self_contained/CMakeLists.txt | 35 +-
 .../hip/TestHIPHostPinned_SharedAlloc.cpp | 54 -
 ...y.cpp => TestHIPHostPinned_ViewCopy_a.cpp} | 2 +-
 .../TestHIPHostPinned_ViewCopy_b.cpp} | 4 +-
 ...TestHIP_AtomicOperations_complexdouble.cpp | 46 -
 .../TestHIP_AtomicOperations_complexfloat.cpp | 46 -
 .../hip/TestHIP_AtomicOperations_double.cpp | 46 -
 .../hip/TestHIP_AtomicOperations_float.cpp | 46 -
 .../hip/TestHIP_AtomicOperations_int.cpp | 46 -
 .../hip/TestHIP_AtomicOperations_longint.cpp | 46 -
 .../TestHIP_AtomicOperations_longlongint.cpp | 46 -
 .../TestHIP_AtomicOperations_unsignedint.cpp | 46 -
 ...stHIP_AtomicOperations_unsignedlongint.cpp | 46 -
 .../unit_test/hip/TestHIP_AtomicViews.cpp | 47 -
 .../core/unit_test/hip/TestHIP_Atomics.cpp | 46 -
 .../core/unit_test/hip/TestHIP_Complex.cpp | 47 -
 .../core/unit_test/hip/TestHIP_Concepts.cpp | 46 -
 lib/kokkos/core/unit_test/hip/TestHIP_Crs.cpp | 46 -
 .../hip/TestHIP_DeepCopyAlignment.cpp | 46 -
 .../unit_test/hip/TestHIP_FunctorAnalysis.cpp | 47 -
 .../core/unit_test/hip/TestHIP_Init.cpp | 49 -
 .../unit_test/hip/TestHIP_InterOp_Init.cpp | 6 +-
 .../unit_test/hip/TestHIP_InterOp_Streams.cpp | 8 +-
 .../unit_test/hip/TestHIP_LocalDeepCopy.cpp | 46 -
 .../core/unit_test/hip/TestHIP_MDRange_a.cpp | 47 -
 .../core/unit_test/hip/TestHIP_MDRange_b.cpp | 47 -
 .../core/unit_test/hip/TestHIP_MDRange_c.cpp | 47 -
 .../core/unit_test/hip/TestHIP_MDRange_d.cpp | 47 -
 .../core/unit_test/hip/TestHIP_MDRange_e.cpp | 47 -
 .../core/unit_test/hip/TestHIP_Other.cpp | 53 -
 .../hip/TestHIP_RangePolicyRequire.cpp | 47 -
 .../core/unit_test/hip/TestHIP_Reducers_a.cpp | 46 -
 .../core/unit_test/hip/TestHIP_Reducers_b.cpp | 46 -
 .../core/unit_test/hip/TestHIP_Reducers_c.cpp | 46 -
 .../core/unit_test/hip/TestHIP_Reducers_d.cpp | 46 -
 .../core/unit_test/hip/TestHIP_Reductions.cpp | 47 -
 .../hip/TestHIP_Reductions_DeviceView.cpp | 46 -
 .../core/unit_test/hip/TestHIP_Scan.cpp | 47 -
 .../core/unit_test/hip/TestHIP_ScanUnit.cpp | 2 +-
 .../unit_test/hip/TestHIP_SharedAlloc.cpp | 55 -
 .../core/unit_test/hip/TestHIP_SubView_b.cpp | 66 -
 .../unit_test/hip/TestHIP_SubView_c02.cpp | 55 -
 .../unit_test/hip/TestHIP_SubView_c04.cpp | 54 -
 .../unit_test/hip/TestHIP_SubView_c05.cpp | 55 -
 .../unit_test/hip/TestHIP_SubView_c06.cpp | 55 -
 .../unit_test/hip/TestHIP_SubView_c07.cpp | 54 -
 .../unit_test/hip/TestHIP_SubView_c08.cpp | 55 -
 .../unit_test/hip/TestHIP_SubView_c11.cpp | 55 -
 .../unit_test/hip/TestHIP_SubView_c13.cpp | 54 -
 .../core/unit_test/hip/TestHIP_Team.cpp | 152 -
 .../hip/TestHIP_TeamReductionScan.cpp | 82 -
 .../unit_test/hip/TestHIP_TeamScratch.cpp | 82 -
 .../hip/TestHIP_TeamScratchStreams.cpp | 152 +
 .../unit_test/hip/TestHIP_UniqueToken.cpp | 46 -
 .../core/unit_test/hip/TestHIP_ViewAPI_a.cpp | 46 -
 .../core/unit_test/hip/TestHIP_ViewAPI_b.cpp | 46 -
 .../core/unit_test/hip/TestHIP_ViewAPI_c.cpp | 46 -
 .../core/unit_test/hip/TestHIP_ViewAPI_d.cpp | 46 -
 .../core/unit_test/hip/TestHIP_ViewAPI_e.cpp | 47 -
 .../TestHIP_ViewLayoutStrideAssignment.cpp | 47 -
 .../unit_test/hip/TestHIP_ViewMapping_a.cpp | 46 -
 .../unit_test/hip/TestHIP_ViewMapping_b.cpp | 46 -
 .../hip/TestHIP_ViewMapping_subview.cpp | 46 -
 .../unit_test/hip/TestHIP_ViewOfClass.cpp | 46 -
 .../core/unit_test/hip/TestHIP_ViewResize.cpp | 46 -
 .../core/unit_test/hip/TestHIP_View_64bit.cpp | 46 -
 .../core/unit_test/hip/TestHIP_WorkGraph.cpp | 46 -
 ...TestHPX_AtomicOperations_complexdouble.cpp | 46 -
 .../TestHPX_AtomicOperations_complexfloat.cpp | 46 -
 .../hpx/TestHPX_AtomicOperations_double.cpp | 46 -
 .../hpx/TestHPX_AtomicOperations_float.cpp | 46 -
 .../hpx/TestHPX_AtomicOperations_int.cpp | 46 -
 .../hpx/TestHPX_AtomicOperations_longint.cpp | 46 -
 .../TestHPX_AtomicOperations_longlongint.cpp | 46 -
 .../TestHPX_AtomicOperations_unsignedint.cpp | 46 -
 ...stHPX_AtomicOperations_unsignedlongint.cpp | 46 -
 .../unit_test/hpx/TestHPX_AtomicViews.cpp | 47 -
 .../core/unit_test/hpx/TestHPX_Atomics.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_Complex.cpp | 47 -
 .../core/unit_test/hpx/TestHPX_Concepts.cpp | 46 -
 lib/kokkos/core/unit_test/hpx/TestHPX_Crs.cpp | 46 -
 .../hpx/TestHPX_DeepCopyAlignment.cpp | 46 -
 .../unit_test/hpx/TestHPX_FunctorAnalysis.cpp | 47 -
 .../core/unit_test/hpx/TestHPX_Init.cpp | 49 -
 .../unit_test/hpx/TestHPX_LocalDeepCopy.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_MDRange_a.cpp | 47 -
 .../core/unit_test/hpx/TestHPX_MDRange_b.cpp | 47 -
 .../core/unit_test/hpx/TestHPX_MDRange_c.cpp | 47 -
 .../core/unit_test/hpx/TestHPX_MDRange_d.cpp | 47 -
 .../core/unit_test/hpx/TestHPX_MDRange_e.cpp | 47 -
 .../core/unit_test/hpx/TestHPX_Other.cpp | 44 -
 .../unit_test/hpx/TestHPX_RangePolicy.cpp | 47 -
 .../hpx/TestHPX_RangePolicyRequire.cpp | 47 -
 .../core/unit_test/hpx/TestHPX_Reducers_a.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_Reducers_b.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_Reducers_c.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_Reducers_d.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_Reductions.cpp | 47 -
 .../hpx/TestHPX_Reductions_DeviceView.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_Scan.cpp | 47 -
 .../core/unit_test/hpx/TestHPX_SubView_a.cpp | 94 -
 .../core/unit_test/hpx/TestHPX_SubView_b.cpp | 66 -
 .../unit_test/hpx/TestHPX_SubView_c01.cpp | 54 -
 .../unit_test/hpx/TestHPX_SubView_c02.cpp | 55 -
 .../unit_test/hpx/TestHPX_SubView_c03.cpp | 55 -
 .../unit_test/hpx/TestHPX_SubView_c04.cpp | 54 -
 .../unit_test/hpx/TestHPX_SubView_c05.cpp | 55 -
 .../unit_test/hpx/TestHPX_SubView_c06.cpp | 55 -
 .../unit_test/hpx/TestHPX_SubView_c07.cpp | 54 -
 .../unit_test/hpx/TestHPX_SubView_c08.cpp | 55 -
 .../unit_test/hpx/TestHPX_SubView_c10.cpp | 54 -
 .../unit_test/hpx/TestHPX_SubView_c13.cpp | 54 -
 .../unit_test/hpx/TestHPX_SubView_c_all.cpp | 13 -
 .../core/unit_test/hpx/TestHPX_Team.cpp | 83 -
 .../hpx/TestHPX_TeamReductionScan.cpp | 81 -
 .../unit_test/hpx/TestHPX_TeamScratch.cpp | 78 -
 .../unit_test/hpx/TestHPX_TeamTeamSize.cpp | 46 -
 .../unit_test/hpx/TestHPX_TeamVectorRange.cpp | 47 -
 .../unit_test/hpx/TestHPX_UniqueToken.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_ViewAPI_a.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_ViewAPI_b.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_ViewAPI_c.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_ViewAPI_d.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_ViewAPI_e.cpp | 46 -
 .../TestHPX_ViewLayoutStrideAssignment.cpp | 47 -
 .../unit_test/hpx/TestHPX_ViewMapping_a.cpp | 46 -
 .../unit_test/hpx/TestHPX_ViewMapping_b.cpp | 46 -
 .../hpx/TestHPX_ViewMapping_subview.cpp | 46 -
 .../unit_test/hpx/TestHPX_ViewOfClass.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_ViewResize.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_View_64bit.cpp | 46 -
 .../core/unit_test/hpx/TestHPX_WorkGraph.cpp | 46 -
 .../Test04_ParallelFor_RangePolicy.hpp | 3 +-
 .../Test05_ParallelReduce_RangePolicy.hpp | 94 +-
 .../incremental/Test10_HierarchicalBasics.hpp | 9 +
 .../Test11a_ParallelFor_TeamThreadRange.hpp | 4 +-
 .../Test11b_ParallelFor_TeamVectorRange.hpp | 4 +-
 .../incremental/Test12a_ThreadScratch.hpp | 2 +-
 .../incremental/Test12b_TeamScratch.hpp | 2 +-
 .../Test13a_ParallelRed_TeamThreadRange.hpp | 2 +-
 .../Test13b_ParallelRed_TeamVectorRange.hpp | 2 +-
 .../Test13c_ParallelRed_ThreadVectorRange.hpp | 2 +-
 .../core/unit_test/openmp/TestOpenMP.hpp | 1 -
 ...tOpenMP_AtomicOperations_complexdouble.cpp | 46 -
 ...stOpenMP_AtomicOperations_complexfloat.cpp | 46 -
 .../TestOpenMP_AtomicOperations_double.cpp | 46 -
 .../TestOpenMP_AtomicOperations_float.cpp | 46 -
 .../TestOpenMP_AtomicOperations_int.cpp | 46 -
 .../TestOpenMP_AtomicOperations_longint.cpp | 46 -
 ...estOpenMP_AtomicOperations_longlongint.cpp | 46 -
 ...estOpenMP_AtomicOperations_unsignedint.cpp | 46 -
 ...penMP_AtomicOperations_unsignedlongint.cpp | 46 -
 .../openmp/TestOpenMP_AtomicViews.cpp | 47 -
 .../unit_test/openmp/TestOpenMP_Atomics.cpp | 46 -
 .../unit_test/openmp/TestOpenMP_Category.hpp | 1 +
 .../unit_test/openmp/TestOpenMP_Complex.cpp | 47 -
 .../unit_test/openmp/TestOpenMP_Concepts.cpp | 46 -
 .../core/unit_test/openmp/TestOpenMP_Crs.cpp | 46 -
 .../openmp/TestOpenMP_DeepCopyAlignment.cpp | 46 -
 .../openmp/TestOpenMP_FunctorAnalysis.cpp | 47 -
 ...P_RangePolicy.cpp => TestOpenMP_Graph.cpp} | 2 +-
 .../core/unit_test/openmp/TestOpenMP_Init.cpp | 49 -
 .../openmp/TestOpenMP_LocalDeepCopy.cpp | 46 -
 .../unit_test/openmp/TestOpenMP_MDRange_a.cpp | 47 -
 .../unit_test/openmp/TestOpenMP_MDRange_b.cpp | 47 -
 .../unit_test/openmp/TestOpenMP_MDRange_c.cpp | 47 -
 .../unit_test/openmp/TestOpenMP_MDRange_d.cpp | 47 -
 .../unit_test/openmp/TestOpenMP_MDRange_e.cpp | 47 -
 ...her.cpp => TestOpenMP_PartitionMaster.cpp} | 8 +-
 .../openmp/TestOpenMP_RangePolicyRequire.cpp | 47 -
 .../openmp/TestOpenMP_Reducers_a.cpp | 46 -
 .../openmp/TestOpenMP_Reducers_b.cpp | 46 -
 .../openmp/TestOpenMP_Reducers_c.cpp | 46 -
 .../openmp/TestOpenMP_Reducers_d.cpp | 46 -
 .../openmp/TestOpenMP_Reductions.cpp | 47 -
 .../TestOpenMP_Reductions_DeviceView.cpp | 46 -
 .../core/unit_test/openmp/TestOpenMP_Scan.cpp | 47 -
 .../openmp/TestOpenMP_SharedAlloc.cpp | 54 -
 .../unit_test/openmp/TestOpenMP_SubView_a.cpp | 102 -
 .../unit_test/openmp/TestOpenMP_SubView_b.cpp | 66 -
 .../openmp/TestOpenMP_SubView_c01.cpp | 54 -
 .../openmp/TestOpenMP_SubView_c02.cpp | 55 -
 .../openmp/TestOpenMP_SubView_c03.cpp | 55 -
 .../openmp/TestOpenMP_SubView_c04.cpp | 54 -
 .../openmp/TestOpenMP_SubView_c05.cpp | 55 -
 .../openmp/TestOpenMP_SubView_c06.cpp | 55 -
 .../openmp/TestOpenMP_SubView_c07.cpp | 54 -
 .../openmp/TestOpenMP_SubView_c08.cpp | 55 -
 .../openmp/TestOpenMP_SubView_c09.cpp | 55 -
 .../openmp/TestOpenMP_SubView_c10.cpp | 54 -
 .../openmp/TestOpenMP_SubView_c11.cpp | 55 -
 .../openmp/TestOpenMP_SubView_c12.cpp | 55 -
 .../openmp/TestOpenMP_SubView_c13.cpp | 54 -
 .../openmp/TestOpenMP_SubView_c_all.cpp | 13 -
 .../core/unit_test/openmp/TestOpenMP_Team.cpp | 105 -
 .../openmp/TestOpenMP_TeamReductionScan.cpp | 81 -
 .../openmp/TestOpenMP_TeamScratch.cpp | 79 -
 .../openmp/TestOpenMP_TeamTeamSize.cpp | 46 -
 .../openmp/TestOpenMP_TeamVectorRange.cpp | 47 -
 .../openmp/TestOpenMP_UniqueToken.cpp | 46 -
 .../unit_test/openmp/TestOpenMP_ViewAPI_a.cpp | 46 -
 .../unit_test/openmp/TestOpenMP_ViewAPI_b.cpp | 46 -
 .../unit_test/openmp/TestOpenMP_ViewAPI_c.cpp | 46 -
 .../unit_test/openmp/TestOpenMP_ViewAPI_d.cpp | 46 -
 .../unit_test/openmp/TestOpenMP_ViewAPI_e.cpp | 47 -
 .../TestOpenMP_ViewLayoutStrideAssignment.cpp | 47 -
 .../openmp/TestOpenMP_ViewMapping_a.cpp | 46 -
 .../openmp/TestOpenMP_ViewMapping_b.cpp | 46 -
 .../openmp/TestOpenMP_ViewMapping_subview.cpp | 46 -
 .../openmp/TestOpenMP_ViewOfClass.cpp | 46 -
 .../openmp/TestOpenMP_ViewResize.cpp | 46 -
 .../openmp/TestOpenMP_View_64bit.cpp | 46 -
 .../unit_test/openmp/TestOpenMP_WorkGraph.cpp | 46 -
 .../openmptarget/TestOpenMPTarget.hpp | 1 -
 ...PTarget_AtomicOperations_complexdouble.cpp | 46 -
 ...MPTarget_AtomicOperations_complexfloat.cpp | 46 -
 ...stOpenMPTarget_AtomicOperations_double.cpp | 46 -
 ...estOpenMPTarget_AtomicOperations_float.cpp | 46 -
 .../TestOpenMPTarget_AtomicOperations_int.cpp | 46 -
 ...tOpenMPTarget_AtomicOperations_longint.cpp | 46 -
 ...nMPTarget_AtomicOperations_longlongint.cpp | 46 -
 ...nMPTarget_AtomicOperations_unsignedint.cpp | 46 -
 ...arget_AtomicOperations_unsignedlongint.cpp | 46 -
 .../TestOpenMPTarget_AtomicViews.cpp | 47 -
 .../openmptarget/TestOpenMPTarget_Atomics.cpp | 46 -
 .../openmptarget/TestOpenMPTarget_Complex.cpp | 47 -
 .../TestOpenMPTarget_Concepts.cpp | 46 -
 .../openmptarget/TestOpenMPTarget_Crs.cpp | 0
 .../TestOpenMPTarget_DeepCopyAlignment.cpp | 46 -
 .../TestOpenMPTarget_FunctorAnalysis.cpp | 0
 .../openmptarget/TestOpenMPTarget_Init.cpp | 49 -
 .../TestOpenMPTarget_LocalDeepCopy.cpp | 0
 .../TestOpenMPTarget_MDRange_a.cpp | 47 -
 .../TestOpenMPTarget_MDRange_b.cpp | 47 -
 .../TestOpenMPTarget_MDRange_c.cpp | 47 -
 .../TestOpenMPTarget_MDRange_d.cpp | 47 -
 .../TestOpenMPTarget_MDRange_e.cpp | 47 -
 .../openmptarget/TestOpenMPTarget_Other.cpp | 50 -
 .../TestOpenMPTarget_RangePolicy.cpp | 47 -
 .../TestOpenMPTarget_RangePolicyRequire.cpp | 0
 .../TestOpenMPTarget_Reducers_a.cpp | 46 -
 .../TestOpenMPTarget_Reducers_b.cpp | 46 -
 .../TestOpenMPTarget_Reducers_c.cpp | 46 -
 .../TestOpenMPTarget_Reducers_d.cpp | 46 -
 .../TestOpenMPTarget_Reductions.cpp | 48 -
 ...TestOpenMPTarget_Reductions_DeviceView.cpp | 0
 .../openmptarget/TestOpenMPTarget_Scan.cpp | 47 -
 .../TestOpenMPTarget_SharedAlloc.cpp | 55 -
 .../TestOpenMPTarget_SubView_a.cpp | 102 -
 .../TestOpenMPTarget_SubView_b.cpp | 66 -
 .../TestOpenMPTarget_SubView_c01.cpp | 54 -
 .../TestOpenMPTarget_SubView_c02.cpp | 55 -
 .../TestOpenMPTarget_SubView_c03.cpp | 55 -
 .../TestOpenMPTarget_SubView_c04.cpp | 54 -
 .../TestOpenMPTarget_SubView_c05.cpp | 55 -
 .../TestOpenMPTarget_SubView_c06.cpp | 55 -
 .../TestOpenMPTarget_SubView_c07.cpp | 54 -
 .../TestOpenMPTarget_SubView_c08.cpp | 55 -
 .../TestOpenMPTarget_SubView_c09.cpp | 55 -
 .../TestOpenMPTarget_SubView_c10.cpp | 54 -
 .../TestOpenMPTarget_SubView_c11.cpp | 55 -
 .../TestOpenMPTarget_SubView_c12.cpp | 55 -
 .../TestOpenMPTarget_SubView_c13.cpp | 0
 .../openmptarget/TestOpenMPTarget_Team.cpp | 83 -
 .../TestOpenMPTarget_TeamReductionScan.cpp | 83 -
 .../TestOpenMPTarget_TeamScratch.cpp | 78 -
 .../TestOpenMPTarget_TeamTeamSize.cpp | 0
 .../TestOpenMPTarget_TeamVectorRange.cpp | 0
 .../TestOpenMPTarget_UniqueToken.cpp | 0
 .../TestOpenMPTarget_ViewAPI_a.cpp | 46 -
 .../TestOpenMPTarget_ViewAPI_b.cpp | 46 -
 .../TestOpenMPTarget_ViewAPI_c.cpp | 46 -
 .../TestOpenMPTarget_ViewAPI_d.cpp | 46 -
 .../TestOpenMPTarget_ViewAPI_e.cpp | 46 -
 ...penMPTarget_ViewLayoutStrideAssignment.cpp | 47 -
 .../TestOpenMPTarget_ViewMapping_a.cpp | 46 -
 .../TestOpenMPTarget_ViewMapping_b.cpp | 46 -
 .../TestOpenMPTarget_ViewMapping_subview.cpp | 46 -
 .../TestOpenMPTarget_ViewOfClass.cpp | 46 -
 .../TestOpenMPTarget_ViewResize.cpp | 0
 .../TestOpenMPTarget_View_64bit.cpp | 0
 .../TestOpenMPTarget_WorkGraph.cpp | 0
 .../rocm/TestROCmHostPinned_SharedAlloc.cpp | 54 -
 .../rocm/TestROCmHostPinned_ViewAPI_a.cpp | 46 -
 .../rocm/TestROCmHostPinned_ViewAPI_b.cpp | 46 -
 .../rocm/TestROCmHostPinned_ViewAPI_c.cpp | 46 -
 .../rocm/TestROCmHostPinned_ViewAPI_d.cpp | 46 -
 .../rocm/TestROCmHostPinned_ViewAPI_e.cpp | 46 -
 .../rocm/TestROCmHostPinned_ViewCopy.cpp | 46 -
 .../rocm/TestROCmHostPinned_ViewMapping_a.cpp | 46 -
 .../rocm/TestROCmHostPinned_ViewMapping_b.cpp | 46 -
 ...TestROCmHostPinned_ViewMapping_subview.cpp | 46 -
 .../rocm/TestROCmHostPinned_View_64bit.cpp | 46 -
 .../core/unit_test/rocm/TestROCm_All.cpp | 33 -
 .../rocm/TestROCm_AtomicOperations_double.cpp | 46 -
 .../rocm/TestROCm_AtomicOperations_float.cpp | 46 -
 .../rocm/TestROCm_AtomicOperations_int.cpp | 46 -
 .../TestROCm_AtomicOperations_longint.cpp | 46 -
 .../TestROCm_AtomicOperations_longlongint.cpp | 46 -
 .../TestROCm_AtomicOperations_unsignedint.cpp | 46 -
 ...tROCm_AtomicOperations_unsignedlongint.cpp | 46 -
 .../unit_test/rocm/TestROCm_AtomicViews.cpp | 47 -
 .../core/unit_test/rocm/TestROCm_Atomics.cpp | 46 -
 .../core/unit_test/rocm/TestROCm_Complex.cpp | 47 -
 .../core/unit_test/rocm/TestROCm_Crs.cpp | 47 -
 .../rocm/TestROCm_DeepCopyAlignment.cpp | 46 -
 .../core/unit_test/rocm/TestROCm_Init.cpp | 49 -
 .../rocm/TestROCm_MDRangeReduce_a.cpp | 54 -
 .../rocm/TestROCm_MDRangeReduce_b.cpp | 54 -
 .../rocm/TestROCm_MDRangeReduce_d.cpp | 54 -
 .../rocm/TestROCm_MDRangeReduce_e.cpp | 54 -
 .../unit_test/rocm/TestROCm_MDRange_a.cpp | 47 -
 .../unit_test/rocm/TestROCm_MDRange_b.cpp | 47 -
 .../unit_test/rocm/TestROCm_MDRange_c.cpp | 47 -
 .../unit_test/rocm/TestROCm_MDRange_d.cpp | 47 -
 .../unit_test/rocm/TestROCm_MDRange_e.cpp | 47 -
 .../core/unit_test/rocm/TestROCm_Other.cpp | 52 -
 .../unit_test/rocm/TestROCm_RangePolicy.cpp | 47 -
 .../unit_test/rocm/TestROCm_Reducers_a.cpp | 46 -
 .../unit_test/rocm/TestROCm_Reducers_b.cpp | 46 -
 .../unit_test/rocm/TestROCm_Reducers_c.cpp | 46 -
 .../unit_test/rocm/TestROCm_Reducers_d.cpp | 46 -
 .../unit_test/rocm/TestROCm_Reductions.cpp | 47 -
 .../core/unit_test/rocm/TestROCm_Scan.cpp | 47 -
 .../unit_test/rocm/TestROCm_SharedAlloc.cpp | 55 -
 .../core/unit_test/rocm/TestROCm_Spaces.cpp | 237 --
 .../unit_test/rocm/TestROCm_SubView_a.cpp | 102 -
 .../unit_test/rocm/TestROCm_SubView_c01.cpp | 54 -
 .../unit_test/rocm/TestROCm_SubView_c02.cpp | 55 -
 .../unit_test/rocm/TestROCm_SubView_c03.cpp | 55 -
 .../unit_test/rocm/TestROCm_SubView_c04.cpp | 54 -
 .../unit_test/rocm/TestROCm_SubView_c05.cpp | 55 -
 .../unit_test/rocm/TestROCm_SubView_c06.cpp | 55 -
 .../unit_test/rocm/TestROCm_SubView_c07.cpp | 54 -
 .../unit_test/rocm/TestROCm_SubView_c08.cpp | 55 -
 .../unit_test/rocm/TestROCm_SubView_c09.cpp | 55 -
 .../unit_test/rocm/TestROCm_SubView_c10.cpp | 54 -
 .../unit_test/rocm/TestROCm_SubView_c11.cpp | 55 -
 .../unit_test/rocm/TestROCm_SubView_c12.cpp | 55 -
 .../unit_test/rocm/TestROCm_SubView_c13.cpp | 54 -
 .../core/unit_test/rocm/TestROCm_Team.cpp | 83 -
 .../rocm/TestROCm_TeamReductionScan.cpp | 81 -
 .../unit_test/rocm/TestROCm_TeamTeamSize.cpp | 50 -
 .../unit_test/rocm/TestROCm_ViewAPI_a.cpp | 46 -
 .../unit_test/rocm/TestROCm_ViewAPI_b.cpp | 46 -
 .../unit_test/rocm/TestROCm_ViewAPI_c.cpp | 46 -
 .../unit_test/rocm/TestROCm_ViewAPI_d.cpp | 46 -
 .../unit_test/rocm/TestROCm_ViewAPI_e.cpp | 46 -
 .../unit_test/rocm/TestROCm_ViewMapping_a.cpp | 46 -
 .../unit_test/rocm/TestROCm_ViewMapping_b.cpp | 46 -
 .../rocm/TestROCm_ViewMapping_subview.cpp | 46 -
 .../unit_test/rocm/TestROCm_ViewOfClass.cpp | 46 -
 ...tSerial_AtomicOperations_complexdouble.cpp | 46 -
 ...stSerial_AtomicOperations_complexfloat.cpp | 46 -
 .../TestSerial_AtomicOperations_double.cpp | 46 -
 .../TestSerial_AtomicOperations_float.cpp | 46 -
 .../TestSerial_AtomicOperations_int.cpp | 46 -
 .../TestSerial_AtomicOperations_longint.cpp | 46 -
 ...estSerial_AtomicOperations_longlongint.cpp | 46 -
 ...estSerial_AtomicOperations_unsignedint.cpp | 46 -
 ...erial_AtomicOperations_unsignedlongint.cpp | 46 -
 .../serial/TestSerial_AtomicViews.cpp | 47 -
 .../unit_test/serial/TestSerial_Atomics.cpp | 46 -
 .../unit_test/serial/TestSerial_Category.hpp | 1 +
 .../unit_test/serial/TestSerial_Complex.cpp | 47 -
 .../unit_test/serial/TestSerial_Concepts.cpp | 46 -
 .../core/unit_test/serial/TestSerial_Crs.cpp | 46 -
 .../serial/TestSerial_DeepCopyAlignment.cpp | 46 -
 .../serial/TestSerial_FunctorAnalysis.cpp | 47 -
 ...l_RangePolicy.cpp => TestSerial_Graph.cpp} | 2 +-
 .../core/unit_test/serial/TestSerial_Init.cpp | 49 -
 .../serial/TestSerial_LocalDeepCopy.cpp | 46 -
 .../unit_test/serial/TestSerial_MDRange_a.cpp | 47 -
 .../unit_test/serial/TestSerial_MDRange_b.cpp | 47 -
 .../unit_test/serial/TestSerial_MDRange_c.cpp | 47 -
 .../unit_test/serial/TestSerial_MDRange_d.cpp | 47 -
 .../unit_test/serial/TestSerial_MDRange_e.cpp | 47 -
 .../unit_test/serial/TestSerial_Other.cpp | 53 -
 .../serial/TestSerial_RangePolicyRequire.cpp | 47 -
 .../serial/TestSerial_Reducers_a.cpp | 46 -
 .../serial/TestSerial_Reducers_b.cpp | 46 -
 .../serial/TestSerial_Reducers_c.cpp | 46 -
 .../serial/TestSerial_Reducers_d.cpp | 46 -
 .../serial/TestSerial_Reductions.cpp | 47 -
 .../TestSerial_Reductions_DeviceView.cpp | 46 -
 .../core/unit_test/serial/TestSerial_Scan.cpp | 47 -
 .../serial/TestSerial_SharedAlloc.cpp | 54 -
 .../unit_test/serial/TestSerial_SubView_a.cpp | 102 -
 .../unit_test/serial/TestSerial_SubView_b.cpp | 66 -
 .../serial/TestSerial_SubView_c01.cpp | 54 -
 .../serial/TestSerial_SubView_c02.cpp | 55 -
 .../serial/TestSerial_SubView_c03.cpp | 55 -
 .../serial/TestSerial_SubView_c04.cpp | 54 -
 .../serial/TestSerial_SubView_c05.cpp | 55 -
 .../serial/TestSerial_SubView_c06.cpp | 55 -
 .../serial/TestSerial_SubView_c07.cpp | 54 -
 .../serial/TestSerial_SubView_c08.cpp | 55 -
 .../serial/TestSerial_SubView_c09.cpp | 55 -
 .../serial/TestSerial_SubView_c10.cpp | 54 -
 .../serial/TestSerial_SubView_c11.cpp | 55 -
 .../serial/TestSerial_SubView_c12.cpp | 55 -
 .../serial/TestSerial_SubView_c13.cpp | 54 -
 .../serial/TestSerial_SubView_c_all.cpp | 13 -
 .../core/unit_test/serial/TestSerial_Team.cpp | 166 -
 .../serial/TestSerial_TeamReductionScan.cpp | 81 -
 .../serial/TestSerial_TeamScratch.cpp | 80 -
 .../serial/TestSerial_TeamTeamSize.cpp | 46 -
 .../serial/TestSerial_TeamVectorRange.cpp | 47 -
 .../serial/TestSerial_UniqueToken.cpp | 46 -
 .../unit_test/serial/TestSerial_ViewAPI_a.cpp | 46 -
 .../unit_test/serial/TestSerial_ViewAPI_b.cpp | 46 -
 .../unit_test/serial/TestSerial_ViewAPI_c.cpp | 46 -
 .../unit_test/serial/TestSerial_ViewAPI_d.cpp | 46 -
 .../unit_test/serial/TestSerial_ViewAPI_e.cpp | 47 -
 .../TestSerial_ViewLayoutStrideAssignment.cpp | 47 -
 .../serial/TestSerial_ViewMapping_a.cpp | 46 -
 .../serial/TestSerial_ViewMapping_b.cpp | 46 -
 .../serial/TestSerial_ViewMapping_subview.cpp | 46 -
 .../serial/TestSerial_ViewOfClass.cpp | 46 -
 .../serial/TestSerial_ViewResize.cpp | 46 -
 .../serial/TestSerial_View_64bit.cpp | 46 -
 .../unit_test/serial/TestSerial_WorkGraph.cpp | 46 -
 .../unit_test/standalone/UnitTestMainInit.cpp | 3 -
 ...Threads_AtomicOperations_complexdouble.cpp | 46 -
 ...tThreads_AtomicOperations_complexfloat.cpp | 46 -
 .../TestThreads_AtomicOperations_double.cpp | 46 -
 .../TestThreads_AtomicOperations_float.cpp | 46 -
 .../TestThreads_AtomicOperations_int.cpp | 46 -
 .../TestThreads_AtomicOperations_longint.cpp | 46 -
 ...stThreads_AtomicOperations_longlongint.cpp | 46 -
 ...stThreads_AtomicOperations_unsignedint.cpp | 46 -
 ...reads_AtomicOperations_unsignedlongint.cpp | 46 -
 .../threads/TestThreads_AtomicViews.cpp | 47 -
 .../unit_test/threads/TestThreads_Atomics.cpp | 46 -
 .../unit_test/threads/TestThreads_Complex.cpp | 47 -
 .../threads/TestThreads_Concepts.cpp | 46 -
 .../unit_test/threads/TestThreads_Crs.cpp | 46 -
 .../threads/TestThreads_DeepCopyAlignment.cpp | 46 -
 .../threads/TestThreads_FunctorAnalysis.cpp | 47 -
 .../unit_test/threads/TestThreads_Init.cpp | 49 -
 .../threads/TestThreads_LocalDeepCopy.cpp | 46 -
 .../threads/TestThreads_MDRange_a.cpp | 47 -
 .../threads/TestThreads_MDRange_b.cpp | 47 -
 .../threads/TestThreads_MDRange_c.cpp | 47 -
 .../threads/TestThreads_MDRange_d.cpp | 47 -
 .../threads/TestThreads_MDRange_e.cpp | 47 -
 .../unit_test/threads/TestThreads_Other.cpp | 53 -
 .../threads/TestThreads_RangePolicy.cpp | 47 -
 .../TestThreads_RangePolicyRequire.cpp | 47 -
 .../threads/TestThreads_Reducers_a.cpp | 46 -
 .../threads/TestThreads_Reducers_b.cpp | 46 -
 .../threads/TestThreads_Reducers_c.cpp | 46 -
 .../threads/TestThreads_Reducers_d.cpp | 46 -
 .../threads/TestThreads_Reductions.cpp | 47 -
 .../TestThreads_Reductions_DeviceView.cpp | 46 -
 .../unit_test/threads/TestThreads_Scan.cpp | 47 -
 .../threads/TestThreads_SharedAlloc.cpp | 54 -
 .../threads/TestThreads_SubView_a.cpp | 102 -
 .../threads/TestThreads_SubView_b.cpp | 66 -
 .../threads/TestThreads_SubView_c01.cpp | 54 -
 .../threads/TestThreads_SubView_c02.cpp | 55 -
 .../threads/TestThreads_SubView_c03.cpp | 55 -
 .../threads/TestThreads_SubView_c04.cpp | 54 -
 .../threads/TestThreads_SubView_c05.cpp | 55 -
 .../threads/TestThreads_SubView_c06.cpp | 55 -
 .../threads/TestThreads_SubView_c07.cpp | 54 -
 .../threads/TestThreads_SubView_c08.cpp | 55 -
 .../threads/TestThreads_SubView_c09.cpp | 55 -
 .../threads/TestThreads_SubView_c10.cpp | 54 -
 .../threads/TestThreads_SubView_c11.cpp | 55 -
 .../threads/TestThreads_SubView_c12.cpp | 55 -
 .../threads/TestThreads_SubView_c13.cpp | 54 -
 .../unit_test/threads/TestThreads_Team.cpp | 105 -
 .../threads/TestThreads_TeamReductionScan.cpp | 81 -
 .../threads/TestThreads_TeamTeamSize.cpp | 46 -
 .../threads/TestThreads_TeamVectorRange.cpp | 47 -
 .../threads/TestThreads_UniqueToken.cpp | 46 -
 .../threads/TestThreads_ViewAPI_a.cpp | 46 -
 .../threads/TestThreads_ViewAPI_b.cpp | 46 -
 .../threads/TestThreads_ViewAPI_c.cpp | 46 -
 .../threads/TestThreads_ViewAPI_d.cpp | 46 -
 .../threads/TestThreads_ViewAPI_e.cpp | 47 -
 ...TestThreads_ViewLayoutStrideAssignment.cpp | 47 -
 .../threads/TestThreads_ViewMapping_a.cpp | 46 -
 .../threads/TestThreads_ViewMapping_b.cpp | 46 -
 .../TestThreads_ViewMapping_subview.cpp | 46 -
 .../threads/TestThreads_ViewOfClass.cpp | 46 -
 .../threads/TestThreads_ViewResize.cpp | 46 -
 .../threads/TestThreads_View_64bit.cpp | 46 -
 .../threads/TestThreads_WorkGraph.cpp | 46 -
 .../TestLogicalSpaces.cpp} | 8 +-
 .../unit_test/tools/TestLogicalSpaces.hpp | 198 ++
 .../core/unit_test/tools/TestTuning.cpp | 2 -
 .../tutorial/02_simple_reduce/Makefile | 11 +-
 .../tutorial/02_simple_reduce_lambda/Makefile | 11 +-
 .../example/tutorial/03_simple_view/Makefile | 11 +-
 .../03_simple_view_lambda/CMakeLists.txt | 13 +-
 .../tutorial/03_simple_view_lambda/Makefile | 11 +-
 .../tutorial/04_simple_memoryspaces/Makefile | 11 +-
 .../tutorial/05_simple_atomics/Makefile | 11 +-
 .../Advanced_Views/01_data_layouts/Makefile | 11 +-
 .../Advanced_Views/02_memory_traits/Makefile | 11 +-
 .../Advanced_Views/03_subviews/Makefile | 11 +-
 .../Advanced_Views/04_dualviews/Makefile | 11 +-
 .../Advanced_Views/06_AtomicViews/Makefile | 11 +-
 .../Algorithms/01_random_numbers/Makefile | 11 +-
 .../01_thread_teams/Makefile | 11 +-
 .../01_thread_teams_lambda/Makefile | 11 +-
 .../02_nested_parallel_for/Makefile | 11 +-
 .../03_vectorization/Makefile | 11 +-
 .../04_team_scan/Makefile | 11 +-
 .../example/tutorial/launch_bounds/Makefile | 11 +-
 lib/kokkos/generate_makefile.bash | 141 +-
 lib/kokkos/gnu_generate_makefile.bash | 25 +-
 lib/kokkos/master_history.txt | 1 +
 927 files changed, 18603 insertions(+), 46876 deletions(-)
 rename lib/kokkos/benchmarks/gups/{gups-kokkos.cc => gups-kokkos.cpp} (100%)
 rename lib/kokkos/benchmarks/stream/{stream-kokkos.cc => stream-kokkos.cpp} (100%)
 create mode 100755 lib/kokkos/bin/kokkos_launch_compiler
 rename lib/kokkos/{core/unit_test/cuda/TestCuda_AtomicViews.cpp => cmake/KokkosCore_Config_HeaderSet.in} (96%)
 rename lib/kokkos/containers/performance_tests/{TestROCm.cpp => TestHIP.cpp} (67%)
 create mode 100644 lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp
 create mode 100644 lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
 delete mode 100644 lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
 delete mode 100644 lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
 create mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp
 rename lib/kokkos/core/src/{ROCm/Kokkos_ROCm_Invoke.hpp => Cuda/Kokkos_Cuda_GraphNode_Impl.hpp} (52%)
 create mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
 create mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
 create mode 100644 lib/kokkos/core/src/Kokkos_Graph.hpp
 create mode 100644 lib/kokkos/core/src/Kokkos_GraphNode.hpp
 rename lib/kokkos/core/{unit_test/hpx/TestHPX_SubView_c09.cpp => src/Kokkos_Graph_fwd.hpp} (81%)
 create mode 100644 lib/kokkos/core/src/Kokkos_Half.hpp
 create mode 100644 lib/kokkos/core/src/Kokkos_LogicalSpaces.hpp
 delete mode 100644 lib/kokkos/core/src/Kokkos_ROCmSpace.hpp
 rename lib/kokkos/core/src/{Kokkos_ROCm.hpp => Kokkos_SYCL.hpp} (50%)
 create mode 100644 lib/kokkos/core/src/Kokkos_SYCL_Space.hpp
 create mode 100644 lib/kokkos/core/src/Kokkos_Tuners.hpp
 rename lib/kokkos/core/{unit_test/hpx/TestHPX_SubView_c12.cpp => src/OpenMPTarget/Kokkos_OpenMPTarget_Abort.hpp} (85%)
 delete mode 100644 lib/kokkos/core/src/ROCm/KokkosExp_ROCm_IterateTile_Refactor.hpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Atomic.hpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.cpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.hpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Parallel.hpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_ReduceScan.hpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Space.cpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Task.cpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Task.hpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Tile.hpp
 delete mode 100644 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Vectorization.hpp
 delete mode 100644 lib/kokkos/core/src/ROCm/hc_math_std.hpp
 create mode 100644 lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
 create mode 100644 lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
 create mode 100644 lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
 rename lib/kokkos/core/{unit_test/cuda/TestCuda_TeamScratch.cpp => src/SYCL/Kokkos_SYCL_Instance.hpp} (55%)
 create mode 100644 lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
 create mode 100644 lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
 create mode 100644 lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
 create mode 100644 lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
 create mode 100644 lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp
 create mode 100644 lib/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp
 rename lib/kokkos/core/{unit_test/cuda/TestCuda_Init.cpp => src/decl/Kokkos_Declare_HIP.hpp} (93%)
 create mode 100644 lib/kokkos/core/src/decl/Kokkos_Declare_HPX.hpp
 create mode 100644 lib/kokkos/core/src/decl/Kokkos_Declare_OPENMP.hpp
 rename lib/kokkos/core/{unit_test/hpx/TestHPX_SharedAlloc.cpp => src/decl/Kokkos_Declare_OPENMPTARGET.hpp} (90%)
 create mode 100644 lib/kokkos/core/src/decl/Kokkos_Declare_SERIAL.hpp
 create mode 100644 lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp
 create mode 100644 lib/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp
 rename lib/kokkos/core/src/{Cuda/Kokkos_Cuda_fwd.hpp => fwd/Kokkos_Fwd_CUDA.hpp} (100%)
 create mode 100644 lib/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp
 create mode 100644 lib/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp
 create mode 100644 lib/kokkos/core/src/fwd/Kokkos_Fwd_HPX.hpp
 rename lib/kokkos/core/src/{ROCm/Kokkos_ROCm_Config.hpp => fwd/Kokkos_Fwd_OPENMP.hpp} (91%)
 create mode 100644 lib/kokkos/core/src/fwd/Kokkos_Fwd_OPENMPTARGET.hpp
 create mode 100644 lib/kokkos/core/src/fwd/Kokkos_Fwd_SERIAL.hpp
 create mode 100644 lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
 create mode 100644 lib/kokkos/core/src/fwd/Kokkos_Fwd_THREADS.hpp
 rename lib/kokkos/core/src/{HIP/KokkosExp_HIP_IterateTile.hpp => impl/KokkosExp_IterateTileGPU.hpp} (95%)
 create mode 100644 lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp
 create mode 100644 lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp
 create mode 100644 lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp
 rename lib/kokkos/core/{unit_test/cuda/TestCuda_SubView_c09.cpp => src/impl/Kokkos_Default_Graph_fwd.hpp} (83%)
 rename lib/kokkos/core/src/{ROCm/Kokkos_ROCm_Join.hpp => impl/Kokkos_ExecSpaceInitializer.hpp} (77%)
 create mode 100644 lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp
 create mode 100644 lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp
 rename lib/kokkos/core/{unit_test/rocm/TestROCm_SubView_b.cpp => src/impl/Kokkos_GraphImpl_fwd.hpp} (64%)
 rename lib/kokkos/{algorithms/unit_tests/TestROCm.cpp => core/src/impl/Kokkos_GraphNodeCustomization.hpp} (51%)
 create mode 100644 lib/kokkos/core/src/impl/Kokkos_GraphNodeImpl.hpp
 create mode 100644 lib/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp
 create mode 100644 lib/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp
 rename lib/kokkos/core/unit_test/{TestAtomic.hpp => TestAtomics.hpp} (100%)
 rename lib/kokkos/core/{src/HIP/Kokkos_HIP_KernelLaunch.cpp => unit_test/TestBlockSizeDeduction.hpp} (69%)
 create mode 100644 lib/kokkos/core/unit_test/TestCudaUVM_Category.hpp
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_SubView_c01.cpp => TestCuda_Category.hpp} (89%)
 rename lib/kokkos/core/unit_test/{TestDeepCopy.hpp => TestDeepCopyAlignment.hpp} (100%)
 create mode 100644 lib/kokkos/core/unit_test/TestGraph.hpp
 rename lib/kokkos/core/unit_test/{rocm/TestROCm_Category.hpp => TestHIPHostPinned_Category.hpp} (91%)
 create mode 100644 lib/kokkos/core/unit_test/TestHIP_Category.hpp
 rename lib/kokkos/core/unit_test/{rocm/TestROCmHostPinned_Category.hpp => TestHPX_Category.hpp} (89%)
 rename lib/kokkos/core/unit_test/{rocm/TestROCm_TeamScratch.cpp => TestHalfConversion.hpp} (56%)
 create mode 100644 lib/kokkos/core/unit_test/TestHalfOperators.hpp
 rename lib/kokkos/core/unit_test/{rocm/TestROCm_MDRangeReduce_c.cpp => TestMDRange_f.hpp} (93%)
 create mode 100644 lib/kokkos/core/unit_test/TestOpenMPTarget_Category.hpp
 create mode 100644 lib/kokkos/core/unit_test/TestOpenMP_Category.hpp
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_Other.cpp => TestOther.hpp} (97%)
 rename lib/kokkos/core/unit_test/{hip/TestHIP_RangePolicy.cpp => TestRangePolicy.hpp} (96%)
 rename lib/kokkos/core/unit_test/{TestRangeRequire.hpp => TestRangePolicyRequire.hpp} (96%)
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_Reductions.cpp => TestReductions.hpp} (93%)
 rename lib/kokkos/core/unit_test/{TestReduceDeviceView.hpp => TestReductions_DeviceView.hpp} (100%)
 create mode 100644 lib/kokkos/core/unit_test/TestSYCL_Category.hpp
 create mode 100644 lib/kokkos/core/unit_test/TestSerial_Category.hpp
 rename lib/kokkos/core/unit_test/{hip/TestHIP_SubView_a.cpp => TestSubView_a.hpp} (97%)
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_SubView_b.cpp => TestSubView_b.hpp} (97%)
 rename lib/kokkos/core/unit_test/{hip/TestHIP_SubView_c01.cpp => TestSubView_c01.hpp} (96%)
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_SubView_c02.cpp => TestSubView_c02.hpp} (96%)
 rename lib/kokkos/core/unit_test/{hip/TestHIP_SubView_c03.cpp => TestSubView_c03.hpp} (96%)
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_SubView_c04.cpp => TestSubView_c04.hpp} (96%)
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_SubView_c05.cpp => TestSubView_c05.hpp} (96%)
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_SubView_c06.cpp => TestSubView_c06.hpp} (96%)
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_SubView_c07.cpp => TestSubView_c07.hpp} (96%)
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_SubView_c08.cpp => TestSubView_c08.hpp} (96%)
 rename lib/kokkos/core/unit_test/{hip/TestHIP_SubView_c09.cpp => TestSubView_c09.hpp} (96%)
 rename lib/kokkos/core/unit_test/{hip/TestHIP_SubView_c10.cpp => TestSubView_c10.hpp} (96%)
 rename lib/kokkos/core/unit_test/{hpx/TestHPX_SubView_c11.cpp => TestSubView_c11.hpp} (96%)
 rename lib/kokkos/core/unit_test/{hip/TestHIP_SubView_c12.cpp => TestSubView_c12.hpp} (96%)
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_SubView_c13.cpp => TestSubView_c13.hpp} (96%)
 create mode 100644 lib/kokkos/core/unit_test/TestTeamBasic.hpp
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_TeamReductionScan.cpp => TestTeamReductionScan.hpp} (58%)
 create mode 100644 lib/kokkos/core/unit_test/TestTeamScan.hpp
 rename lib/kokkos/core/unit_test/{threads/TestThreads_TeamScratch.cpp => TestTeamScratch.hpp} (74%)
 create mode 100644 lib/kokkos/core/unit_test/TestThreads_Category.hpp
 rename lib/kokkos/core/unit_test/{TestViewCopy.hpp => TestViewCopy_a.hpp} (65%)
 create mode 100644 lib/kokkos/core/unit_test/TestViewCopy_b.hpp
 create mode 100644 lib/kokkos/core/unit_test/UnitTest_CMakePassCmdLineArgs.cpp
 rename lib/kokkos/core/unit_test/cuda/{TestCudaHostPinned_ViewCopy.cpp => TestCudaHostPinned_ViewCopy_a.cpp} (98%)
 rename lib/kokkos/core/unit_test/cuda/{TestCuda_AtomicOperations_int.cpp => TestCudaHostPinned_ViewCopy_b.cpp} (96%)
 rename lib/kokkos/core/unit_test/cuda/{TestCudaUVM_ViewCopy.cpp => TestCudaUVM_ViewCopy_a.cpp} (98%)
 rename lib/kokkos/core/unit_test/cuda/{TestCuda_Complex.cpp => TestCudaUVM_ViewCopy_b.cpp} (96%)
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_AtomicOperations_complexdouble.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_AtomicOperations_complexfloat.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_AtomicOperations_double.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_AtomicOperations_float.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_AtomicOperations_longint.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_AtomicOperations_longlongint.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_AtomicOperations_unsignedint.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_AtomicOperations_unsignedlongint.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Concepts.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Crs.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_DeepCopyAlignment.cpp
 rename lib/kokkos/core/unit_test/cuda/{TestCuda_RangePolicy.cpp => TestCuda_Graph.cpp} (98%)
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_LocalDeepCopy.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_MDRange_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_MDRange_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_MDRange_c.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_MDRange_d.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_MDRange_e.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_RangePolicyRequire.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Reducers_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Reducers_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Reducers_c.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Reducers_d.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_DeviceView.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Scan.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SharedAlloc.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Team.cpp
 create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_TeamScratchStreams.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_TeamTeamSize.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_TeamVectorRange.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_c.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_d.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_e.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewLayoutStrideAssignment.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewMapping_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewMapping_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewMapping_subview.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewOfClass.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewResize.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_View_64bit.cpp
 delete mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp
 create mode 100644 lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp
 create mode 100644 lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIPHostPinned_SharedAlloc.cpp
 rename lib/kokkos/core/unit_test/hip/{TestHIPHostPinned_ViewCopy.cpp => TestHIPHostPinned_ViewCopy_a.cpp} (98%)
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_Atomics.cpp => hip/TestHIPHostPinned_ViewCopy_b.cpp} (96%)
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_AtomicOperations_complexdouble.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_AtomicOperations_complexfloat.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_AtomicOperations_double.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_AtomicOperations_float.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_AtomicOperations_int.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_AtomicOperations_longint.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_AtomicOperations_longlongint.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_AtomicOperations_unsignedint.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_AtomicOperations_unsignedlongint.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_AtomicViews.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Atomics.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Complex.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Concepts.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Crs.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_DeepCopyAlignment.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_FunctorAnalysis.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Init.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_LocalDeepCopy.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_MDRange_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_MDRange_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_MDRange_c.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_MDRange_d.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_MDRange_e.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Other.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_RangePolicyRequire.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Reducers_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Reducers_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Reducers_c.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Reducers_d.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Reductions.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Reductions_DeviceView.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Scan.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_SharedAlloc.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_SubView_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_SubView_c02.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_SubView_c04.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_SubView_c05.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_SubView_c06.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_SubView_c07.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_SubView_c08.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_SubView_c11.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_SubView_c13.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_Team.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_TeamReductionScan.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_TeamScratch.cpp
 create mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_UniqueToken.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_ViewAPI_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_ViewAPI_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_ViewAPI_c.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_ViewAPI_d.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_ViewAPI_e.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_ViewLayoutStrideAssignment.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_ViewMapping_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_ViewMapping_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_ViewMapping_subview.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_ViewOfClass.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_ViewResize.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_View_64bit.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hip/TestHIP_WorkGraph.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_AtomicOperations_complexdouble.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_AtomicOperations_complexfloat.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_AtomicOperations_double.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_AtomicOperations_float.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_AtomicOperations_int.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_AtomicOperations_longint.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_AtomicOperations_longlongint.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_AtomicOperations_unsignedint.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_AtomicOperations_unsignedlongint.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_AtomicViews.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Atomics.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Complex.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Concepts.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Crs.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_DeepCopyAlignment.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_FunctorAnalysis.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Init.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_LocalDeepCopy.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_MDRange_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_MDRange_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_MDRange_c.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_MDRange_d.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_MDRange_e.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Other.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_RangePolicy.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_RangePolicyRequire.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Reducers_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Reducers_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Reducers_c.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Reducers_d.cpp
 delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Reductions.cpp
 delete mode
100644 lib/kokkos/core/unit_test/hpx/TestHPX_Reductions_DeviceView.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Scan.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_a.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_b.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_c01.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_c02.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_c03.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_c04.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_c05.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_c06.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_c07.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_c08.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_c10.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_c13.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_SubView_c_all.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_Team.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_TeamReductionScan.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_TeamScratch.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_TeamTeamSize.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_TeamVectorRange.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_UniqueToken.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_ViewAPI_a.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_ViewAPI_b.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_ViewAPI_c.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_ViewAPI_d.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_ViewAPI_e.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_ViewLayoutStrideAssignment.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_ViewMapping_a.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_ViewMapping_b.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_ViewMapping_subview.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_ViewOfClass.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_ViewResize.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_View_64bit.cpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_WorkGraph.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_AtomicOperations_complexdouble.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_AtomicOperations_complexfloat.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_AtomicOperations_double.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_AtomicOperations_float.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_AtomicOperations_int.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_AtomicOperations_longint.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_AtomicOperations_longlongint.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_AtomicOperations_unsignedint.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_AtomicOperations_unsignedlongint.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_AtomicViews.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Atomics.cpp delete mode 100644 
lib/kokkos/core/unit_test/openmp/TestOpenMP_Complex.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Concepts.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Crs.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_DeepCopyAlignment.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_FunctorAnalysis.cpp rename lib/kokkos/core/unit_test/openmp/{TestOpenMP_RangePolicy.cpp => TestOpenMP_Graph.cpp} (98%) delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Init.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_LocalDeepCopy.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_MDRange_a.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_MDRange_b.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_MDRange_c.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_MDRange_d.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_MDRange_e.cpp rename lib/kokkos/core/unit_test/openmp/{TestOpenMP_Other.cpp => TestOpenMP_PartitionMaster.cpp} (95%) delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_RangePolicyRequire.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Reducers_a.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Reducers_b.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Reducers_c.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Reducers_d.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Reductions_DeviceView.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Scan.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SharedAlloc.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_a.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c10.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c13.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Team.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_TeamReductionScan.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_TeamScratch.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_TeamTeamSize.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_TeamVectorRange.cpp delete mode 100644 
lib/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_a.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_c.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_d.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_e.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewLayoutStrideAssignment.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewMapping_a.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewMapping_b.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewMapping_subview.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewOfClass.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewResize.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_View_64bit.cpp delete mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicOperations_complexdouble.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicOperations_complexfloat.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicOperations_double.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicOperations_float.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicOperations_int.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicOperations_longint.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicOperations_longlongint.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicOperations_unsignedint.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicOperations_unsignedlongint.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicViews.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Atomics.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Complex.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Concepts.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Crs.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_DeepCopyAlignment.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_FunctorAnalysis.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Init.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_MDRange_a.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_MDRange_b.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_MDRange_c.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_MDRange_d.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_MDRange_e.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Other.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_RangePolicy.cpp delete mode 100644 
lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_RangePolicyRequire.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Reducers_a.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Reducers_b.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Reducers_c.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Reducers_d.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Reductions.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Scan.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SharedAlloc.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_a.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_b.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c01.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c02.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c03.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c04.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c05.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c06.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c07.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c08.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c09.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c10.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c11.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c12.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c13.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Team.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_TeamScratch.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_UniqueToken.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewAPI_a.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewAPI_b.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewAPI_c.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewAPI_d.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewLayoutStrideAssignment.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewMapping_a.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewMapping_b.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewMapping_subview.cpp 
delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewOfClass.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewResize.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_View_64bit.cpp delete mode 100644 lib/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_WorkGraph.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCmHostPinned_SharedAlloc.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewAPI_a.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewAPI_b.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewAPI_c.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewAPI_d.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewAPI_e.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewCopy.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewMapping_a.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewMapping_b.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewMapping_subview.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCmHostPinned_View_64bit.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_All.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_AtomicOperations_double.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_AtomicOperations_float.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_AtomicOperations_int.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_AtomicOperations_longint.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_AtomicOperations_longlongint.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_AtomicOperations_unsignedint.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_AtomicOperations_unsignedlongint.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_AtomicViews.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_Atomics.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_Complex.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_Crs.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_DeepCopyAlignment.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_Init.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_MDRangeReduce_a.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_MDRangeReduce_b.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_MDRangeReduce_d.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_MDRangeReduce_e.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_MDRange_a.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_MDRange_b.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_MDRange_c.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_MDRange_d.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_MDRange_e.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_Other.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_RangePolicy.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_Reducers_a.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_Reducers_b.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_Reducers_c.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_Reducers_d.cpp delete mode 
100644 lib/kokkos/core/unit_test/rocm/TestROCm_Reductions.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_Scan.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SharedAlloc.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_Spaces.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_a.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c01.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c02.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c03.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c04.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c05.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c06.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c07.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c08.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c09.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c10.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c11.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c12.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_SubView_c13.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_Team.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_TeamReductionScan.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_TeamTeamSize.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_ViewAPI_a.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_ViewAPI_b.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_ViewAPI_c.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_ViewAPI_d.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_ViewAPI_e.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_ViewMapping_a.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_ViewMapping_b.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_ViewMapping_subview.cpp delete mode 100644 lib/kokkos/core/unit_test/rocm/TestROCm_ViewOfClass.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_AtomicOperations_complexdouble.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_AtomicOperations_complexfloat.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_AtomicOperations_double.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_AtomicOperations_float.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_AtomicOperations_int.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_AtomicOperations_longint.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_AtomicOperations_longlongint.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_AtomicOperations_unsignedint.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_AtomicOperations_unsignedlongint.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_AtomicViews.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Complex.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Concepts.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Crs.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_DeepCopyAlignment.cpp delete mode 100644 
lib/kokkos/core/unit_test/serial/TestSerial_FunctorAnalysis.cpp rename lib/kokkos/core/unit_test/serial/{TestSerial_RangePolicy.cpp => TestSerial_Graph.cpp} (98%) delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Init.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_LocalDeepCopy.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_MDRange_a.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_MDRange_b.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_MDRange_c.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_MDRange_d.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_MDRange_e.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_RangePolicyRequire.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Reducers_a.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Reducers_b.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Reducers_c.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Reducers_d.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Reductions_DeviceView.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Scan.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SharedAlloc.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c13.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Team.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_TeamReductionScan.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_TeamScratch.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_TeamTeamSize.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_TeamVectorRange.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_UniqueToken.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_a.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_c.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_d.cpp delete mode 
100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_e.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewLayoutStrideAssignment.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewMapping_a.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewMapping_b.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewMapping_subview.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewOfClass.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewResize.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_View_64bit.cpp delete mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_AtomicOperations_complexdouble.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_AtomicOperations_complexfloat.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_AtomicOperations_double.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_AtomicOperations_float.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_AtomicOperations_int.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_AtomicOperations_longint.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_AtomicOperations_longlongint.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_AtomicOperations_unsignedint.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_AtomicOperations_unsignedlongint.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_AtomicViews.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Complex.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Concepts.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Crs.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_DeepCopyAlignment.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_FunctorAnalysis.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Init.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_LocalDeepCopy.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_MDRange_a.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_MDRange_b.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_MDRange_c.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_MDRange_d.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_MDRange_e.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_RangePolicy.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_RangePolicyRequire.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Reducers_a.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Reducers_b.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Reducers_c.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Reducers_d.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Reductions_DeviceView.cpp delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Scan.cpp delete mode 100644 
lib/kokkos/core/unit_test/threads/TestThreads_SharedAlloc.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c13.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Team.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_TeamReductionScan.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_TeamTeamSize.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_TeamVectorRange.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_UniqueToken.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_c.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_d.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_e.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewLayoutStrideAssignment.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewMapping_a.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewMapping_b.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewMapping_subview.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewOfClass.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewResize.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_View_64bit.cpp
 delete mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp
 rename lib/kokkos/core/unit_test/{cuda/TestCuda_FunctorAnalysis.cpp => tools/TestLogicalSpaces.cpp} (94%)
 create mode 100644 lib/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp

diff --git a/lib/kokkos/BUILD.md b/lib/kokkos/BUILD.md
index 7a7e2a8e05..e1f0e3e472 100644
--- a/lib/kokkos/BUILD.md
+++ b/lib/kokkos/BUILD.md
@@ -65,10 +65,16 @@ which activates the OpenMP backend. All of the options controlling device backen
 ## Spack
 An alternative to manually building with the CMake is to use the Spack package manager.
-To do so, download the `kokkos-spack` git repo and add to the package list:
+Make sure you have downloaded [Spack](https://github.com/spack/spack).
+The easiest way to configure the Spack environment is:
 ````bash
-> spack repo add $path-to-kokkos-spack
+> source spack/share/spack/setup-env.sh
 ````
+with other scripts available for other shells.
+You can display information about how to install packages with:
+````bash
+> spack info kokkos
+````
 A basic installation would be done as:
 ````bash
 > spack install kokkos
 ````
@@ -178,8 +183,8 @@ Options can be enabled by specifying `-DKokkos_ENABLE_X`.
 
 ## Other Options
 * Kokkos_CXX_STANDARD
-  * The C++ standard for Kokkos to use: c++11, c++14, c++17, or c++20. This should be given in CMake style as 11, 14, 17, or 20.
-  * STRING Default: 11
+  * The C++ standard for Kokkos to use: c++14, c++17, or c++20. This should be given in CMake style as 14, 17, or 20.
+  * STRING Default: 14
 
 ## Third-party Libraries (TPLs)
 The following options control enabling TPLs:
diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md
index d8baea4c49..2b977bd575 100644
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@@ -1,5 +1,104 @@
 # Change Log
 
+## [3.3.00](https://github.com/kokkos/kokkos/tree/3.3.00) (2020-12-16)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.01...3.3.00)
+
+**Features:**
+- Require C++14 as the minimum C++ standard; C++17 and C++20 are also supported.
+- HIP backend is nearly feature complete; Kokkos Dynamic Task Graphs are still missing.
+- Major update for OpenMPTarget: many capabilities now work. For details, contact us.
+- Added DPC++/SYCL backend: primary capabilities are working.
+- Added Kokkos Graph API analogous to CUDA Graphs (see the sketch after this list).
+- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/3536)
+- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/3546)
+- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/3439)
+- Experimental feature: control CUDA occupancy [\#3379](https://github.com/kokkos/kokkos/pull/3379)
+
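To make the Graph API feature above concrete, a minimal sketch follows. It assumes the `Kokkos::Experimental::create_graph` entry point and the `then_parallel_for` node interface introduced by this release (see `Kokkos_Graph.hpp` in the file list); exact names and overloads should be checked against the headers:

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Graph.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int n = 1000;
    Kokkos::View<double*> x("x", n);
    // Record two dependent kernels; on the Cuda backend the graph can lower
    // onto a native CUDA Graph, other backends use a host-driven default.
    auto graph = Kokkos::Experimental::create_graph([&](auto root) {
      auto fill = root.then_parallel_for(
          "fill", Kokkos::RangePolicy<>(0, n),
          KOKKOS_LAMBDA(const int i) { x(i) = 1.0; });
      // "scale" is only launched after "fill" completes.
      fill.then_parallel_for(
          "scale", Kokkos::RangePolicy<>(0, n),
          KOKKOS_LAMBDA(const int i) { x(i) *= 2.0; });
    });
    graph.submit();  // the recorded work may be submitted repeatedly
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}
````

As with CUDA Graphs, the point of the design is that the dependency structure is described once and is cheap to resubmit.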
+**Implemented enhancements Backends and Archs:**
+- Add A64FX and Fujitsu compiler support [\#3614](https://github.com/kokkos/kokkos/pull/3614)
+- Adding support for AMD gfx908 architecture [\#3375](https://github.com/kokkos/kokkos/pull/3375)
+- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/3583)
+- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/3577)
+- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/3544)
+- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/3550)
+- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/3480)
+- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/3474)
+- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/3451)
+- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/3447)
+- OpenMPTarget: Hierarchical reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/3504)
+- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/3411)
+- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/3440)
+- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/3418)
+- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/3366)
+
+**Implemented enhancements Policies:**
+- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/3494)
+- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/3527) (see the sketch after this list)
+- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/3395)
+- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/3362)
+- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/3369)
+- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/3206)
+- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/3509)
+
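Sketch referenced from the MDRangePolicy entries above — ordinary usage, with the bounds check of \#3527 happening in the policy constructor; everything else is standard Kokkos:

````cpp
#include <Kokkos_Core.hpp>

// Scale a 2D view over a tiled 2D index range. The policy is now
// "semiregular" (default constructible and assignable), and its
// constructor rejects bounds that would silently narrow the index type.
void scale(Kokkos::View<double**> a, double s) {
  using policy_t = Kokkos::MDRangePolicy<Kokkos::Rank<2>>;
  policy_t policy({0, 0}, {a.extent(0), a.extent(1)});
  Kokkos::parallel_for(
      "scale", policy,
      KOKKOS_LAMBDA(const int i, const int j) { a(i, j) *= s; });
}
````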
+**Implemented enhancements BuildSystem:**
+- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/3488)
+- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/3548)
+- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/3136)
+- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/3434)
+- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/3402)
+- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/3457)
+
+**Implemented enhancements Tools:**
+- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/3455)
+- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/3530)
+- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/3518)
+- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/3459)
+- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/3326)
+
+**Implemented enhancements Other:**
+- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/3528)
+- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/3449)
+- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/3436)
+- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/3435)
+- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/3422)
+- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/3416)
+- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/3388)
+- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/3359)
+- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/3357)
+- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/3340)
+- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/3339)
+- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/3338)
+- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/3309)
+- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/3265)
+- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/2941)
+
+**Fixed bugs:**
+- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/3591)
+- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/3588)
+- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/3566)
+- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/3565)
+- Fix Windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/3532)
+- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/3529)
+- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/3510)
+- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/3503)
+- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/3467)
+- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/3458)
+- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/3398)
+- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/3393)
+- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/3390)
+- Fix SIGFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/3378)
+- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/3348)
+- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/3345)
+- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/3343)
+- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/3260)
+
+**Incompatibilities:**
+- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/3535)
+- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/3534)
+- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/3301)
+- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/3264)
+- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/3148)
+
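The team-level parallel_scan listed under the 3.3.00 Features (\#3536) composes with TeamPolicy like any other nested pattern. A minimal sketch of one inclusive prefix sum per league row, with the scan-closure shape assumed from the existing range-level idiom:

````cpp
#include <Kokkos_Core.hpp>

// One team per row; each team computes an inclusive prefix sum of its row.
void row_prefix_sums(Kokkos::View<int**> v) {
  using team_policy = Kokkos::TeamPolicy<>;
  using member_type = team_policy::member_type;
  const int ncols = static_cast<int>(v.extent(1));
  Kokkos::parallel_for(
      "row_scan", team_policy(static_cast<int>(v.extent(0)), Kokkos::AUTO),
      KOKKOS_LAMBDA(const member_type& team) {
        const int row = team.league_rank();
        Kokkos::parallel_scan(
            Kokkos::TeamThreadRange(team, ncols),
            [=](const int i, int& partial, const bool final_pass) {
              partial += v(row, i);                 // accumulate running sum
              if (final_pass) v(row, i) = partial;  // write inclusive result
            });
      });
}
````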
 ## [3.2.01](https://github.com/kokkos/kokkos/tree/3.2.01) (2020-11-17)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.00...3.2.01)
 
@@ -36,37 +135,31 @@
 - Windows Cuda support [\#3018](https://github.com/kokkos/kokkos/issues/3018)
 - Pass `-Wext-lambda-captures-this` to NVCC when support for `__host__ __device__` lambda is enabled from CUDA 11 [\#3241](https://github.com/kokkos/kokkos/issues/3241)
 - Use explicit staging buffer for constant memory kernel launches and cleanup host/device synchronization [\#3234](https://github.com/kokkos/kokkos/issues/3234)
-- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable 1: [\#3202](https://github.com/kokkos/kokkos/issues/3202)
-- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable 2: [\#3203](https://github.com/kokkos/kokkos/issues/3203)
-- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable 3: [\#3196](https://github.com/kokkos/kokkos/issues/3196)
+- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable: [\#3202](https://github.com/kokkos/kokkos/issues/3202), [\#3203](https://github.com/kokkos/kokkos/issues/3203), [\#3196](https://github.com/kokkos/kokkos/issues/3196)
 - Annotations for `DefaultExectutionSpace` and `DefaultHostExectutionSpace` to use in static analysis [\#3189](https://github.com/kokkos/kokkos/issues/3189)
 - Add documentation on using Spack to install Kokkos and developing packages that depend on Kokkos [\#3187](https://github.com/kokkos/kokkos/issues/3187)
-- Improve support for nvcc\_wrapper with exotic host compiler [\#3186](https://github.com/kokkos/kokkos/issues/3186)
 - Add OpenMPTarget backend flags for NVC++ compiler [\#3185](https://github.com/kokkos/kokkos/issues/3185)
 - Move deep\_copy/create\_mirror\_view on Experimental::OffsetView into Kokkos:: namespace [\#3166](https://github.com/kokkos/kokkos/issues/3166)
 - Allow for larger block size in HIP [\#3165](https://github.com/kokkos/kokkos/issues/3165)
 - View: Added names of Views to the different View initialize/free kernels [\#3159](https://github.com/kokkos/kokkos/issues/3159)
 - Cuda: Caching cudaFunctorAttributes and whether L1/Shmem prefer was set [\#3151](https://github.com/kokkos/kokkos/issues/3151)
-- BuildSystem: Provide an explicit default CMAKE\_BUILD\_TYPE [\#3131](https://github.com/kokkos/kokkos/issues/3131)
+- BuildSystem: Improved performance in default configuration by defaulting to Release build [\#3131](https://github.com/kokkos/kokkos/issues/3131)
 - Cuda: Update CUDA occupancy calculation [\#3124](https://github.com/kokkos/kokkos/issues/3124)
 - Vector: Adding data() to Vector [\#3123](https://github.com/kokkos/kokkos/issues/3123)
 - BuildSystem: Add CUDA Ampere configuration support [\#3122](https://github.com/kokkos/kokkos/issues/3122)
 - General: Apply [[noreturn]] to Kokkos::abort when applicable [\#3106](https://github.com/kokkos/kokkos/issues/3106)
 - TeamPolicy: Validate storage level argument passed to TeamPolicy::set\_scratch\_size() [\#3098](https://github.com/kokkos/kokkos/issues/3098)
-- nvcc\_wrapper: send --cudart to nvcc instead of host compiler [\#3092](https://github.com/kokkos/kokkos/issues/3092)
 - BuildSystem: Make kokkos\_has\_string() function in Makefile.kokkos case insensitive [\#3091](https://github.com/kokkos/kokkos/issues/3091)
 - Modify KOKKOS\_FUNCTION macro for clang-tidy analysis [\#3087](https://github.com/kokkos/kokkos/issues/3087)
 - Move allocation profiling to allocate/deallocate calls [\#3084](https://github.com/kokkos/kokkos/issues/3084)
 - BuildSystem: FATAL\_ERROR when attempting in-source build [\#3082](https://github.com/kokkos/kokkos/issues/3082)
 - Change enums in ScatterView to types [\#3076](https://github.com/kokkos/kokkos/issues/3076)
 - HIP: Changes for new compiler/runtime [\#3067](https://github.com/kokkos/kokkos/issues/3067)
-- Extract and use get\_gpu [\#3061](https://github.com/kokkos/kokkos/issues/3061)
-- Extract and use get\_gpu [\#3048](https://github.com/kokkos/kokkos/issues/3048)
+- Extract and use get\_gpu [\#3061](https://github.com/kokkos/kokkos/issues/3061), [\#3048](https://github.com/kokkos/kokkos/issues/3048)
 - Add is\_allocated to View-like containers [\#3059](https://github.com/kokkos/kokkos/issues/3059)
 - Combined reducers for scalar references [\#3052](https://github.com/kokkos/kokkos/issues/3052)
 - Add configurable capacity for UniqueToken [\#3051](https://github.com/kokkos/kokkos/issues/3051)
 - Add installation testing [\#3034](https://github.com/kokkos/kokkos/issues/3034)
-- BuildSystem: Add -expt-relaxed-constexpr flag to nvcc\_wrapper [\#3021](https://github.com/kokkos/kokkos/issues/3021)
 - HIP: Add UniqueToken [\#3020](https://github.com/kokkos/kokkos/issues/3020)
 - Autodetect number of devices [\#3013](https://github.com/kokkos/kokkos/issues/3013)
 
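Two of the entries above concern UniqueToken (\#3051, \#3020), so a small usage sketch may help; the acquire/release pattern is the documented interface, while the kernel body is illustrative:

````cpp
#include <Kokkos_Core.hpp>

// UniqueToken hands each concurrently executing iteration an id that is
// unique among the active iterations, e.g. to index per-worker scratch data.
void accumulate_per_worker() {
  using exec_space = Kokkos::DefaultExecutionSpace;
  Kokkos::Experimental::UniqueToken<exec_space> token;
  Kokkos::View<double*> scratch("scratch", token.size());
  Kokkos::parallel_for(
      "work", Kokkos::RangePolicy<exec_space>(0, 1 << 20),
      KOKKOS_LAMBDA(const int) {
        const auto id = token.acquire();
        scratch(id) += 1.0;  // safe: no other active iteration holds this id
        token.release(id);
      });
}
````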
@@ -82,11 +175,13 @@
 - ScatterView: fix for OpenmpTarget remove inheritance from reducers [\#3162](https://github.com/kokkos/kokkos/issues/3162)
 - BuildSystem: Set OpenMP flags according to host compiler [\#3127](https://github.com/kokkos/kokkos/issues/3127)
 - OpenMP: Fix logic for nested omp in partition\_master bug [\#3101](https://github.com/kokkos/kokkos/issues/3101)
+- nvcc\_wrapper: send --cudart to nvcc instead of host compiler [\#3092](https://github.com/kokkos/kokkos/issues/3092)
 - BuildSystem: Fixes for Cuda/11 and c++17 [\#3085](https://github.com/kokkos/kokkos/issues/3085)
 - HIP: Fix print\_configuration [\#3080](https://github.com/kokkos/kokkos/issues/3080)
 - Conditionally define get\_gpu [\#3072](https://github.com/kokkos/kokkos/issues/3072)
 - Fix bounds for ranges in random number generator [\#3069](https://github.com/kokkos/kokkos/issues/3069)
 - Fix Cuda minor arch check [\#3035](https://github.com/kokkos/kokkos/issues/3035)
+- BuildSystem: Add -expt-relaxed-constexpr flag to nvcc\_wrapper [\#3021](https://github.com/kokkos/kokkos/issues/3021)
 
 **Incompatibilities:**
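Both changelogs touch ScatterView (the strategy enums became types in \#3076, and the OpenMPTarget reducer-inheritance fix appears above). For orientation, the canonical scatter-add pattern — a sketch using the `create_scatter_view`/`contribute` helpers from `Kokkos_ScatterView.hpp`:

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_ScatterView.hpp>

// Histogram via ScatterView: contributions are buffered (atomically or in
// duplicated buffers, chosen per backend) and then reduced into `counts`.
void histogram(Kokkos::View<const int*> bins, Kokkos::View<int*> counts) {
  auto scatter = Kokkos::Experimental::create_scatter_view(counts);
  Kokkos::parallel_for(
      "histogram", bins.extent(0), KOKKOS_LAMBDA(const int i) {
        auto access = scatter.access();
        access(bins(i)) += 1;
      });
  Kokkos::Experimental::contribute(counts, scatter);
}
````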
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) - KOKKOS_INTERNAL_CXX11_FLAG := --c++11 KOKKOS_INTERNAL_CXX14_FLAG := --c++14 KOKKOS_INTERNAL_CXX17_FLAG := --c++17 else ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) - KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11 KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14 KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y #KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17 @@ -294,23 +291,17 @@ else #KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) - KOKKOS_INTERNAL_CXX11_FLAG := -hstd=c++11 KOKKOS_INTERNAL_CXX14_FLAG := -hstd=c++14 #KOKKOS_INTERNAL_CXX1Y_FLAG := -hstd=c++1y #KOKKOS_INTERNAL_CXX17_FLAG := -hstd=c++17 #KOKKOS_INTERNAL_CXX1Z_FLAG := -hstd=c++1z #KOKKOS_INTERNAL_CXX2A_FLAG := -hstd=c++2a else - ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1) - KOKKOS_INTERNAL_CXX11_FLAG := - else - KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11 - KOKKOS_INTERNAL_CXX14_FLAG := --std=c++14 - KOKKOS_INTERNAL_CXX1Y_FLAG := --std=c++1y - KOKKOS_INTERNAL_CXX17_FLAG := --std=c++17 - KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z - KOKKOS_INTERNAL_CXX2A_FLAG := --std=c++2a - endif + KOKKOS_INTERNAL_CXX14_FLAG := --std=c++14 + KOKKOS_INTERNAL_CXX1Y_FLAG := --std=c++1y + KOKKOS_INTERNAL_CXX17_FLAG := --std=c++17 + KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z + KOKKOS_INTERNAL_CXX2A_FLAG := --std=c++2a endif endif endif @@ -377,7 +368,8 @@ KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8 KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv81) KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-ThunderX) KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-TX2) -KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2) | bc)) +KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX) +KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc)) # IBM based. KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ) @@ -392,6 +384,7 @@ KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2) KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900) KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906) +KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega908) # Any AVX? 
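The new A64FX and Vega908 entries are plain substring matches on KOKKOS_ARCH, selected the same way as the existing architectures; an OpenMP build for Fujitsu's A64FX, for example, could be driven as (an illustrative make invocation, assuming the Makefile.kokkos build path):

    make KOKKOS_DEVICES=OpenMP KOKKOS_ARCH=A64FX

KOKKOS_ARCH=Vega908 likewise maps to the --amdgpu-target=gfx908 flag added for HIP further below.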
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) @@ -459,7 +452,6 @@ H := \# # Do not append first line tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp) tmp := $(call kokkos_append_header,"Makefile constructed configuration:") -tmp := $(call kokkos_append_header,"$(shell date)") tmp := $(call kokkos_append_header,"----------------------------------------------*/") tmp := $(call kokkos_append_header,'$H''if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)') @@ -479,10 +471,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)") endif -ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) - tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_ROCM') - tmp := $(call kokkos_append_header,'$H''define KOKKOS_IMPL_ROCM_CLANG_WORKAROUND 1') -endif ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_HIP') endif @@ -542,12 +530,6 @@ endif #only add the c++ standard flags if this is not CMake tmp := $(call kokkos_append_header,"/* General Settings */") -ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1) -ifneq ($(KOKKOS_STANDALONE_CMAKE), yes) - KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG) -endif - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX11") -endif ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1) ifneq ($(KOKKOS_STANDALONE_CMAKE), yes) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG) @@ -765,6 +747,13 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_A64FX") + + KOKKOS_CXXFLAGS += -march=armv8.2-a+sve + KOKKOS_LDFLAGS += -march=armv8.2-a+sve +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2") @@ -1143,6 +1132,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA906") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx906 endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA908), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 908") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx908 + endif + KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp) @@ -1173,6 +1168,55 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h) endif +# Functions for generating config header file +kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1) +kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3) +kokkos_append_config_header = $(shell echo $1 >> $2)) +tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp") +tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp") +tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp") +tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp") +tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") +tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, 
"KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") +tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") +tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_SetupBackend.hpp") + ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) + else + endif +endif +ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") +endif +ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_SetupBackend.hpp") +endif +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") +endif +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") +endif +ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") +endif +ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") +endif +ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") +endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) @@ -1290,7 +1334,7 @@ ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) endif # With Cygwin functions such as fdopen and fileno are not defined -# when strict ansi is enabled. strict ansi gets enabled with --std=c++11 +# when strict ansi is enabled. strict ansi gets enabled with --std=c++14 # though. So we hard undefine it here. Not sure if that has any bad side effects # This is needed for gtest actually, not for Kokkos itself! 
ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1) @@ -1313,7 +1357,9 @@ KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) include $(KOKKOS_PATH)/Makefile.targets kokkos-clean: - rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a + rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \ + KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \ + KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS) ar cr libkokkos.a $(KOKKOS_OBJ_LINK) diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index 525962d2d5..5a03f7d17e 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -53,23 +53,10 @@ Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp -Kokkos_HIP_KernelLaunch.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_KernelLaunch.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_KernelLaunch.cpp Kokkos_HIP_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp endif -ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) -Kokkos_ROCm_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Exec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Exec.cpp -Kokkos_ROCm_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Space.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Space.cpp -Kokkos_ROCm_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Task.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Task.cpp -Kokkos_ROCm_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp -endif - ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md index b67830fde4..d55ef2caac 100644 --- a/lib/kokkos/README.md +++ b/lib/kokkos/README.md @@ -54,24 +54,16 @@ For specifics see the LICENSE file contained in the repository or distribution. 
# Requirements ### Primary tested compilers on X86 are: -* GCC 4.8.4 -* GCC 4.9.3 -* GCC 5.1.0 +* GCC 5.3.0 * GCC 5.4.0 * GCC 5.5.0 * GCC 6.1.0 * GCC 7.2.0 * GCC 7.3.0 * GCC 8.1.0 -* Intel 15.0.2 -* Intel 16.0.1 * Intel 17.0.1 * Intel 17.4.196 * Intel 18.2.128 -* Clang 3.6.1 -* Clang 3.7.1 -* Clang 3.8.1 -* Clang 3.9.0 * Clang 4.0.0 * Clang 6.0.0 for CUDA (CUDA Toolkit 9.0) * Clang 7.0.0 for CUDA (CUDA Toolkit 9.1) @@ -81,6 +73,7 @@ For specifics see the LICENSE file contained in the repository or distribution. * NVCC 9.2 for CUDA (with gcc 7.2.0) * NVCC 10.0 for CUDA (with gcc 7.4.0) * NVCC 10.1 for CUDA (with gcc 7.4.0) +* NVCC 11.0 for CUDA (with gcc 8.4.0) ### Primary tested compilers on Power 8 are: * GCC 6.4.0 (OpenMP,Serial) @@ -89,9 +82,8 @@ For specifics see the LICENSE file contained in the repository or distribution. * NVCC 9.2.88 for CUDA (with gcc 7.2.0 and XL 16.1.0) ### Primary tested compilers on Intel KNL are: -* Intel 16.4.258 (with gcc 4.7.2) -* Intel 17.2.174 (with gcc 4.9.3) -* Intel 18.2.199 (with gcc 4.9.3) +* Intel 17.2.174 (with gcc 6.2.0 and 6.4.0) +* Intel 18.2.199 (with gcc 6.2.0 and 6.4.0) ### Primary tested compilers on ARM (Cavium ThunderX2) * GCC 7.2.0 diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 40d8db2663..69d6cf8f35 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -806,7 +806,7 @@ class Random_XorShift64 { const double V = 2.0 * drand() - 1.0; S = U * U + V * V; } - return U * std::sqrt(-2.0 * log(S) / S); + return U * std::sqrt(-2.0 * std::log(S) / S); } KOKKOS_INLINE_FUNCTION @@ -1042,7 +1042,7 @@ class Random_XorShift1024 { const double V = 2.0 * drand() - 1.0; S = U * U + V * V; } - return U * std::sqrt(-2.0 * log(S) / S); + return U * std::sqrt(-2.0 * std::log(S) / S); } KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp index a95b652eab..d17c02776f 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp @@ -222,12 +222,12 @@ class BinSort { "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins()); bin_count_const = bin_count_atomic; bin_offsets = - offset_type(ViewAllocateWithoutInitializing( - "Kokkos::SortImpl::BinSortFunctor::bin_offsets"), + offset_type(view_alloc(WithoutInitializing, + "Kokkos::SortImpl::BinSortFunctor::bin_offsets"), bin_op.max_bins()); sort_order = - offset_type(ViewAllocateWithoutInitializing( - "Kokkos::SortImpl::BinSortFunctor::sort_order"), + offset_type(view_alloc(WithoutInitializing, + "Kokkos::SortImpl::BinSortFunctor::sort_order"), range_end - range_begin); } @@ -279,8 +279,8 @@ class BinSort { } scratch_view_type sorted_values( - ViewAllocateWithoutInitializing( - "Kokkos::SortImpl::BinSortFunctor::sorted_values"), + view_alloc(WithoutInitializing, + "Kokkos::SortImpl::BinSortFunctor::sorted_values"), values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG, values.rank_dynamic > 1 ? 
values.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt index 969e67c41b..819c9e54ba 100644 --- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -24,7 +24,7 @@ KOKKOS_ADD_TEST_LIBRARY( # avoid deprecation warnings from MSVC TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC GTEST_HAS_TR1_TUPLE=0 GTEST_HAS_PTHREAD=0) -IF(NOT (Kokkos_ENABLE_CUDA AND WIN32)) +IF((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_11) ENDIF() diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile index 4a192b08ec..c112d7c6fc 100644 --- a/lib/kokkos/algorithms/unit_tests/Makefile +++ b/lib/kokkos/algorithms/unit_tests/Makefile @@ -31,10 +31,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) TEST_TARGETS += test-cuda endif -ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) - OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o - TARGETS += KokkosAlgorithms_UnitTest_ROCm - TEST_TARGETS += test-rocm +ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) + OBJ_HIP = TestHIP.o UnitTestMain.o gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_HIP + TEST_TARGETS += test-hip endif ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) @@ -64,8 +64,8 @@ endif KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda -KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm +KokkosAlgorithms_UnitTest_HIP: $(OBJ_HIP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_HIP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_HIP KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads @@ -82,8 +82,8 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) test-cuda: KokkosAlgorithms_UnitTest_Cuda ./KokkosAlgorithms_UnitTest_Cuda -test-rocm: KokkosAlgorithms_UnitTest_ROCm - ./KokkosAlgorithms_UnitTest_ROCm +test-hip: KokkosAlgorithms_UnitTest_HIP + ./KokkosAlgorithms_UnitTest_HIP test-threads: KokkosAlgorithms_UnitTest_Threads ./KokkosAlgorithms_UnitTest_Threads diff --git a/lib/kokkos/benchmarks/atomic/Makefile b/lib/kokkos/benchmarks/atomic/Makefile index 64b43917de..636c0ad4ab 100644 --- a/lib/kokkos/benchmarks/atomic/Makefile +++ b/lib/kokkos/benchmarks/atomic/Makefile @@ -1,31 +1,38 @@ -KOKKOS_PATH = ${HOME}/kokkos -KOKKOS_DEVICES = "OpenMP" -KOKKOS_ARCH = "SNB" -EXE_NAME = "test" +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" -SRC = $(wildcard *.cpp) + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) default: build echo "Start Build" - ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper -EXE = ${EXE_NAME}.cuda -KOKKOS_CUDA_OPTIONS = "enable_lambda" +EXE = atomic_perf.cuda else CXX = g++ -EXE = ${EXE_NAME}.host +EXE = atomic_perf.exe endif -CXXFLAGS = -O3 - -LINK = ${CXX} -LINKFLAGS = -O3 +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = -OBJ = $(SRC:.cpp=.o) +OBJ = $(notdir $(SRC:.cpp=.o)) LIB = include $(KOKKOS_PATH)/Makefile.kokkos @@ -35,10 +42,10 @@ build: $(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) -clean: kokkos-clean - rm -f *.o *.cuda *.host +clean: kokkos-clean + rm -f *.o atomic_perf.cuda atomic_perf.exe # Compilation rules -%.o:%.cpp $(KOKKOS_CPP_DEPENDS) - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash b/lib/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash index 9dded535e8..4fcac3df9f 100755 --- a/lib/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash +++ b/lib/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash @@ -9,7 +9,7 @@ if [[ ${USE_CUDA} > 0 ]]; then BAF_EXE=bytes_and_flops.cuda TEAM_SIZE=256 else - BAF_EXE=bytes_and_flops.host + BAF_EXE=bytes_and_flops.exe TEAM_SIZE=1 fi diff --git a/lib/kokkos/benchmarks/bytes_and_flops/Makefile b/lib/kokkos/benchmarks/bytes_and_flops/Makefile index 6cbef56ff0..1aa4edddcd 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/Makefile +++ b/lib/kokkos/benchmarks/bytes_and_flops/Makefile @@ -1,6 +1,6 @@ KOKKOS_DEVICES=Cuda KOKKOS_CUDA_OPTIONS=enable_lambda -KOKKOS_ARCH = "SNB,Kepler35" +KOKKOS_ARCH = "SNB,Volta70" MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) @@ -22,7 +22,7 @@ CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper EXE = bytes_and_flops.cuda else CXX = g++ -EXE = bytes_and_flops.host +EXE = bytes_and_flops.exe endif CXXFLAGS ?= -O3 -g diff --git a/lib/kokkos/benchmarks/gather/Makefile b/lib/kokkos/benchmarks/gather/Makefile index 0ea9fb1dd2..6827995bed 100644 --- a/lib/kokkos/benchmarks/gather/Makefile +++ b/lib/kokkos/benchmarks/gather/Makefile @@ -1,7 +1,18 @@ -KOKKOS_PATH = ${HOME}/kokkos -SRC = $(wildcard *.cpp) KOKKOS_DEVICES=Cuda KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) default: build echo "Start Build" @@ -9,36 +20,32 @@ default: build ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper EXE = gather.cuda -KOKKOS_DEVICES = "Cuda,OpenMP" -KOKKOS_ARCH = "SNB,Kepler35" else CXX = g++ -EXE = gather.host -KOKKOS_DEVICES = "OpenMP" -KOKKOS_ARCH = "SNB" +EXE = gather.exe endif -CXXFLAGS = -O3 -g +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) DEPFLAGS = -M LINK = ${CXX} LINKFLAGS = -OBJ = $(SRC:.cpp=.o) +OBJ = $(notdir $(SRC:.cpp=.o)) LIB = include $(KOKKOS_PATH)/Makefile.kokkos -$(warning ${KOKKOS_CPPFLAGS}) build: $(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o gather.cuda gather.exe # Compilation rules -%.o:%.cpp $(KOKKOS_CPP_DEPENDS) gather_unroll.hpp gather.hpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/benchmarks/gups/Makefile b/lib/kokkos/benchmarks/gups/Makefile index 7176111664..2a90621d8c 100644 --- a/lib/kokkos/benchmarks/gups/Makefile +++ b/lib/kokkos/benchmarks/gups/Makefile @@ -1,28 +1,38 @@ -#Set your Kokkos path to something appropriate -KOKKOS_PATH = ${HOME}/git/kokkos-github-repo -KOKKOS_DEVICES = "Cuda" -KOKKOS_ARCH = "Pascal60" -KOKKOS_CUDA_OPTIONS = enable_lambda -#KOKKOS_DEVICES = "OpenMP" -#KOKKOS_ARCH = "Power8" +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" -SRC = gups-kokkos.cc + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) default: build echo "Start Build" - -CXXFLAGS = -O3 -CXX = ${HOME}/git/kokkos-github-repo/bin/nvcc_wrapper -#CXX = g++ -LINK = ${CXX} +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = gups.cuda +else +CXX = g++ +EXE = gups.exe +endif -LINKFLAGS = -EXE = gups-kokkos +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = -OBJ = $(SRC:.cc=.o) +OBJ = $(notdir $(SRC:.cpp=.o)) LIB = include $(KOKKOS_PATH)/Makefile.kokkos @@ -32,10 +42,10 @@ build: $(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) -clean: kokkos-clean - rm -f *.o $(EXE) +clean: kokkos-clean + rm -f *.o gups.cuda gups.exe # Compilation rules -%.o:%.cc $(KOKKOS_CPP_DEPENDS) - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/benchmarks/gups/gups-kokkos.cc b/lib/kokkos/benchmarks/gups/gups-kokkos.cpp similarity index 100% rename from lib/kokkos/benchmarks/gups/gups-kokkos.cc rename to lib/kokkos/benchmarks/gups/gups-kokkos.cpp diff --git a/lib/kokkos/benchmarks/policy_performance/Makefile b/lib/kokkos/benchmarks/policy_performance/Makefile index 13aef3209c..f50aea720e 100644 --- a/lib/kokkos/benchmarks/policy_performance/Makefile +++ b/lib/kokkos/benchmarks/policy_performance/Makefile @@ -1,31 +1,38 @@ -KOKKOS_PATH = ../.. -SRC = $(wildcard *.cpp) +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) default: build echo "Start Build" ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper -CXXFLAGS = -O3 -g -LINK = ${CXX} -LINKFLAGS = -EXE = policy_performance.cuda -KOKKOS_DEVICES = "Cuda,OpenMP" -KOKKOS_ARCH = "SNB,Kepler35" -KOKKOS_CUDA_OPTIONS+=enable_lambda +EXE = policy_perf.cuda else CXX = g++ -CXXFLAGS = -O3 -g -Wall -Werror -LINK = ${CXX} -LINKFLAGS = -EXE = policy_performance.host -KOKKOS_DEVICES = "OpenMP" -KOKKOS_ARCH = "SNB" +EXE = policy_perf.exe endif -DEPFLAGS = -M +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) -OBJ = $(SRC:.cpp=.o) +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = + +OBJ = $(notdir $(SRC:.cpp=.o)) LIB = include $(KOKKOS_PATH)/Makefile.kokkos @@ -35,10 +42,10 @@ build: $(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) -clean: kokkos-clean - rm -f *.o *.cuda *.host +clean: kokkos-clean + rm -f *.o policy_perf.cuda policy_perf.exe # Compilation rules -%.o:%.cpp $(KOKKOS_CPP_DEPENDS) main.cpp policy_perf_test.hpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/benchmarks/policy_performance/main.cpp b/lib/kokkos/benchmarks/policy_performance/main.cpp index 5b04c6ab93..da49cdb019 100644 --- a/lib/kokkos/benchmarks/policy_performance/main.cpp +++ b/lib/kokkos/benchmarks/policy_performance/main.cpp @@ -146,11 +146,11 @@ int main(int argc, char* argv[]) { // Call a 'warmup' test with 1 repeat - this will initialize the corresponding // view appropriately for test and should obey first-touch etc Second call to // test is the one we actually care about and time - view_type_1d v_1(Kokkos::ViewAllocateWithoutInitializing("v_1"), + view_type_1d v_1(Kokkos::view_alloc(Kokkos::WithoutInitializing, "v_1"), team_range * team_size); - view_type_2d v_2(Kokkos::ViewAllocateWithoutInitializing("v_2"), + view_type_2d v_2(Kokkos::view_alloc(Kokkos::WithoutInitializing, "v_2"), team_range * team_size, thread_range); - view_type_3d v_3(Kokkos::ViewAllocateWithoutInitializing("v_3"), + view_type_3d v_3(Kokkos::view_alloc(Kokkos::WithoutInitializing, "v_3"), team_range * team_size, thread_range, vector_range); double result_computed = 0.0; diff --git a/lib/kokkos/benchmarks/stream/Makefile b/lib/kokkos/benchmarks/stream/Makefile index 04566b322d..47a13838a4 100644 --- a/lib/kokkos/benchmarks/stream/Makefile +++ b/lib/kokkos/benchmarks/stream/Makefile @@ -1,28 +1,38 @@ -#Set your Kokkos path to something appropriate -KOKKOS_PATH = ${HOME}/git/kokkos-github-repo -#KOKKOS_DEVICES = "Cuda" -#KOKKOS_ARCH = "Pascal60" -#KOKKOS_CUDA_OPTIONS = enable_lambda -KOKKOS_DEVICES = "OpenMP" -KOKKOS_ARCH = "Power8" +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" -SRC = stream-kokkos.cc + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) default: build echo "Start Build" - -CXXFLAGS = -O3 -#CXX = ${HOME}/git/kokkos-github-repo/bin/nvcc_wrapper + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = stream.cuda +else CXX = g++ +EXE = stream.exe +endif -LINK = ${CXX} - -LINKFLAGS = -EXE = stream-kokkos +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = -OBJ = $(SRC:.cc=.o) +OBJ = $(notdir $(SRC:.cpp=.o)) LIB = include $(KOKKOS_PATH)/Makefile.kokkos @@ -32,10 +42,10 @@ build: $(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) -clean: kokkos-clean - rm -f *.o $(EXE) +clean: kokkos-clean + rm -f *.o stream.cuda stream.exe # Compilation rules -%.o:%.cc $(KOKKOS_CPP_DEPENDS) - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/benchmarks/stream/stream-kokkos.cc b/lib/kokkos/benchmarks/stream/stream-kokkos.cpp similarity index 100% rename from lib/kokkos/benchmarks/stream/stream-kokkos.cc rename to lib/kokkos/benchmarks/stream/stream-kokkos.cpp diff --git a/lib/kokkos/bin/kokkos_launch_compiler b/lib/kokkos/bin/kokkos_launch_compiler new file mode 100755 index 0000000000..1fbebf648f --- /dev/null +++ b/lib/kokkos/bin/kokkos_launch_compiler @@ -0,0 +1,87 @@ +#!/bin/bash -e +# +# This script allows CMAKE_CXX_COMPILER to be a standard +# C++ compiler and Kokkos sets RULE_LAUNCH_COMPILE and +# RULE_LAUNCH_LINK in CMake so that all compiler and link +# commands are prefixed with this script followed by the +# C++ compiler. Thus if $1 == $2 then we know the command +# was intended for the C++ compiler and we discard both +# $1 and $2 and redirect the command to NVCC_WRAPPER. +# If $1 != $2 then we know that the command was not intended +# for the C++ compiler and we just discard $1 and launch +# the original command. Examples of when $2 will not equal +# $1 are 'ar', 'cmake', etc. during the linking phase +# + +# check the arguments for the KOKKOS_DEPENDENCE compiler definition +KOKKOS_DEPENDENCE=0 +for i in ${@} +do + if [ -n "$(echo ${i} | grep 'KOKKOS_DEPENDENCE$')" ]; then + KOKKOS_DEPENDENCE=1 + break + fi +done + +# if C++ is not passed, someone is probably trying to invoke it directly +if [ -z "${1}" ]; then + echo -e "\n${BASH_SOURCE[0]} was invoked without the C++ compiler as the first argument." + echo "This script is not intended to be directly invoked by any mechanism other" + echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake\n" + exit 1 +fi + +# if there aren't two args, this isn't necessarily invalid, just a bit strange +if [ -z "${2}" ]; then exit 0; fi + +# store the expected C++ compiler +CXX_COMPILER=${1} + +# remove the expected C++ compiler from the arguments +shift + +# after the above shift, $1 is now the exe for the compile or link command, e.g. +# kokkos_launch_compiler g++ gcc -c file.c -o file.o +# becomes: +# kokkos_launch_compiler gcc -c file.c -o file.o +# Check to see if the executable is the C++ compiler and if it is not, then +# just execute the command.
+# +# Summary: +# kokkos_launch_compiler g++ gcc -c file.c -o file.o +# results in this command being executed: +# gcc -c file.c -o file.o +# and +# kokkos_launch_compiler g++ g++ -c file.cpp -o file.o +# results in this command being executed: +# nvcc_wrapper -c file.cpp -o file.o +if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then + # the command does not depend on Kokkos so just execute the command w/o re-directing to nvcc_wrapper + eval $@ +else + # the executable is the C++ compiler, so we need to re-direct to nvcc_wrapper + + # find the nvcc_wrapper from the same build/install + NVCC_WRAPPER="$(dirname ${BASH_SOURCE[0]})/nvcc_wrapper" + + if [ -z "${NVCC_WRAPPER}" ]; then + echo -e "\nError: nvcc_wrapper not found in $(dirname ${BASH_SOURCE[0]}).\n" + exit 1 + fi + + # set default nvcc wrapper compiler if not specified + : ${NVCC_WRAPPER_DEFAULT_COMPILER:=${CXX_COMPILER}} + export NVCC_WRAPPER_DEFAULT_COMPILER + + # calling itself will cause an infinitely long build + if [ "${NVCC_WRAPPER}" = "${NVCC_WRAPPER_DEFAULT_COMPILER}" ]; then + echo -e "\nError: NVCC_WRAPPER == NVCC_WRAPPER_DEFAULT_COMPILER. Terminating to avoid infinite loop!\n" + exit 1 + fi + + # discard the compiler from the command + shift + + # execute nvcc_wrapper + ${NVCC_WRAPPER} $@ +fi diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index bc213497bf..4ecf4c66d5 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -90,7 +90,12 @@ replace_pragma_ident=0 # Mark first host compiler argument first_xcompiler_arg=1 -temp_dir=${TMPDIR:-/tmp} +# Allow for setting temp dir without setting TMPDIR in parent (see https://docs.olcf.ornl.gov/systems/summit_user_guide.html#setting-tmpdir-causes-jsm-jsrun-errors-job-state-flip-flop) +if [[ -z ${NVCC_WRAPPER_TMPDIR+x} ]]; then + temp_dir=${TMPDIR:-/tmp} +else + temp_dir=${NVCC_WRAPPER_TMPDIR} +fi # optimization flag added as a command-line argument optimization_flag="" @@ -194,7 +199,7 @@ do cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument - -rdc|-maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart) + -rdc|-maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include) cuda_args="$cuda_args $1 $2" shift ;; diff --git a/lib/kokkos/cmake/KokkosConfig.cmake.in b/lib/kokkos/cmake/KokkosConfig.cmake.in index 6f4607687e..9fbd22ee5c 100644 --- a/lib/kokkos/cmake/KokkosConfig.cmake.in +++ b/lib/kokkos/cmake/KokkosConfig.cmake.in @@ -1,3 +1,9 @@ +# No need for policy push/pop. CMake also manages a new entry for scripts +# loaded by include() and find_package() commands except when invoked with +# the NO_POLICY_SCOPE option +# CMP0057 + NEW -> IN_LIST operator in IF(...) +CMAKE_POLICY(SET CMP0057 NEW) + # Compute paths @PACKAGE_INIT@ @@ -12,3 +18,18 @@ GET_FILENAME_COMPONENT(Kokkos_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) INCLUDE("${Kokkos_CMAKE_DIR}/KokkosTargets.cmake") INCLUDE("${Kokkos_CMAKE_DIR}/KokkosConfigCommon.cmake") UNSET(Kokkos_CMAKE_DIR) + +# if CUDA was enabled and separable compilation was specified, e.g.
+# find_package(Kokkos COMPONENTS separable_compilation) +# then we set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK +IF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) + # run test to see if CMAKE_CXX_COMPILER=nvcc_wrapper + kokkos_compiler_is_nvcc(IS_NVCC ${CMAKE_CXX_COMPILER}) + # if not nvcc_wrapper, use RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK + IF(NOT IS_NVCC AND NOT CMAKE_CXX_COMPILER_ID STREQUAL Clang AND + (NOT DEFINED Kokkos_LAUNCH_COMPILER OR Kokkos_LAUNCH_COMPILER)) + MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to nvcc_wrapper") + kokkos_compilation(GLOBAL) + ENDIF() + UNSET(IS_NVCC) # be mindful of the environment, pollution is bad +ENDIF() diff --git a/lib/kokkos/cmake/KokkosConfigCommon.cmake.in b/lib/kokkos/cmake/KokkosConfigCommon.cmake.in index 8e664b27a3..42c755c215 100644 --- a/lib/kokkos/cmake/KokkosConfigCommon.cmake.in +++ b/lib/kokkos/cmake/KokkosConfigCommon.cmake.in @@ -89,3 +89,73 @@ function(kokkos_check) set(${KOKKOS_CHECK_RETURN_VALUE} ${KOKKOS_CHECK_SUCCESS} PARENT_SCOPE) endif() endfunction() + +# this function is provided to easily select which files use nvcc_wrapper: +# +# GLOBAL --> all files +# TARGET --> all files in a target +# SOURCE --> specific source files +# DIRECTORY --> all files in directory +# PROJECT --> all files/targets in a project/subproject +# +FUNCTION(kokkos_compilation) + CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) + + # search relative first and then absolute + SET(_HINTS "${CMAKE_CURRENT_LIST_DIR}/../.." "@CMAKE_INSTALL_PREFIX@") + + # find kokkos_launch_compiler + FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${_HINTS} + PATHS ${_HINTS} + PATH_SUFFIXES bin) + + IF(NOT Kokkos_COMPILE_LAUNCHER) + MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") + ENDIF() + + IF(COMP_GLOBAL) + # if global, don't bother setting others + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + ELSE() + FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) + IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + UNSET(COMP_${_TYPE}) + ENDIF() + # set the properties if defined + IF(COMP_${_TYPE}) + # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + ENDIF() + ENDFOREACH() + ENDIF() +ENDFUNCTION() + +# A test to check whether a downstream project set the C++ compiler to NVCC or not +# this is called only when Kokkos was installed with Kokkos_ENABLE_CUDA=ON +FUNCTION(kokkos_compiler_is_nvcc VAR COMPILER) + # Check if the compiler is nvcc (which really means nvcc_wrapper). 
+ EXECUTE_PROCESS(COMMAND ${COMPILER} ${ARGN} --version + OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RET) + # something went wrong + IF(RET GREATER 0) + SET(${VAR} false PARENT_SCOPE) + ELSE() + STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) + STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) + STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) + SET(${VAR} true PARENT_SCOPE) + ELSE() + SET(${VAR} false PARENT_SCOPE) + ENDIF() + ENDIF() +ENDFUNCTION() + diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_AtomicViews.cpp b/lib/kokkos/cmake/KokkosCore_Config_HeaderSet.in similarity index 96% rename from lib/kokkos/core/unit_test/cuda/TestCuda_AtomicViews.cpp rename to lib/kokkos/cmake/KokkosCore_Config_HeaderSet.in index 56c5fe6f3e..8d1eee31b2 100644 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_AtomicViews.cpp +++ b/lib/kokkos/cmake/KokkosCore_Config_HeaderSet.in @@ -1,4 +1,3 @@ - /* //@HEADER // ************************************************************************ @@ -42,6 +41,9 @@ // ************************************************************************ //@HEADER */ +#ifndef @HEADER_GUARD_TAG@ +#define @HEADER_GUARD_TAG@ -#include -#include +@INCLUDE_NEXT_FILE@ + +#endif diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index c0362e4fb0..0259fe69d5 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -21,6 +21,7 @@ #cmakedefine KOKKOS_ENABLE_HPX #cmakedefine KOKKOS_ENABLE_MEMKIND #cmakedefine KOKKOS_ENABLE_LIBRT +#cmakedefine KOKKOS_ENABLE_SYCL #ifndef __CUDA_ARCH__ #cmakedefine KOKKOS_ENABLE_TM @@ -31,7 +32,6 @@ #endif /* General Settings */ -#cmakedefine KOKKOS_ENABLE_CXX11 #cmakedefine KOKKOS_ENABLE_CXX14 #cmakedefine KOKKOS_ENABLE_CXX17 #cmakedefine KOKKOS_ENABLE_CXX20 @@ -58,7 +58,7 @@ /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC #cmakedefine KOKKOS_USE_LIBRT -#cmakedefine KOKKOS_ENABLE_HWBSPACE +#cmakedefine KOKKOS_ENABLE_HBWSPACE #cmakedefine KOKKOS_ENABLE_LIBDL #cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND diff --git a/lib/kokkos/cmake/README.md b/lib/kokkos/cmake/README.md index 6d0cc2daf1..385bbfcd5d 100644 --- a/lib/kokkos/cmake/README.md +++ b/lib/kokkos/cmake/README.md @@ -73,20 +73,20 @@ Compiler features are more fine-grained and require conflicting requests to be r Suppose I have ```` add_library(A a.cpp) -target_compile_features(A PUBLIC cxx_std_11) +target_compile_features(A PUBLIC cxx_std_14) ```` then another target ```` add_library(B b.cpp) -target_compile_features(B PUBLIC cxx_std_14) +target_compile_features(B PUBLIC cxx_std_17) target_link_libraries(A B) ```` I have requested two different features. -CMake understands the requests and knows that `cxx_std_11` is a subset of `cxx_std_14`. -CMake then picks C++14 for library `B`. +CMake understands the requests and knows that `cxx_std_14` is a subset of `cxx_std_17`. +CMake then picks C++17 for library `B`. 
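Concretely, the PUBLIC feature on B propagates through target_link_libraries, so both targets end up compiled at the higher standard; with GCC's defaults the resulting compile lines look roughly like (illustrative):

    g++ -std=gnu++17 -c b.cpp
    g++ -std=gnu++17 -c a.cpp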
CMake would not have been able to do feature resolution if we had directly done: ```` -target_compile_options(A PUBLIC -std=c++11) +target_compile_options(A PUBLIC -std=c++14) ```` ### Adding Kokkos Options diff --git a/lib/kokkos/cmake/deps/CUDA.cmake b/lib/kokkos/cmake/deps/CUDA.cmake index 4876bca259..beaf4e6d6c 100644 --- a/lib/kokkos/cmake/deps/CUDA.cmake +++ b/lib/kokkos/cmake/deps/CUDA.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/cmake/deps/CUSPARSE.cmake b/lib/kokkos/cmake/deps/CUSPARSE.cmake index b2420d1168..073c40d814 100644 --- a/lib/kokkos/cmake/deps/CUSPARSE.cmake +++ b/lib/kokkos/cmake/deps/CUSPARSE.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/cmake/deps/HWLOC.cmake b/lib/kokkos/cmake/deps/HWLOC.cmake index ed89c8c1e5..f8402db00a 100644 --- a/lib/kokkos/cmake/deps/HWLOC.cmake +++ b/lib/kokkos/cmake/deps/HWLOC.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/cmake/deps/Pthread.cmake b/lib/kokkos/cmake/deps/Pthread.cmake index 5f835fc300..639e4ef697 100644 --- a/lib/kokkos/cmake/deps/Pthread.cmake +++ b/lib/kokkos/cmake/deps/Pthread.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/cmake/fake_tribits.cmake b/lib/kokkos/cmake/fake_tribits.cmake index db7680f56a..2e82a46235 100644 --- a/lib/kokkos/cmake/fake_tribits.cmake +++ b/lib/kokkos/cmake/fake_tribits.cmake @@ -38,12 +38,6 @@ MACRO(GLOBAL_SET VARNAME) SET(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) ENDMACRO() -FUNCTION(VERIFY_EMPTY CONTEXT) -if(${ARGN}) -MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") -endif() -ENDFUNCTION() - MACRO(PREPEND_GLOBAL_SET VARNAME) ASSERT_DEFINED(${VARNAME}) GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) @@ -89,7 +83,7 @@ FUNCTION(KOKKOS_ADD_TEST) CMAKE_PARSE_ARGUMENTS(TEST "" "EXE;NAME;TOOL" - "" + "ARGS" ${ARGN}) IF(TEST_EXE) SET(EXE_ROOT ${TEST_EXE}) @@ -102,6 +96,7 @@ FUNCTION(KOKKOS_ADD_TEST) NAME ${TEST_NAME} COMM serial mpi NUM_MPI_PROCS 1 + ARGS ${TEST_ARGS} ${TEST_UNPARSED_ARGUMENTS} ADDED_TESTS_NAMES_OUT ALL_TESTS_ADDED ) @@ -110,18 +105,25 @@ FUNCTION(KOKKOS_ADD_TEST) SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - foreach(TEST_ADDED ${ALL_TESTS_ADDED}) - set_property(TEST ${TEST_ADDED} APPEND PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") - endforeach() + # The function TRIBITS_ADD_TEST() has a CATEGORIES argument that defaults + # to BASIC. If a project elects to only enable tests marked as PERFORMANCE, + # the test won't actually be added and attempting to set a property on it below + # will yield an error. + if(TARGET ${EXE}) + if(TEST_TOOL) + add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + foreach(TEST_ADDED ${ALL_TESTS_ADDED}) + set_property(TEST ${TEST_ADDED} APPEND PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") + endforeach() + endif() endif() else() CMAKE_PARSE_ARGUMENTS(TEST "WILL_FAIL" "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" - "CATEGORIES;CMD_ARGS" + "CATEGORIES;ARGS" ${ARGN}) + SET(TESTS_ADDED) # To match Tribits, we should always be receiving # the root names of exes/libs IF(TEST_EXE) @@ -133,24 +135,46 @@ FUNCTION(KOKKOS_ADD_TEST) # These should be the full target name SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_CMD_ARGS}) + IF (TEST_ARGS) + SET(TEST_NUMBER 0) + FOREACH (ARG_STR ${TEST_ARGS}) + # This is passed as a single string blob to match TriBITS behavior + # We need this to be turned into a list + STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) + IF(WIN32) + ADD_TEST(NAME ${TEST_NAME}${TEST_NUMBER} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} + COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${ARG_STR_LIST}) + ELSE() + ADD_TEST(NAME ${TEST_NAME}${TEST_NUMBER} COMMAND ${EXE} ${ARG_STR_LIST}) + ENDIF() + LIST(APPEND TESTS_ADDED "${TEST_NAME}${TEST_NUMBER}") + MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") + ENDFOREACH() ELSE() - ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_CMD_ARGS}) + IF(WIN32) + ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} + COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX}) + ELSE() + ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE}) + ENDIF() + LIST(APPEND TESTS_ADDED "${TEST_NAME}") ENDIF() - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - 
ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) - ENDIF() - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - set_property(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$<TARGET_FILE:${TEST_TOOL}>") - endif() + + FOREACH(TEST_NAME ${TESTS_ADDED}) + IF(TEST_WILL_FAIL) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + ENDIF() + IF(TEST_FAIL_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_PASS_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + ENDIF() + if(TEST_TOOL) + add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + set_property(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$<TARGET_FILE:${TEST_TOOL}>") + endif() + ENDFOREACH() VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) endif() ENDFUNCTION() diff --git a/lib/kokkos/cmake/intel.cmake b/lib/kokkos/cmake/intel.cmake index f36f01d8ca..7e6ee3358c 100644 --- a/lib/kokkos/cmake/intel.cmake +++ b/lib/kokkos/cmake/intel.cmake @@ -3,7 +3,7 @@ FUNCTION(kokkos_set_intel_flags full_standard int_standard) STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) STRING(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from - # /Modules/Compiler/Intel-CXX.cmake from CMake 3.7.2 and then modified. + # /Modules/Compiler/Intel-CXX.cmake from CMake 3.18.1 and then modified. IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) SET(_std -Qstd) SET(_ext c++) @@ -11,20 +11,8 @@ FUNCTION(kokkos_set_intel_flags full_standard int_standard) SET(_std -std) SET(_ext gnu++) ENDIF() - - IF(NOT KOKKOS_CXX_STANDARD STREQUAL 11 AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.2) - #There is no gnu++14 value supported; figure out what to do.
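The reworked KOKKOS_ADD_TEST above treats each ARGS entry as a single space-separated blob (to match TriBITS), splits it into a CMake list, and registers one numbered CTest entry per blob, collecting every name in TESTS_ADDED so that WILL_FAIL and the *_REGULAR_EXPRESSION properties are applied to each instance. A minimal usage sketch of the non-Trilinos path, with hypothetical test and executable names:

    # Hypothetical caller: registers ${PACKAGE_NAME}_PerfTest0 (running
    # "--size 1024") and ${PACKAGE_NAME}_PerfTest1 (running
    # "--size 4096 --repeat 10") as separate CTest entries.
    KOKKOS_ADD_TEST(
      NAME PerfTest
      EXE  PerformanceTest_Demo
      ARGS "--size 1024" "--size 4096 --repeat 10"
    )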
- SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "${_std}=c++${INT_LC_STANDARD}" PARENT_SCOPE) - ELSEIF(KOKKOS_CXX_STANDARD STREQUAL 11 AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.0) - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=${_ext}c++11" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++11" PARENT_SCOPE) - ENDIF() - ELSE() - MESSAGE(FATAL_ERROR "Intel compiler version too low - need 13.0 for C++11 and 15.0 for C++14") - ENDIF() - + SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) ENDFUNCTION() + diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index d7d32f661c..53aaf7dccf 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -35,6 +35,7 @@ KOKKOS_ARCH_OPTION(ARMV80 HOST "ARMv8.0 Compatible CPU") KOKKOS_ARCH_OPTION(ARMV81 HOST "ARMv8.1 Compatible CPU") KOKKOS_ARCH_OPTION(ARMV8_THUNDERX HOST "ARMv8 Cavium ThunderX CPU") KOKKOS_ARCH_OPTION(ARMV8_THUNDERX2 HOST "ARMv8 Cavium ThunderX2 CPU") +KOKKOS_ARCH_OPTION(A64FX HOST "ARMv8.2 with SVE Support") KOKKOS_ARCH_OPTION(WSM HOST "Intel Westmere CPU") KOKKOS_ARCH_OPTION(SNB HOST "Intel Sandy/Ivy Bridge CPUs") KOKKOS_ARCH_OPTION(HSW HOST "Intel Haswell CPUs") @@ -63,6 +64,7 @@ KOKKOS_ARCH_OPTION(ZEN HOST "AMD Zen architecture") KOKKOS_ARCH_OPTION(ZEN2 HOST "AMD Zen2 architecture") KOKKOS_ARCH_OPTION(VEGA900 GPU "AMD GPU MI25 GFX900") KOKKOS_ARCH_OPTION(VEGA906 GPU "AMD GPU MI50/MI60 GFX906") +KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU MI100 GFX908") KOKKOS_ARCH_OPTION(INTEL_GEN GPU "Intel GPUs Gen9+") @@ -72,6 +74,11 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS) "-Wall" "-Wunused-parameter" "-Wshadow" "-pedantic" "-Wsign-compare" "-Wtype-limits" "-Wuninitialized") + # OpenMPTarget compilers give erroneous warnings about sign comparison in loops + IF(KOKKOS_ENABLE_OPENMPTARGET) + LIST(REMOVE_ITEM COMMON_WARNINGS "-Wsign-compare") + ENDIF() + SET(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" ${COMMON_WARNINGS}) @@ -106,6 +113,12 @@ ENDIF() IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) SET(CUDA_ARCH_FLAG "--cuda-gpu-arch") GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -x cuda) + # Kokkos_CUDA_DIR has priority over CUDAToolkit_BIN_DIR + IF (Kokkos_CUDA_DIR) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) + ELSEIF(CUDAToolkit_BIN_DIR) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..)
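Since Kokkos_CUDA_DIR now takes priority over a CUDAToolkit located by CMake whenever Clang compiles the CUDA backend, pinning the toolkit is a single cache entry; a hedged configure-time sketch, with a hypothetical install path:

    # Set before Kokkos is configured: Clang is then driven with
    # --cuda-path=/opt/cuda-11.0 even if find_package(CUDAToolkit)
    # reported a different installation in CUDAToolkit_BIN_DIR.
    set(Kokkos_CUDA_DIR "/opt/cuda-11.0" CACHE PATH "CUDA toolkit for Clang")
    set(Kokkos_ENABLE_CUDA ON CACHE BOOL "Enable the CUDA backend")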
+ ENDIF() IF (KOKKOS_ENABLE_CUDA) SET(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND ON CACHE BOOL "enable CUDA Clang workarounds" FORCE) ENDIF() @@ -167,6 +180,12 @@ IF (KOKKOS_ARCH_ARMV8_THUNDERX2) ) ENDIF() +IF (KOKKOS_ARCH_A64FX) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -march=armv8.2-a+sve + ) +ENDIF() + IF (KOKKOS_ARCH_ZEN) COMPILER_SPECIFIC_FLAGS( Intel -mavx2 @@ -327,6 +346,16 @@ IF (Kokkos_ENABLE_HIP) ENDIF() +IF (Kokkos_ENABLE_SYCL) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl + ) + COMPILER_SPECIFIC_OPTIONS( + DEFAULT -fsycl-unnamed-lambda + ) +ENDIF() + + SET(CUDA_ARCH_ALREADY_SPECIFIED "") FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) IF(KOKKOS_ARCH_${ARCH}) @@ -392,6 +421,7 @@ ENDFUNCTION() #to the corresponding flag name if ON CHECK_AMDGPU_ARCH(VEGA900 gfx900) # Radeon Instinct MI25 CHECK_AMDGPU_ARCH(VEGA906 gfx906) # Radeon Instinct MI50 and MI60 +CHECK_AMDGPU_ARCH(VEGA908 gfx908) IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED) MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. " @@ -477,35 +507,53 @@ ENDIF() #CMake verbose is kind of pointless #Let's just always print things -MESSAGE(STATUS "Execution Spaces:") +MESSAGE(STATUS "Built-in Execution Spaces:") -FOREACH (_BACKEND CUDA OPENMPTARGET HIP) - IF(KOKKOS_ENABLE_${_BACKEND}) +FOREACH (_BACKEND Cuda OpenMPTarget HIP SYCL) + STRING(TOUPPER ${_BACKEND} UC_BACKEND) + IF(KOKKOS_ENABLE_${UC_BACKEND}) IF(_DEVICE_PARALLEL) MESSAGE(FATAL_ERROR "Multiple device parallel execution spaces are not allowed! " "Trying to enable execution space ${_BACKEND}, " "but execution space ${_DEVICE_PARALLEL} is already enabled. " "Remove the CMakeCache.txt file and re-configure.") ENDIF() - SET(_DEVICE_PARALLEL ${_BACKEND}) + IF (${_BACKEND} STREQUAL "Cuda") + IF(KOKKOS_ENABLE_CUDA_UVM) + SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}UVMSpace") + ELSE() + SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") + ENDIF() + SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + ELSE() + SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::Experimental::${_BACKEND}Space") + SET(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") + ENDIF() ENDIF() ENDFOREACH() IF(NOT _DEVICE_PARALLEL) - SET(_DEVICE_PARALLEL "NONE") + SET(_DEVICE_PARALLEL "NoTypeDefined") + SET(_DEFAULT_DEVICE_MEMSPACE "NoTypeDefined") ENDIF() MESSAGE(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") -UNSET(_DEVICE_PARALLEL) +IF(KOKKOS_ENABLE_PTHREAD) + SET(KOKKOS_ENABLE_THREADS ON) +ENDIF() - -FOREACH (_BACKEND OPENMP PTHREAD HPX) - IF(KOKKOS_ENABLE_${_BACKEND}) +FOREACH (_BACKEND OpenMP Threads HPX) + STRING(TOUPPER ${_BACKEND} UC_BACKEND) + IF(KOKKOS_ENABLE_${UC_BACKEND}) IF(_HOST_PARALLEL) MESSAGE(FATAL_ERROR "Multiple host parallel execution spaces are not allowed! " "Trying to enable execution space ${_BACKEND}, " "but execution space ${_HOST_PARALLEL} is already enabled. 
" "Remove the CMakeCache.txt file and re-configure.") ENDIF() - SET(_HOST_PARALLEL ${_BACKEND}) + IF (${_BACKEND} STREQUAL "HPX") + SET(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") + ELSE() + SET(_HOST_PARALLEL "Kokkos::${_BACKEND}") + ENDIF() ENDIF() ENDFOREACH() @@ -515,14 +563,11 @@ IF(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) "and Kokkos_ENABLE_SERIAL=OFF.") ENDIF() -IF(NOT _HOST_PARALLEL) - SET(_HOST_PARALLEL "NONE") -ENDIF() +IF(_HOST_PARALLEL) MESSAGE(STATUS " Host Parallel: ${_HOST_PARALLEL}") -UNSET(_HOST_PARALLEL) - -IF(KOKKOS_ENABLE_PTHREAD) - SET(KOKKOS_ENABLE_THREADS ON) +ELSE() + SET(_HOST_PARALLEL "NoTypeDefined") + MESSAGE(STATUS " Host Parallel: NoTypeDefined") ENDIF() IF(KOKKOS_ENABLE_SERIAL) diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index 4a77a94e07..e6600161f9 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -4,24 +4,42 @@ SET(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) SET(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) SET(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) -IF(Kokkos_ENABLE_CUDA) +MACRO(kokkos_internal_have_compiler_nvcc) # Check if the compiler is nvcc (which really means nvcc_wrapper). - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version + EXECUTE_PROCESS(COMMAND ${ARGN} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) - - - STRING(REGEX REPLACE "^ +" "" - INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) SET(INTERNAL_HAVE_COMPILER_NVCC true) ELSE() SET(INTERNAL_HAVE_COMPILER_NVCC false) ENDIF() +ENDMACRO() + +IF(Kokkos_ENABLE_CUDA) + # find kokkos_launch_compiler + FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin) + + # check if compiler was set to nvcc_wrapper + kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER}) + # if launcher was found and nvcc_wrapper was not specified as + # compiler, set to use launcher. 
Will ensure CMAKE_CXX_COMPILER + # is replaced by nvcc_wrapper + IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + # the first argument to launcher is always the C++ compiler defined by cmake + # if the second argument matches the C++ compiler, it forwards the rest of the + # args to nvcc_wrapper + kokkos_internal_have_compiler_nvcc( + ${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) + SET(INTERNAL_USE_COMPILER_LAUNCHER true) + ENDIF() ENDIF() IF(INTERNAL_HAVE_COMPILER_NVCC) @@ -36,6 +54,35 @@ IF(INTERNAL_HAVE_COMPILER_NVCC) STRING(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") + IF(INTERNAL_USE_COMPILER_LAUNCHER) + IF(Kokkos_LAUNCH_COMPILER_INFO) + GET_FILENAME_COMPONENT(BASE_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME) + # does not have STATUS intentionally + MESSAGE("") + MESSAGE("Kokkos_LAUNCH_COMPILER_INFO (${Kokkos_COMPILE_LAUNCHER}):") + MESSAGE(" - Kokkos + CUDA backend requires the C++ files to be compiled as CUDA code.") + MESSAGE(" - kokkos_launch_compiler permits CMAKE_CXX_COMPILER to be set to a traditional C++ compiler when Kokkos_ENABLE_CUDA=ON") + MESSAGE(" by prefixing all the compile and link commands with the path to the script + CMAKE_CXX_COMPILER (${CMAKE_CXX_COMPILER}).") + MESSAGE(" - If any of the compile or link commands have CMAKE_CXX_COMPILER as the first argument, it replaces CMAKE_CXX_COMPILER with nvcc_wrapper.") + MESSAGE(" - If the compile or link command is not CMAKE_CXX_COMPILER, it just executes the command.") + MESSAGE(" - If using ccache, set CMAKE_CXX_COMPILER to nvcc_wrapper explicitly.") + MESSAGE(" - kokkos_compiler_launcher is available to downstream projects as well.") + MESSAGE(" - If CMAKE_CXX_COMPILER=nvcc_wrapper, all legacy behavior will be preserved during 'find_package(Kokkos)'") + MESSAGE(" - If CMAKE_CXX_COMPILER is not nvcc_wrapper, 'find_package(Kokkos)' will apply 'kokkos_compilation(GLOBAL)' unless separable compilation is enabled") + MESSAGE(" - This can be disabled via '-DKokkos_LAUNCH_COMPILER=OFF'") + MESSAGE(" - Use 'find_package(Kokkos COMPONENTS separable_compilation)' to enable separable compilation") + MESSAGE(" - Separable compilation allows you to control the scope of where the compiler transformation behavior (${BASE_COMPILER_NAME} -> nvcc_wrapper) is applied") + MESSAGE(" - The compiler transformation can be applied on a per-project, per-directory, per-target, and/or per-source-file basis") + MESSAGE(" - 'kokkos_compilation(PROJECT)' will apply the compiler transformation to all targets in a project/subproject") + MESSAGE(" - 'kokkos_compilation(TARGET [...])' will apply the compiler transformation to the specified target(s)") + MESSAGE(" - 'kokkos_compilation(SOURCE [...])' will apply the compiler transformation to the specified source file(s)") + MESSAGE(" - 'kokkos_compilation(DIRECTORY [...])' will apply the compiler transformation to the specified directories") + MESSAGE("") + ELSE() + MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled... 
Set Kokkos_LAUNCH_COMPILER_INFO=ON for more info.") + ENDIF() + kokkos_compilation(GLOBAL) + ENDIF() ENDIF() IF(Kokkos_ENABLE_HIP) @@ -90,38 +137,49 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) ENDIF() ENDIF() +IF(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) + # Set the Fujitsu compiler version, which is not detected by CMake + EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version + OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + + STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" + TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) +ENDIF() + # Enforce the minimum compilers supported by Kokkos. SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos. Required compiler versions:") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang 3.5.2 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 4.8.4 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 15.0.2 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 9.0.69 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 3.5.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n PGI 17.1 or higher\n") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang 4.0.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 5.3.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 17.0.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 9.2.88 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 3.8.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n PGI 17.4 or higher\n") IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.5.2) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.8.4) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 5.3.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 15.0.2) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.0.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 9.0.69) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 9.2.88) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.5.0) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.1) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.4) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_corner_cases.cmake b/lib/kokkos/cmake/kokkos_corner_cases.cmake index a792590bac..3962c4b16e 100644 --- a/lib/kokkos/cmake/kokkos_corner_cases.cmake +++ b/lib/kokkos/cmake/kokkos_corner_cases.cmake @@ -1,4 +1,4 @@ -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_ENABLE_OPENMP AND NOT KOKKOS_CLANG_IS_CRAY AND NOT "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") +IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_ENABLE_OPENMP AND NOT KOKKOS_CLANG_IS_CRAY
AND NOT KOKKOS_COMPILER_CLANG_MSVC) # The clang "version" doesn't actually tell you what runtimes and tools # were built into Clang. We should therefore make sure that libomp # was actually built into Clang. Otherwise the user will get nonsensical diff --git a/lib/kokkos/cmake/kokkos_enable_devices.cmake b/lib/kokkos/cmake/kokkos_enable_devices.cmake index 7d1c375ae6..41ee10a8a0 100644 --- a/lib/kokkos/cmake/kokkos_enable_devices.cmake +++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake @@ -25,6 +25,18 @@ IF (KOKKOS_ENABLE_PTHREAD) SET(KOKKOS_ENABLE_THREADS ON) ENDIF() +# detect clang++ / cl / clang-cl clashes +IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + # this specific test requires CMake >= 3.15 + IF ("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") + # use pure clang++ instead of clang-cl + SET(KOKKOS_COMPILER_CLANG_MSVC OFF) + ELSE() + # it defaults to clang-cl + SET(KOKKOS_COMPILER_CLANG_MSVC ON) + ENDIF() +ENDIF() + IF(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) SET(OMP_DEFAULT ON) ELSE() @@ -39,13 +51,16 @@ IF(KOKKOS_ENABLE_OPENMP) IF(KOKKOS_CLANG_IS_INTEL) SET(ClangOpenMPFlag -fiopenmp) ENDIF() - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") - #expression /openmp yields error, so add a specific Clang flag - COMPILER_SPECIFIC_OPTIONS(Clang /clang:-fopenmp) - #link omp library from LLVM lib dir + IF(KOKKOS_COMPILER_CLANG_MSVC) + #for clang-cl expression /openmp yields an error, so directly add the specific Clang flag + SET(ClangOpenMPFlag /clang:-fopenmp=libomp) + ENDIF() + IF(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL Clang) + #link omp library from LLVM lib dir, no matter if it is clang-cl or clang++ get_filename_component(LLVM_BIN_DIR ${CMAKE_CXX_COMPILER_AR} DIRECTORY) COMPILER_SPECIFIC_LIBS(Clang "${LLVM_BIN_DIR}/../lib/libomp.lib") - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + ENDIF() + IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Clang -Xcompiler ${ClangOpenMPFlag} @@ -71,7 +86,7 @@ ENDIF() KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") IF (KOKKOS_ENABLE_OPENMPTARGET) -SET(ClangOpenMPFlag -fopenmp=libomp) + SET(ClangOpenMPFlag -fopenmp=libomp) IF(KOKKOS_CLANG_IS_CRAY) SET(ClangOpenMPFlag -fopenmp) ENDIF() @@ -105,9 +120,11 @@ KOKKOS_DEVICE_OPTION(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend" IF (KOKKOS_ENABLE_CUDA) GLOBAL_SET(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") - IF(WIN32) + IF(WIN32 AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) GLOBAL_APPEND(KOKKOS_COMPILE_OPTIONS -x cu) ENDIF() +## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros + LIST(APPEND DEVICE_SETUP_LIST Cuda) ENDIF() # We want this to default to OFF for cache reasons, but if no @@ -128,3 +145,10 @@ KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial back KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") KOKKOS_DEVICE_OPTION(HIP OFF DEVICE "Whether to build HIP backend") + +## HIP has extra setup requirements, turn on Kokkos_Setup_HIP.hpp in macros +IF (KOKKOS_ENABLE_HIP) + LIST(APPEND DEVICE_SETUP_LIST HIP) +ENDIF() + +KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend") diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake index 7ce3ed501e..2b17d648b4 100644 --- a/lib/kokkos/cmake/kokkos_functions.cmake +++ 
b/lib/kokkos/cmake/kokkos_functions.cmake @@ -154,13 +154,13 @@ MACRO(kokkos_export_imported_tpl NAME) KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) IF(TPL_LIBRARY) - KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION ${TPL_LIBRARY}") + KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") ENDIF() ENDIF() GET_TARGET_PROPERTY(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) IF(TPL_INCLUDES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES ${TPL_INCLUDES}") + KOKKOS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") ENDIF() GET_TARGET_PROPERTY(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) @@ -178,7 +178,7 @@ MACRO(kokkos_export_imported_tpl NAME) GET_TARGET_PROPERTY(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) IF(TPL_LINK_LIBRARIES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_LIBRARIES ${TPL_LINK_LIBRARIES}") + KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") ENDIF() KOKKOS_APPEND_CONFIG_LINE(")") KOKKOS_APPEND_CONFIG_LINE("ENDIF()") @@ -770,7 +770,7 @@ FUNCTION(kokkos_link_tpl TARGET) ENDFUNCTION() FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIP) + SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIP Fujitsu) CMAKE_PARSE_ARGUMENTS( PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" @@ -844,7 +844,6 @@ ENDFUNCTION(COMPILER_SPECIFIC_DEFS) FUNCTION(COMPILER_SPECIFIC_LIBS) COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_LIBRARIES) ENDFUNCTION(COMPILER_SPECIFIC_LIBS) - # Given a list of the form # key1;value1;key2;value2,... # Create a list of all keys in a variable named ${KEY_LIST_NAME} @@ -877,3 +876,114 @@ FUNCTION(KOKKOS_CHECK_DEPRECATED_OPTIONS) ENDIF() ENDFOREACH() ENDFUNCTION() + +# this function checks whether the current CXX compiler supports building CUDA +FUNCTION(kokkos_cxx_compiler_cuda_test _VAR) + # don't run this test every time + IF(DEFINED ${_VAR}) + RETURN() + ENDIF() + + FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp +" +#include <cuda_runtime.h> +#include <cstdlib> + +__global__ +void kernel(int sz, double* data) +{ + auto _beg = blockIdx.x * blockDim.x + threadIdx.x; + for(int i = _beg; i < sz; ++i) + data[i] += static_cast<double>(i); +} + +int main() +{ + double* data = nullptr; + int blocks = 64; + int grids = 64; + auto ret = cudaMalloc(&data, blocks * grids * sizeof(double)); + if(ret != cudaSuccess) + return EXIT_FAILURE; + kernel<<<grids, blocks>>>(blocks * grids, data); + cudaDeviceSynchronize(); + return EXIT_SUCCESS; +} +") + + TRY_COMPILE(_RET + ${PROJECT_BINARY_DIR}/compile_tests + SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) + + SET(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") +ENDFUNCTION() + +# this function is provided to easily select which files use nvcc_wrapper: +# +# GLOBAL --> all files +# TARGET --> all files in a target +# SOURCE --> specific source files +# DIRECTORY --> all files in directory +# PROJECT --> all files/targets in a project/subproject +# +FUNCTION(kokkos_compilation) + # check whether the compiler already supports building CUDA + KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA) + # if CUDA compile test has already been performed, just return + IF(Kokkos_CXX_COMPILER_COMPILES_CUDA) + RETURN() + ENDIF() + + CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) + + # find
kokkos_launch_compiler + FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin) + + IF(NOT Kokkos_COMPILE_LAUNCHER) + MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") + ENDIF() + + IF(COMP_GLOBAL) + # if global, don't bother setting others + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + ELSE() + FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. kokkos_compilation(PROJECT) after project(...) + IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + UNSET(COMP_${_TYPE}) + ENDIF() + # set the properties if defined + IF(COMP_${_TYPE}) + # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + ENDIF() + ENDFOREACH() + ENDIF() +ENDFUNCTION() +## KOKKOS_CONFIG_HEADER - parse the data list which is a list of backend names +## and create output config header file...used for +## creating dynamic include files based on enabled backends +## +## SRC_FILE is input file +## TARGET_FILE output file +## HEADER_GUARD TEXT used with include header guard +## HEADER_PREFIX prefix used with include (i.e. fwd, decl, setup) +## DATA_LIST list of backends to include in generated file +FUNCTION(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) + SET(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") + CONFIGURE_FILE(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) + FOREACH( BACKEND_NAME ${DATA_LIST} ) + SET(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> +\@INCLUDE_NEXT_FILE\@") + CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) + ENDFOREACH() + SET(INCLUDE_NEXT_FILE "" ) + CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) +ENDFUNCTION() diff --git a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake index cf14948f43..015873ebd6 100644 --- a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake @@ -1,19 +1,17 @@ # From CMake 3.10 documentation #This can run at any time -KOKKOS_OPTION(CXX_STANDARD "" STRING "The C++ standard for Kokkos to use: 11, 14, 17, or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 11") +KOKKOS_OPTION(CXX_STANDARD "" STRING "The C++ standard for Kokkos to use: 14, 17, or 20. If empty, this will default to CMAKE_CXX_STANDARD.
If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 14") # Set CXX standard flags -SET(KOKKOS_ENABLE_CXX11 OFF) SET(KOKKOS_ENABLE_CXX14 OFF) SET(KOKKOS_ENABLE_CXX17 OFF) SET(KOKKOS_ENABLE_CXX20 OFF) IF (KOKKOS_CXX_STANDARD) IF (${KOKKOS_CXX_STANDARD} STREQUAL "c++98") - MESSAGE(FATAL_ERROR "Kokkos no longer supports C++98 - minimum C++11") + MESSAGE(FATAL_ERROR "Kokkos no longer supports C++98 - minimum C++14") ELSEIF (${KOKKOS_CXX_STANDARD} STREQUAL "c++11") - MESSAGE(WARNING "Deprecated Kokkos C++ standard set as 'c++11'. Use '11' instead.") - SET(KOKKOS_CXX_STANDARD "11") + MESSAGE(FATAL_ERROR "Kokkos no longer supports C++11 - minimum C++14") ELSEIF(${KOKKOS_CXX_STANDARD} STREQUAL "c++14") MESSAGE(WARNING "Deprecated Kokkos C++ standard set as 'c++14'. Use '14' instead.") SET(KOKKOS_CXX_STANDARD "14") @@ -33,8 +31,8 @@ IF (KOKKOS_CXX_STANDARD) ENDIF() IF (NOT KOKKOS_CXX_STANDARD AND NOT CMAKE_CXX_STANDARD) - MESSAGE(STATUS "Setting default Kokkos CXX standard to 11") - SET(KOKKOS_CXX_STANDARD "11") + MESSAGE(STATUS "Setting default Kokkos CXX standard to 14") + SET(KOKKOS_CXX_STANDARD "14") ELSEIF(NOT KOKKOS_CXX_STANDARD) MESSAGE(STATUS "Setting default Kokkos CXX standard to ${CMAKE_CXX_STANDARD}") SET(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) diff --git a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake index cb857bc11e..1d7da922eb 100644 --- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -29,7 +29,7 @@ FUNCTION(kokkos_set_cxx_standard_feature standard) ELSEIF(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) MESSAGE(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang)) - SET(SUPPORTED_NVCC_FLAGS "-std=c++11;-std=c++14;-std=c++17") + SET(SUPPORTED_NVCC_FLAGS "-std=c++14;-std=c++17") IF (NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) MESSAGE(FATAL_ERROR "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. Using a more recent host compiler or a more recent CMake version might help.") ENDIF() @@ -42,13 +42,16 @@ FUNCTION(kokkos_set_cxx_standard_feature standard) ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") + ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu")) + MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") + GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") ELSE() #nope, we can't do anything here - MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferrably including your CMake command.") + MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. 
Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command.") GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") ENDIF() - IF(NOT WIN32) + IF((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported") ENDIF() @@ -65,11 +68,7 @@ IF (KOKKOS_CXX_STANDARD AND CMAKE_CXX_STANDARD) ENDIF() -IF (KOKKOS_CXX_STANDARD STREQUAL "11" ) - kokkos_set_cxx_standard_feature(11) - SET(KOKKOS_ENABLE_CXX11 ON) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "11") -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "14") +IF(KOKKOS_CXX_STANDARD STREQUAL "14") kokkos_set_cxx_standard_feature(14) SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Y") SET(KOKKOS_ENABLE_CXX14 ON) @@ -81,21 +80,21 @@ ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "20") kokkos_set_cxx_standard_feature(20) SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") SET(KOKKOS_ENABLE_CXX20 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "98") - MESSAGE(FATAL_ERROR "Kokkos requires C++11 or newer!") +ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "98" OR KOKKOS_CXX_STANDARD STREQUAL "11") + MESSAGE(FATAL_ERROR "Kokkos requires C++14 or newer!") ELSE() - MESSAGE(FATAL_ERROR "Unknown C++ standard ${KOKKOS_CXX_STANDARD} - must be 11, 14, 17, or 20") + MESSAGE(FATAL_ERROR "Unknown C++ standard ${KOKKOS_CXX_STANDARD} - must be 14, 17, or 20") ENDIF() # Enforce that extensions are turned off for nvcc_wrapper. # For compiling CUDA code using nvcc_wrapper, we will use the host compiler's -# flags for turning on C++11. Since for compiler ID and versioning purposes +# flags for turning on C++14. Since for compiler ID and versioning purposes # CMake recognizes the host compiler when calling nvcc_wrapper, this just -# works. Both NVCC and nvcc_wrapper only recognize '-std=c++11' which means +# works. Both NVCC and nvcc_wrapper only recognize '-std=c++14' which means # that we can only use host compilers for CUDA builds that use those flags. -# It also means that extensions (gnu++11) can't be turned on for CUDA builds. +# It also means that extensions (gnu++14) can't be turned on for CUDA builds. IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) @@ -117,7 +116,7 @@ IF(KOKKOS_ENABLE_CUDA) MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") ENDIF() ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") + MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. 
The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake index 9d9be87834..b58d3696ea 100644 --- a/lib/kokkos/cmake/kokkos_tpls.cmake +++ b/lib/kokkos/cmake/kokkos_tpls.cmake @@ -76,3 +76,7 @@ STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable UNSET(KOKKOS_TPL_EXPORTS CACHE) SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) +IF (KOKKOS_ENABLE_MEMKIND) + SET(KOKKOS_ENABLE_HBWSPACE) + LIST(APPEND KOKKOS_MEMSPACE_LIST HBWSpace) +ENDIF() diff --git a/lib/kokkos/cmake/kokkos_tribits.cmake b/lib/kokkos/cmake/kokkos_tribits.cmake index 4bd186dac7..059fb192f0 100644 --- a/lib/kokkos/cmake/kokkos_tribits.cmake +++ b/lib/kokkos/cmake/kokkos_tribits.cmake @@ -6,6 +6,12 @@ INCLUDE(GNUInstallDirs) MESSAGE(STATUS "The project name is: ${PROJECT_NAME}") +FUNCTION(VERIFY_EMPTY CONTEXT) + if(${ARGN}) + MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") + endif() +ENDFUNCTION() + #Leave this here for now - but only do for tribits #This breaks the standalone CMake IF (KOKKOS_HAS_TRILINOS) @@ -135,28 +141,37 @@ FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME) ENDFUNCTION() FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) +CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "SOURCES;CATEGORIES;ARGS" + ${ARGN}) +VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) + IF (KOKKOS_HAS_TRILINOS) + IF(DEFINED PARSE_ARGS) + STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") + ENDIF() TRIBITS_ADD_EXECUTABLE_AND_TEST( ${ROOT_NAME} + SOURCES ${PARSE_SOURCES} TESTONLYLIBS kokkos_gtest - ${ARGN} NUM_MPI_PROCS 1 COMM serial mpi + ARGS ${PARSE_ARGS} + CATEGORIES ${PARSE_CATEGORIES} + SOURCES ${PARSE_SOURCES} FAIL_REGULAR_EXPRESSION " FAILED " + ARGS ${PARSE_ARGS} ) ELSE() - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES" - ${ARGN}) - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} SOURCES ${PARSE_SOURCES} ) KOKKOS_ADD_TEST(NAME ${ROOT_NAME} EXE ${ROOT_NAME} FAIL_REGULAR_EXPRESSION " FAILED " + ARGS ${PARSE_ARGS} ) ENDIF() ENDFUNCTION() @@ -219,6 +234,7 @@ MACRO(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) ${PARSE_UNPARSED_ARGUMENTS} TESTONLYLIBS kokkos_gtest ) + SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) ENDMACRO() MACRO(KOKKOS_PACKAGE_POSTPROCESS) @@ -227,6 +243,79 @@ MACRO(KOKKOS_PACKAGE_POSTPROCESS) endif() ENDMACRO() +## KOKKOS_CONFIGURE_CORE Configure/Generate header files for core content based +## on enabled backends. 
+## KOKKOS_FWD is the forward declare set +## KOKKOS_SETUP is included in Kokkos_Macros.hpp and include prefix includes/defines +## KOKKOS_DECLARE is the declaration set +## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp +MACRO(KOKKOS_CONFIGURE_CORE) + SET(FWD_BACKEND_LIST) + FOREACH(MEMSPACE ${KOKKOS_MEMSPACE_LIST}) + LIST(APPEND FWD_BACKEND_LIST ${MEMSPACE}) + ENDFOREACH() + FOREACH(BACKEND_ ${KOKKOS_ENABLED_DEVICES}) + IF( ${BACKEND_} STREQUAL "PTHREAD") + LIST(APPEND FWD_BACKEND_LIST THREADS) + ELSE() + LIST(APPEND FWD_BACKEND_LIST ${BACKEND_}) + ENDIF() + ENDFOREACH() + MESSAGE(STATUS "Kokkos Devices: ${KOKKOS_ENABLED_DEVICES}, Kokkos Backends: ${FWD_BACKEND_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${FWD_BACKEND_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${FWD_BACKEND_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_PostInclude.hpp "KOKKOS_POST_INCLUDE" "Kokkos_Post_Include" "${KOKKOS_BACKEND_POST_INCLUDE_LIST}") + SET(_DEFAULT_HOST_MEMSPACE "::Kokkos::HostSpace") + KOKKOS_OPTION(DEFAULT_DEVICE_MEMORY_SPACE "" STRING "Override default device memory space") + KOKKOS_OPTION(DEFAULT_HOST_MEMORY_SPACE "" STRING "Override default host memory space") + KOKKOS_OPTION(DEFAULT_DEVICE_EXECUTION_SPACE "" STRING "Override default device execution space") + KOKKOS_OPTION(DEFAULT_HOST_PARALLEL_EXECUTION_SPACE "" STRING "Override default host parallel execution space") + IF (NOT Kokkos_DEFAULT_DEVICE_EXECUTION_SPACE STREQUAL "") + SET(_DEVICE_PARALLEL ${Kokkos_DEFAULT_DEVICE_EXECUTION_SPACE}) + MESSAGE(STATUS "Override default device execution space: ${_DEVICE_PARALLEL}") + SET(KOKKOS_DEVICE_SPACE_ACTIVE ON) + ELSE() + IF (_DEVICE_PARALLEL STREQUAL "NoTypeDefined") + SET(KOKKOS_DEVICE_SPACE_ACTIVE OFF) + ELSE() + SET(KOKKOS_DEVICE_SPACE_ACTIVE ON) + ENDIF() + ENDIF() + IF (NOT Kokkos_DEFAULT_HOST_PARALLEL_EXECUTION_SPACE STREQUAL "") + SET(_HOST_PARALLEL ${Kokkos_DEFAULT_HOST_PARALLEL_EXECUTION_SPACE}) + MESSAGE(STATUS "Override default host parallel execution space: ${_HOST_PARALLEL}") + SET(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE ON) + ELSE() + IF (_HOST_PARALLEL STREQUAL "NoTypeDefined") + SET(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE OFF) + ELSE() + SET(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE ON) + ENDIF() + ENDIF() + #We are ready to configure the header + CONFIGURE_FILE(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) +ENDMACRO() + +## KOKKOS_INSTALL_ADDITIONAL_FILES - instruct cmake to install files in target destination. +## Includes generated header files, scripts such as nvcc_wrapper and hpcbind, +## as well as other files provided through plugins. 
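Tying the header-generation pieces together before the install macro below: each KOKKOS_CONFIG_HEADER pass substitutes one more include into the @INCLUDE_NEXT_FILE@ slot of the template, so the generated headers reduce to plain include chains. A hedged sketch for an assumed Serial+OpenMP build (the exact template layout lives in KokkosCore_Config_HeaderSet.in):

    # With FWD_BACKEND_LIST = "SERIAL;OPENMP", this call from
    # KOKKOS_CONFIGURE_CORE above
    KOKKOS_CONFIG_HEADER(
      KokkosCore_Config_HeaderSet.in       # SRC_FILE template
      KokkosCore_Config_FwdBackend.hpp     # TARGET_FILE to generate
      "KOKKOS_FWD"                         # HEADER_GUARD -> KOKKOS_FWD_HPP_ guard tag
      "fwd/Kokkos_Fwd"                     # HEADER_PREFIX
      "${FWD_BACKEND_LIST}")               # DATA_LIST of backends
    # leaves, inside the guard, essentially:
    #   #include <fwd/Kokkos_Fwd_SERIAL.hpp>
    #   #include <fwd/Kokkos_Fwd_OPENMP.hpp>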
+MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) + # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to nvcc_wrapper + INSTALL(PROGRAMS + "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" + "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" + "${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler" + DESTINATION ${CMAKE_INSTALL_BINDIR}) + INSTALL(FILES + "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" + "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" + "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" + "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" + "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp" + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +ENDMACRO() + FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) CMAKE_PARSE_ARGUMENTS(PARSE "PLAIN_STYLE" diff --git a/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake b/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake index a59868b73b..1ae4f19dd4 100644 --- a/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. 
Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake b/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake index a4c55e1d7b..467635083f 100644 --- a/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. 
Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/cmake/tpls/FindTPLPthread.cmake b/lib/kokkos/cmake/tpls/FindTPLPthread.cmake index 4dc1a87e18..c78630b7f1 100644 --- a/lib/kokkos/cmake/tpls/FindTPLPthread.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLPthread.cmake @@ -1,14 +1,16 @@ # @HEADER # ************************************************************************ # -# Trilinos: An Object-Oriented Solver Framework -# Copyright (2001) Sandia Corporation +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). # +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. # -# Copyright (2001) Sandia Corporation. Under the terms of Contract -# DE-AC04-94AL85000, there is a non-exclusive license for use of this -# work by or on behalf of the U.S. Government. Export of this program -# may require a license from the United States Government. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. @@ -21,10 +23,10 @@ # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR @@ -33,22 +35,7 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# NOTICE: The United States Government is granted for itself and others -# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide -# license in this data to reproduce, prepare derivative works, and -# perform publicly and display publicly. 
Beginning five (5) years from -# July 25, 2001, the United States Government is granted for itself and -# others acting on its behalf a paid-up, nonexclusive, irrevocable -# worldwide license in this data to reproduce, prepare derivative works, -# distribute copies to the public, perform publicly and display -# publicly, and to permit others to do so. -# -# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT -# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES -# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR -# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY -# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS -# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) # # ************************************************************************ # @HEADER diff --git a/lib/kokkos/containers/performance_tests/CMakeLists.txt b/lib/kokkos/containers/performance_tests/CMakeLists.txt index 1011cb8fd1..43c66c24fd 100644 --- a/lib/kokkos/containers/performance_tests/CMakeLists.txt +++ b/lib/kokkos/containers/performance_tests/CMakeLists.txt @@ -3,44 +3,26 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -IF(Kokkos_ENABLE_CUDA) - SET(SOURCES - TestMain.cpp - TestCuda.cpp - ) +foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) + # Because there is always an exception to the rule + if(Tag STREQUAL "Threads") + set(DEVICE "PTHREAD") + else() + string(TOUPPER ${Tag} DEVICE) + endif() + string(TOLOWER ${Tag} dir) - KOKKOS_ADD_EXECUTABLE_AND_TEST( PerformanceTest_Cuda - SOURCES ${SOURCES} - ) -ENDIF() + if(Kokkos_ENABLE_${DEVICE}) + message(STATUS "Sources Test${Tag}.cpp") -IF(Kokkos_ENABLE_PTHREAD) - SET(SOURCES - TestMain.cpp - TestThreads.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( PerformanceTest_Threads - SOURCES ${SOURCES} - ) -ENDIF() - -IF(Kokkos_ENABLE_OPENMP) - SET(SOURCES - TestMain.cpp - TestOpenMP.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( PerformanceTest_OpenMP - SOURCES ${SOURCES} - ) -ENDIF() - -IF(Kokkos_ENABLE_HPX) - SET(SOURCES - TestMain.cpp - TestHPX.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( PerformanceTest_HPX - SOURCES ${SOURCES} - ) -ENDIF() + set(SOURCES + TestMain.cpp + Test${Tag}.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_${Tag} + SOURCES ${SOURCES} + ) + endif() +endforeach() diff --git a/lib/kokkos/containers/performance_tests/Makefile b/lib/kokkos/containers/performance_tests/Makefile index 8ef1dd9938..cbb8490798 100644 --- a/lib/kokkos/containers/performance_tests/Makefile +++ b/lib/kokkos/containers/performance_tests/Makefile @@ -58,8 +58,8 @@ endif KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda -KokkosContainers_PerformanceTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_ROCm +KokkosContainers_PerformanceTest_HIP: $(OBJ_HIP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HIP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_HIP KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) 
$(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Threads @@ -73,8 +73,8 @@ KokkosContainers_PerformanceTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS) test-cuda: KokkosContainers_PerformanceTest_Cuda ./KokkosContainers_PerformanceTest_Cuda -test-rocm: KokkosContainers_PerformanceTest_ROCm - ./KokkosContainers_PerformanceTest_ROCm +test-hip: KokkosContainers_PerformanceTest_HIP + ./KokkosContainers_PerformanceTest_HIP test-threads: KokkosContainers_PerformanceTest_Threads ./KokkosContainers_PerformanceTest_Threads diff --git a/lib/kokkos/containers/performance_tests/TestCuda.cpp b/lib/kokkos/containers/performance_tests/TestCuda.cpp index 697a006c3c..8874590e2a 100644 --- a/lib/kokkos/containers/performance_tests/TestCuda.cpp +++ b/lib/kokkos/containers/performance_tests/TestCuda.cpp @@ -43,7 +43,6 @@ */ #include -#if defined(KOKKOS_ENABLE_CUDA) #include #include @@ -66,23 +65,13 @@ namespace Performance { -class cuda : public ::testing::Test { - protected: - static void SetUpTestCase() { - std::cout << std::setprecision(5) << std::scientific; - Kokkos::InitArguments args(-1, -1, 0); - Kokkos::initialize(args); - } - static void TearDownTestCase() { Kokkos::finalize(); } -}; - -TEST_F(cuda, dynrankview_perf) { +TEST(TEST_CATEGORY, dynrankview_perf) { std::cout << "Cuda" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; test_dynrankview_op_perf(40960); } -TEST_F(cuda, global_2_local) { +TEST(TEST_CATEGORY, global_2_local) { std::cout << "Cuda" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; @@ -90,15 +79,12 @@ TEST_F(cuda, global_2_local) { test_global_to_local_ids(i); } -TEST_F(cuda, unordered_map_performance_near) { +TEST(TEST_CATEGORY, unordered_map_performance_near) { Perf::run_performance_tests("cuda-near"); } -TEST_F(cuda, unordered_map_performance_far) { +TEST(TEST_CATEGORY, unordered_map_performance_far) { Perf::run_performance_tests("cuda-far"); } } // namespace Performance -#else -void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTCUDA_PREVENT_EMPTY_LINK_ERROR() {} -#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */ diff --git a/lib/kokkos/containers/performance_tests/TestROCm.cpp b/lib/kokkos/containers/performance_tests/TestHIP.cpp similarity index 67% rename from lib/kokkos/containers/performance_tests/TestROCm.cpp rename to lib/kokkos/containers/performance_tests/TestHIP.cpp index 55b770b49c..8033c76be6 100644 --- a/lib/kokkos/containers/performance_tests/TestROCm.cpp +++ b/lib/kokkos/containers/performance_tests/TestHIP.cpp @@ -43,7 +43,6 @@ */ #include -#if defined(KOKKOS_ENABLE_ROCM) #include #include @@ -66,46 +65,26 @@ namespace Performance { -class rocm : public ::testing::Test { - protected: - static void SetUpTestCase() { - std::cout << std::setprecision(5) << std::scientific; - Kokkos::HostSpace::execution_space::initialize(); - Kokkos::Experimental::ROCm::initialize( - Kokkos::Experimental::ROCm::SelectDevice(0)); - } - static void TearDownTestCase() { - Kokkos::Experimental::ROCm::finalize(); - Kokkos::HostSpace::execution_space::finalize(); - } -}; -#if 0 -// issue 1089 -TEST_F( rocm, dynrankview_perf ) -{ - std::cout << "ROCm" << std::endl; +TEST(TEST_CATEGORY, dynrankview_perf) { + std::cout << "HIP" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; - test_dynrankview_op_perf( 40960 ); + test_dynrankview_op_perf(40960); } -TEST_F( rocm, global_2_local) 
-{ - std::cout << "ROCm" << std::endl; +TEST(TEST_CATEGORY, global_2_local) { + std::cout << "HIP" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; - for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step) - test_global_to_local_ids(i); + for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; + i *= Performance::id_step) + test_global_to_local_ids(i); } -#endif -TEST_F(rocm, unordered_map_performance_near) { - Perf::run_performance_tests("rocm-near"); +TEST(TEST_CATEGORY, unordered_map_performance_near) { + Perf::run_performance_tests("hip-near"); } -TEST_F(rocm, unordered_map_performance_far) { - Perf::run_performance_tests("rocm-far"); +TEST(TEST_CATEGORY, unordered_map_performance_far) { + Perf::run_performance_tests("hip-far"); } } // namespace Performance -#else -void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTROCM_PREVENT_EMPTY_LINK_ERROR() {} -#endif /* #if defined( KOKKOS_ENABLE_ROCM ) */ diff --git a/lib/kokkos/containers/performance_tests/TestHPX.cpp b/lib/kokkos/containers/performance_tests/TestHPX.cpp index 48be466bfa..f229901dcc 100644 --- a/lib/kokkos/containers/performance_tests/TestHPX.cpp +++ b/lib/kokkos/containers/performance_tests/TestHPX.cpp @@ -43,7 +43,6 @@ */ #include -#if defined(KOKKOS_ENABLE_HPX) #include @@ -64,25 +63,13 @@ namespace Performance { -class hpx : public ::testing::Test { - protected: - static void SetUpTestCase() { - std::cout << std::setprecision(5) << std::scientific; - - Kokkos::initialize(); - Kokkos::print_configuration(std::cout); - } - - static void TearDownTestCase() { Kokkos::finalize(); } -}; - -TEST_F(hpx, dynrankview_perf) { +TEST(TEST_CATEGORY, dynrankview_perf) { std::cout << "HPX" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; test_dynrankview_op_perf(8192); } -TEST_F(hpx, global_2_local) { +TEST(TEST_CATEGORY, global_2_local) { std::cout << "HPX" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; @@ -90,7 +77,7 @@ TEST_F(hpx, global_2_local) { test_global_to_local_ids(i); } -TEST_F(hpx, unordered_map_performance_near) { +TEST(TEST_CATEGORY, unordered_map_performance_near) { unsigned num_hpx = 4; std::ostringstream base_file_name; base_file_name << "hpx-" << num_hpx << "-near"; @@ -98,7 +85,7 @@ TEST_F(hpx, unordered_map_performance_near) { base_file_name.str()); } -TEST_F(hpx, unordered_map_performance_far) { +TEST(TEST_CATEGORY, unordered_map_performance_far) { unsigned num_hpx = 4; std::ostringstream base_file_name; base_file_name << "hpx-" << num_hpx << "-far"; @@ -106,7 +93,7 @@ TEST_F(hpx, unordered_map_performance_far) { base_file_name.str()); } -TEST_F(hpx, scatter_view) { +TEST(TEST_CATEGORY, scatter_view) { std::cout << "ScatterView data-duplicated test:\n"; Perf::test_scatter_view #include -#include +#include int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); + + int result = RUN_ALL_TESTS(); + Kokkos::finalize(); + return result; } diff --git a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp index a9c8639ed4..f414b0d828 100644 --- a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp +++ b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp @@ -43,7 +43,6 @@ */ #include -#if defined(KOKKOS_ENABLE_OPENMP) #include @@ -64,25 +63,13 @@ 
namespace Performance { -class openmp : public ::testing::Test { - protected: - static void SetUpTestCase() { - std::cout << std::setprecision(5) << std::scientific; - - Kokkos::initialize(); - Kokkos::OpenMP::print_configuration(std::cout); - } - - static void TearDownTestCase() { Kokkos::finalize(); } -}; - -TEST_F(openmp, dynrankview_perf) { +TEST(TEST_CATEGORY, dynrankview_perf) { std::cout << "OpenMP" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; test_dynrankview_op_perf(8192); } -TEST_F(openmp, global_2_local) { +TEST(TEST_CATEGORY, global_2_local) { std::cout << "OpenMP" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; @@ -90,7 +77,7 @@ TEST_F(openmp, global_2_local) { test_global_to_local_ids(i); } -TEST_F(openmp, unordered_map_performance_near) { +TEST(TEST_CATEGORY, unordered_map_performance_near) { unsigned num_openmp = 4; if (Kokkos::hwloc::available()) { num_openmp = Kokkos::hwloc::get_available_numa_count() * @@ -102,7 +89,7 @@ TEST_F(openmp, unordered_map_performance_near) { Perf::run_performance_tests(base_file_name.str()); } -TEST_F(openmp, unordered_map_performance_far) { +TEST(TEST_CATEGORY, unordered_map_performance_far) { unsigned num_openmp = 4; if (Kokkos::hwloc::available()) { num_openmp = Kokkos::hwloc::get_available_numa_count() * @@ -114,7 +101,7 @@ TEST_F(openmp, unordered_map_performance_far) { Perf::run_performance_tests(base_file_name.str()); } -TEST_F(openmp, scatter_view) { +TEST(TEST_CATEGORY, scatter_view) { std::cout << "ScatterView data-duplicated test:\n"; Perf::test_scatter_view -#if defined(KOKKOS_ENABLE_THREADS) #include @@ -65,34 +64,13 @@ namespace Performance { -class threads : public ::testing::Test { - protected: - static void SetUpTestCase() { - std::cout << std::setprecision(5) << std::scientific; - - unsigned num_threads = 4; - - if (Kokkos::hwloc::available()) { - num_threads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } - - std::cout << "Threads: " << num_threads << std::endl; - - Kokkos::initialize(Kokkos::InitArguments(num_threads)); - } - - static void TearDownTestCase() { Kokkos::finalize(); } -}; - -TEST_F(threads, dynrankview_perf) { +TEST(threads, dynrankview_perf) { std::cout << "Threads" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; test_dynrankview_op_perf(8192); } -TEST_F(threads, global_2_local) { +TEST(threads, global_2_local) { std::cout << "Threads" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; @@ -100,7 +78,7 @@ TEST_F(threads, global_2_local) { test_global_to_local_ids(i); } -TEST_F(threads, unordered_map_performance_near) { +TEST(threads, unordered_map_performance_near) { unsigned num_threads = 4; if (Kokkos::hwloc::available()) { num_threads = Kokkos::hwloc::get_available_numa_count() * @@ -112,7 +90,7 @@ TEST_F(threads, unordered_map_performance_near) { Perf::run_performance_tests(base_file_name.str()); } -TEST_F(threads, unordered_map_performance_far) { +TEST(threads, unordered_map_performance_far) { unsigned num_threads = 4; if (Kokkos::hwloc::available()) { num_threads = Kokkos::hwloc::get_available_numa_count() * @@ -125,8 +103,3 @@ TEST_F(threads, unordered_map_performance_far) { } } // namespace Performance - -#else -void 
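Every performance-test translation unit in this series follows the same migration: the per-backend ::testing::Test fixtures, whose SetUpTestCase/TearDownTestCase called Kokkos::initialize and Kokkos::finalize, are deleted; the tests become plain TEST(TEST_CATEGORY, ...) cases; and initialization happens exactly once in the shared TestMain.cpp shown earlier. A minimal sketch of how the pieces compose; the TEST_CATEGORY/TEST_EXECSPACE values are stand-ins for what a per-backend category header supplies:

    #include <gtest/gtest.h>
    #include <Kokkos_Core.hpp>

    // Normally provided by a backend header such as TestSYCL_Category.hpp:
    #define TEST_CATEGORY openmp
    #define TEST_EXECSPACE Kokkos::OpenMP

    TEST(TEST_CATEGORY, example) {
      // Kokkos is already live: main() initializes it once for all suites,
      // replacing the old per-fixture SetUpTestCase/TearDownTestCase pairs.
      Kokkos::View<int*, TEST_EXECSPACE> v("v", 8);
      ASSERT_EQ(v.extent(0), 8u);
    }

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      ::testing::InitGoogleTest(&argc, argv);
      int result = RUN_ALL_TESTS();
      Kokkos::finalize();
      return result;
    }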
KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTTHREADS_PREVENT_EMPTY_LINK_ERROR() { -} -#endif diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp index eedfd5f9ef..ea1d6dde5d 100644 --- a/lib/kokkos/containers/src/Kokkos_Bitset.hpp +++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp @@ -74,7 +74,7 @@ template class Bitset { public: using execution_space = Device; - using size_type = unsigned; + using size_type = unsigned int; enum { BIT_SCAN_REVERSE = 1u }; enum { MOVE_HINT_BACKWARD = 2u }; @@ -309,7 +309,7 @@ template class ConstBitset { public: using execution_space = Device; - using size_type = unsigned; + using size_type = unsigned int; private: enum { block_size = static_cast(sizeof(unsigned) * CHAR_BIT) }; diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index 3fc0371c69..6b6837f82c 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -162,7 +162,7 @@ class DualView : public ViewTraits { /// \brief The type of a const, random-access View host mirror of /// \c t_dev_const_randomread. using t_host_const_randomread_um = - typename t_dev_const_randomread::HostMirror; + typename t_dev_const_randomread_um::HostMirror; //@} //! \name Counters to keep track of changes ("modified" flags) @@ -245,21 +245,6 @@ class DualView : public ViewTraits { h_view(create_mirror_view(d_view)) // without UVM, host View mirrors {} - explicit inline DualView(const ViewAllocateWithoutInitializing& arg_prop, - const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : DualView(Impl::ViewCtorProp( - arg_prop.label, Kokkos::WithoutInitializing), - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, - arg_N7) {} - //! Copy constructor (shallow copy) template DualView(const DualView& src) @@ -457,7 +442,21 @@ class DualView : public ViewTraits { } return dev; } - + static constexpr const int view_header_size = 128; + void impl_report_host_sync() const noexcept { + Kokkos::Tools::syncDualView( + h_view.label(), + reinterpret_cast(reinterpret_cast(h_view.data()) - + view_header_size), + false); + } + void impl_report_device_sync() const noexcept { + Kokkos::Tools::syncDualView( + d_view.label(), + reinterpret_cast(reinterpret_cast(d_view.data()) - + view_header_size), + true); + } /// \brief Update data on device or host only if data in the other /// space has been marked as modified. 
/// @@ -499,6 +498,7 @@ class DualView : public ViewTraits { deep_copy(d_view, h_view); modified_flags(0) = modified_flags(1) = 0; + impl_report_device_sync(); } } if (dev == 0) { // hopefully Device is the same as DualView's host type @@ -515,6 +515,7 @@ class DualView : public ViewTraits { deep_copy(h_view, d_view); modified_flags(0) = modified_flags(1) = 0; + impl_report_host_sync(); } } if (std::is_same { Impl::throw_runtime_exception( "Calling sync on a DualView with a const datatype."); } + impl_report_device_sync(); } if (dev == 0) { // hopefully Device is the same as DualView's host type if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) { Impl::throw_runtime_exception( "Calling sync on a DualView with a const datatype."); } + impl_report_host_sync(); } } @@ -567,6 +570,7 @@ class DualView : public ViewTraits { deep_copy(h_view, d_view); modified_flags(1) = modified_flags(0) = 0; + impl_report_host_sync(); } } @@ -589,6 +593,7 @@ class DualView : public ViewTraits { deep_copy(d_view, h_view); modified_flags(1) = modified_flags(0) = 0; + impl_report_device_sync(); } } @@ -619,7 +624,20 @@ class DualView : public ViewTraits { if (modified_flags.data() == nullptr) return false; return modified_flags(1) < modified_flags(0); } - + void impl_report_device_modification() { + Kokkos::Tools::modifyDualView( + d_view.label(), + reinterpret_cast(reinterpret_cast(d_view.data()) - + view_header_size), + true); + } + void impl_report_host_modification() { + Kokkos::Tools::modifyDualView( + h_view.label(), + reinterpret_cast(reinterpret_cast(h_view.data()) - + view_header_size), + false); + } /// \brief Mark data as modified on the given device \c Device. /// /// If \c Device is the same as this DualView's device type, then @@ -636,6 +654,7 @@ class DualView : public ViewTraits { (modified_flags(1) > modified_flags(0) ? modified_flags(1) : modified_flags(0)) + 1; + impl_report_device_modification(); } if (dev == 0) { // hopefully Device is the same as DualView's host type // Increment the host's modified count. @@ -643,6 +662,7 @@ class DualView : public ViewTraits { (modified_flags(1) > modified_flags(0) ? modified_flags(1) : modified_flags(0)) + 1; + impl_report_host_modification(); } #ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK @@ -663,6 +683,7 @@ class DualView : public ViewTraits { (modified_flags(1) > modified_flags(0) ? modified_flags(1) : modified_flags(0)) + 1; + impl_report_host_modification(); #ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK if (modified_flags(0) && modified_flags(1)) { std::string msg = "Kokkos::DualView::modify_host ERROR: "; @@ -682,6 +703,7 @@ class DualView : public ViewTraits { (modified_flags(1) > modified_flags(0) ? 
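The impl_report_* hooks introduced above make DualView's bookkeeping visible to tools: each sync or modify event is forwarded to Kokkos::Tools::syncDualView or modifyDualView, with the allocation header located by stepping the data pointer back by the fixed 128-byte view_header_size. User-facing semantics are unchanged; the usual flag-driven workflow simply reports as it goes. A sketch of the sequence that fires these callbacks:

    Kokkos::DualView<double*> dv("dv", 100);

    dv.modify_host();  // bumps the host counter -> impl_report_host_modification()
    auto h = dv.view_host();
    for (int i = 0; i < 100; ++i) h(i) = 1.0;

    dv.sync_device();  // counters differ: deep_copy runs -> impl_report_device_sync()
    dv.sync_device();  // counters already equal: no copy and no report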
modified_flags(1) : modified_flags(0)) + 1; + impl_report_device_modification(); #ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK if (modified_flags(0) && modified_flags(1)) { std::string msg = "Kokkos::DualView::modify_device ERROR: "; diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index afb4b682c4..c66d7a5f36 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -245,10 +245,13 @@ KOKKOS_INLINE_FUNCTION bool dyn_rank_view_verify_operator_bounds( return (size_t(i) < map.extent(R)) && dyn_rank_view_verify_operator_bounds(rank, map, args...); } else if (i != 0) { + // FIXME_SYCL SYCL doesn't allow printf in kernels +#ifndef KOKKOS_ENABLE_SYCL printf( "DynRankView Debug Bounds Checking Error: at rank %u\n Extra " "arguments beyond the rank must be zero \n", R); +#endif return (false) && dyn_rank_view_verify_operator_bounds(rank, map, args...); } else { @@ -1264,33 +1267,6 @@ class DynRankView : public ViewTraits { typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} - // For backward compatibility - // NDE This ctor does not take ViewCtorProp argument - should not use - // alternative createLayout call - explicit inline DynRankView(const ViewAllocateWithoutInitializing& arg_prop, - const typename traits::array_layout& arg_layout) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - arg_prop.label, Kokkos::WithoutInitializing), - arg_layout) {} - - explicit inline DynRankView(const ViewAllocateWithoutInitializing& arg_prop, - const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - arg_prop.label, Kokkos::WithoutInitializing), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) {} - //---------------------------------------- // Memory span required to wrap these dimensions. static constexpr size_t required_allocation_size( @@ -1401,7 +1377,7 @@ struct DynRankSubviewTag {}; namespace Impl { template -struct ViewMapping< +class ViewMapping< typename std::enable_if< (std::is_same::value && (std::is_same::view_type; std::string label = name.empty() ? 
src.label() : name; - auto mirror = Mirror(Kokkos::ViewAllocateWithoutInitializing(label), + auto mirror = Mirror(view_alloc(WithoutInitializing, label), Impl::reconstructLayout(src.layout(), src.rank())); deep_copy(mirror, src); return mirror; diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp index 9233499bf4..4fd084338e 100644 --- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -1940,7 +1940,7 @@ create_mirror( const Kokkos::Experimental::OffsetView& src, typename std::enable_if< !std::is_same::array_layout, - Kokkos::LayoutStride>::value>::type* = 0) { + Kokkos::LayoutStride>::value>::type* = nullptr) { using src_type = Experimental::OffsetView; using dst_type = typename src_type::HostMirror; @@ -1960,7 +1960,7 @@ create_mirror( const Kokkos::Experimental::OffsetView& src, typename std::enable_if< std::is_same::array_layout, - Kokkos::LayoutStride>::value>::type* = 0) { + Kokkos::LayoutStride>::value>::type* = nullptr) { using src_type = Experimental::OffsetView; using dst_type = typename src_type::HostMirror; @@ -2028,7 +2028,7 @@ create_mirror_view( std::is_same< typename Kokkos::Experimental::OffsetView::data_type, typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value)>::type* = 0) { + T, P...>::HostMirror::data_type>::value)>::type* = nullptr) { return Kokkos::create_mirror(src); } @@ -2038,7 +2038,7 @@ typename Kokkos::Impl::MirrorOffsetViewType::view_type create_mirror_view(const Space&, const Kokkos::Experimental::OffsetView& src, typename std::enable_if::is_same_memspace>::type* = 0) { + Space, T, P...>::is_same_memspace>::type* = nullptr) { return src; } @@ -2048,7 +2048,7 @@ typename Kokkos::Impl::MirrorOffsetViewType::view_type create_mirror_view(const Space&, const Kokkos::Experimental::OffsetView& src, typename std::enable_if::is_same_memspace>::type* = 0) { + Space, T, P...>::is_same_memspace>::type* = nullptr) { return typename Kokkos::Impl::MirrorOffsetViewType::view_type( src.label(), src.layout(), {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), @@ -2063,7 +2063,7 @@ create_mirror_view(const Space&, // , std::string const& name = "" // , typename // std::enable_if::is_same_memspace>::type* = 0 ) { +// ...>::is_same_memspace>::type* = nullptr) { // (void)name; // return src; // } @@ -2076,11 +2076,11 @@ create_mirror_view(const Space&, // , std::string const& name = "" // , typename // std::enable_if::is_same_memspace>::type* = 0 ) { +// ...>::is_same_memspace>::type* = nullptr) { // using Mirror = typename // Kokkos::Experimental::Impl::MirrorViewType::view_type; // std::string label = name.empty() ? 
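A mechanical change that recurs through the rest of this patch shows up here twice: SFINAE defaults move from = 0 to = nullptr, and the removed ViewAllocateWithoutInitializing wrapper gives way to the variadic view_alloc factory, which packs the label and the WithoutInitializing property into a single argument. For reference, the two spellings side by side (the extent n is illustrative):

    const size_t n = 100;

    // Old spelling, removed in this update:
    //   Kokkos::View<double*> a(Kokkos::ViewAllocateWithoutInitializing("A"), n);

    // New spelling used throughout:
    Kokkos::View<double*> a(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n);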
src.label() : name; -// auto mirror = Mirror(ViewAllocateWithoutInitializing(label), src.layout(), +// auto mirror = Mirror(view_alloc(WithoutInitializing, label), src.layout(), // { src.begin(0), src.begin(1), src.begin(2), // src.begin(3), src.begin(4), // src.begin(5), src.begin(6), src.begin(7) }); diff --git a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp index 3df0dfcd3b..5e18f5a80e 100644 --- a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -206,6 +206,23 @@ struct DefaultContribution +struct DefaultDuplication { + using type = Kokkos::Experimental::ScatterNonDuplicated; +}; +template <> +struct DefaultContribution { + using type = Kokkos::Experimental::ScatterAtomic; +}; +template <> +struct DefaultContribution { + using type = Kokkos::Experimental::ScatterAtomic; +}; +#endif + // FIXME All these scatter values need overhaul: // - like should they be copyable at all? // - what is the internal handle type @@ -636,19 +653,10 @@ struct ReduceDuplicatesBase { size_t stride_in, size_t start_in, size_t n_in, std::string const& name) : src(src_in), dst(dest_in), stride(stride_in), start(start_in), n(n_in) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor(std::string("reduce_") + name, 0, - &kpID); - } - using policy_type = RangePolicy; - using closure_type = Kokkos::Impl::ParallelFor; - const closure_type closure(*(static_cast(this)), - policy_type(0, stride)); - closure.execute(); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } + parallel_for( + std::string("Kokkos::ScatterView::ReduceDuplicates [") + name + "]", + RangePolicy(0, stride), + static_cast(*this)); } }; @@ -682,19 +690,10 @@ struct ResetDuplicatesBase { ResetDuplicatesBase(ValueType* data_in, size_t size_in, std::string const& name) : data(data_in) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor(std::string("reduce_") + name, 0, - &kpID); - } - using policy_type = RangePolicy; - using closure_type = Kokkos::Impl::ParallelFor; - const closure_type closure(*(static_cast(this)), - policy_type(0, size_in)); - closure.execute(); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } + parallel_for( + std::string("Kokkos::ScatterView::ResetDuplicates [") + name + "]", + RangePolicy(0, size_in), + static_cast(*this)); } }; @@ -931,8 +930,8 @@ class ScatterView const& original_view) : unique_token(), internal_view( - Kokkos::ViewAllocateWithoutInitializing(std::string("duplicated_") + - original_view.label()), + view_alloc(WithoutInitializing, + std::string("duplicated_") + original_view.label()), unique_token.size(), original_view.rank_dynamic > 0 ? original_view.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -955,7 +954,7 @@ class ScatterView ScatterView(std::string const& name, Dims... dims) - : internal_view(Kokkos::ViewAllocateWithoutInitializing(name), + : internal_view(view_alloc(WithoutInitializing, name), unique_token.size(), dims...) 
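ReduceDuplicatesBase and ResetDuplicatesBase above stop hand-building a ParallelFor closure wrapped in explicit beginParallelFor/endParallelFor calls; a labeled parallel_for is equivalent because Kokkos forwards dispatch labels to whatever profiling tool is loaded. The general pattern, sketched with an illustrative kernel:

    const int n = 1000;
    Kokkos::View<double*> data("duplicates", n);

    // The label reaches profiling/tracing tools automatically; no explicit
    // Kokkos::Profiling bracketing is needed around the dispatch.
    Kokkos::parallel_for(
        "Kokkos::ScatterView::ResetDuplicates [example]",
        Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0, n),
        KOKKOS_LAMBDA(int i) { data(i) = 0.0; });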
{ reset(); } @@ -1094,8 +1093,8 @@ class ScatterView\n" "#include \n" ) + configure_file(${dir}/dummy.cpp ${file}) list(APPEND UnitTestSources ${file}) endforeach() + list(REMOVE_ITEM UnitTestSources + ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Bitset.cpp + ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_ScatterView.cpp + ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_UnorderedMap.cpp + ) KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() diff --git a/lib/kokkos/containers/unit_tests/Makefile b/lib/kokkos/containers/unit_tests/Makefile index 308b5aa8b5..f42b9b7519 100644 --- a/lib/kokkos/containers/unit_tests/Makefile +++ b/lib/kokkos/containers/unit_tests/Makefile @@ -7,7 +7,7 @@ vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/openmp vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/hpx vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/serial vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/threads -vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/rocm +vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/hip vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/cuda vpath %.cpp ${CURDIR} default: build_all diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp index ae5b746f94..531caf0f85 100644 --- a/lib/kokkos/containers/unit_tests/TestDualView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp @@ -108,7 +108,7 @@ struct test_dualview_combinations { if (with_init) { a = ViewType("A", n, m); } else { - a = ViewType(Kokkos::ViewAllocateWithoutInitializing("A"), n, m); + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); } Kokkos::deep_copy(a.d_view, 1); @@ -404,14 +404,19 @@ void test_dualview_resize() { Impl::test_dualview_resize(); } +// FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dualview_combination) { test_dualview_combinations(10, true); } +#endif TEST(TEST_CATEGORY, dualview_alloc) { test_dualview_alloc(10); } +// FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dualview_combinations_without_init) { test_dualview_combinations(10, false); } @@ -428,6 +433,7 @@ TEST(TEST_CATEGORY, dualview_realloc) { TEST(TEST_CATEGORY, dualview_resize) { test_dualview_resize(); } +#endif } // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp index 97155d3047..dd0199ed81 100644 --- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -1063,8 +1063,8 @@ class TestDynViewAPI { (void)thing; } - dView0 d_uninitialized(Kokkos::ViewAllocateWithoutInitializing("uninit"), - 10, 20); + dView0 d_uninitialized( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "uninit"), 10, 20); ASSERT_TRUE(d_uninitialized.data() != nullptr); ASSERT_EQ(d_uninitialized.rank(), 2); ASSERT_EQ(d_uninitialized.extent(0), 10); @@ -1532,7 +1532,7 @@ class TestDynViewAPI { ASSERT_EQ(ds5.extent(5), ds5plus.extent(5)); #if (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_UVM)) && \ - !defined(KOKKOS_ENABLE_HIP) + !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) ASSERT_EQ(&ds5(1, 1, 1, 1, 0) - &ds5plus(1, 1, 1, 1, 0), 0); ASSERT_EQ(&ds5(1, 1, 1, 1, 0, 0) - &ds5plus(1, 1, 1, 1, 0, 0), 0); // passing argument to rank beyond the view's rank is allowed diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp index 
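The TestDynViewAPI assertions above lean on the defining feature of DynRankView: rank is deduced from the constructor arguments at run time rather than encoded in the type. A compact illustration, using HostSpace for brevity:

    Kokkos::DynRankView<double, Kokkos::HostSpace> d(
        Kokkos::view_alloc(Kokkos::WithoutInitializing, "uninit"), 10, 20);
    // Two extents were passed, so:
    //   d.rank()    == 2
    //   d.extent(0) == 10 and d.extent(1) == 20
    //   d.data()    != nullptr  (storage allocated, contents indeterminate)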
f018793dd6..4b9f994417 100644 --- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -243,6 +243,8 @@ struct TestDynamicView { } }; +// FIXME_SYCL needs resize_serial +#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dynamic_view) { using TestDynView = TestDynamicView; @@ -250,6 +252,7 @@ TEST(TEST_CATEGORY, dynamic_view) { TestDynView::run(100000 + 100 * i); } } +#endif } // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp index e5186e3e1e..802813b13b 100644 --- a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -95,10 +95,6 @@ void test_offsetview_construction() { ASSERT_EQ(ov.extent(1), 5); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - const int ovmin0 = ov.begin(0); - const int ovend0 = ov.end(0); - const int ovmin1 = ov.begin(1); - const int ovend1 = ov.end(1); { Kokkos::Experimental::OffsetView offsetV1("OneDOffsetView", range0); @@ -134,6 +130,13 @@ } } + // FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL + const int ovmin0 = ov.begin(0); + const int ovend0 = ov.end(0); + const int ovmin1 = ov.begin(1); + const int ovend1 = ov.end(1); + using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -175,6 +178,7 @@ } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; +#endif #endif { @@ -211,6 +215,8 @@ point3_type{{extent0, extent1, extent2}}); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + // FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -233,6 +239,7 @@ ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; +#endif #endif } view_type viewFromOV = ov.view(); @@ -259,6 +266,8 @@ Kokkos::deep_copy(aView, ov); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + // FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -268,6 +277,7 @@ sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; +#endif #endif } @@ -278,6 +288,8 @@ Kokkos::deep_copy(ov, aView); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + // FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -287,6 +299,7 @@ sum); ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; +#endif #endif } } @@ -458,6 +471,8 @@ void test_offsetview_subview() { ASSERT_EQ(offsetSubview.end(1), 9); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + // FIXME_SYCL requires MDRange policy +#ifndef KOKKOS_ENABLE_SYCL using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -483,6 +498,7 @@ sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); +#endif #endif } @@ -685,9 +701,12 @@ void test_offsetview_offsets_rank3() { } #endif +// FIXME_SYCL needs MDRangePolicy +#ifndef
KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction(); } +#endif TEST(TEST_CATEGORY, offsetview_unmanaged_construction) { test_offsetview_unmanaged_construction(); diff --git a/lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp b/lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp new file mode 100644 index 0000000000..51fd3fc911 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SYCL_HPP +#define KOKKOS_TEST_SYCL_HPP + +#define TEST_CATEGORY sycl +#define TEST_EXECSPACE Kokkos::Experimental::SYCL + +#endif diff --git a/lib/kokkos/containers/unit_tests/TestScatterView.hpp b/lib/kokkos/containers/unit_tests/TestScatterView.hpp index 4ec83baece..3a3cb607a6 100644 --- a/lib/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp @@ -583,18 +583,9 @@ struct TestDuplicatedScatterView< }; #endif -#ifdef KOKKOS_ENABLE_ROCM -// disable duplicated instantiation with ROCm until -// UniqueToken can support it -template -struct TestDuplicatedScatterView { - TestDuplicatedScatterView(int) {} -}; -#endif - template -void test_scatter_view(int n) { +void test_scatter_view(int64_t n) { using execution_space = typename DeviceType::execution_space; // no atomics or duplication is only sensible if the execution space @@ -630,7 +621,7 @@ void test_scatter_view(int n) { constexpr std::size_t bytes_per_value = sizeof(NumberType) * 12; std::size_t const maximum_allowed_copy_values = maximum_allowed_copy_bytes / bytes_per_value; - n = std::min(n, int(maximum_allowed_copy_values)); + n = std::min(n, int64_t(maximum_allowed_copy_values)); // if the default is duplicated, this needs to follow the limit { @@ -683,32 +674,40 @@ TEST(TEST_CATEGORY, scatterview_devicetype) { test_scatter_view(10); test_scatter_view(10); +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - using cuda_device_type = Kokkos::Device; - test_scatter_view::value) { + using device_device_type = + Kokkos::Device; + test_scatter_view(10); - test_scatter_view(10); - test_scatter_view(10); - test_scatter_view(10); - test_scatter_view(10); - using cudauvm_device_type = - Kokkos::Device; - test_scatter_view( + 10); + test_scatter_view(10); + test_scatter_view(10); + using host_device_type = + Kokkos::Device; + test_scatter_view(10); - test_scatter_view(10); - test_scatter_view( - 10); - test_scatter_view( - 10); - test_scatter_view( - 10); + test_scatter_view(10); + test_scatter_view(10); + test_scatter_view(10); } #endif } } // namespace Test -#endif // KOKKOS_TEST_UNORDERED_MAP_HPP +#endif // KOKKOS_TEST_SCATTER_VIEW_HPP diff --git a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp index 89c69756d8..8bb267ce5d 100644 --- a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp +++ b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp @@ -200,8 +200,7 @@ void run_test_graph3(size_t B, size_t N) { for (size_t i = 0; i < B; i++) { size_t ne = 0; - for (size_t j = hx.row_block_offsets(i); j < hx.row_block_offsets(i + 1); - j++) + for (auto j = hx.row_block_offsets(i); j < hx.row_block_offsets(i + 1); j++) ne += hx.row_map(j + 1) - hx.row_map(j) + C; ASSERT_FALSE( @@ -212,7 +211,7 @@ void run_test_graph3(size_t B, size_t N) { template void run_test_graph4() { - using ordinal_type = unsigned; + using ordinal_type = unsigned int; using layout_type = Kokkos::LayoutRight; using space_type = Space; using memory_traits_type = Kokkos::MemoryUnmanaged; @@ -286,7 +285,10 @@ void run_test_graph4() { TEST(TEST_CATEGORY, staticcrsgraph) { TestStaticCrsGraph::run_test_graph(); + // FIXME_SYCL requires MDRangePolicy +#ifndef KOKKOS_ENABLE_SYCL TestStaticCrsGraph::run_test_graph2(); +#endif 
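Widening test_scatter_view's length parameter to int64_t pairs with the clamp visible above: duplicated ScatterView testing costs roughly twelve values of storage per entry once all copies are counted, so n is capped against a byte budget before anything is allocated. Worked through with an assumed budget (the real maximum_allowed_copy_bytes constant is defined elsewhere in the file):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    constexpr std::size_t maximum_allowed_copy_bytes = 10u * 1024u * 1024u;  // assumed 10 MiB
    constexpr std::size_t bytes_per_value            = sizeof(double) * 12;  // 96 bytes/entry
    constexpr std::size_t maximum_allowed_copy_values =
        maximum_allowed_copy_bytes / bytes_per_value;                        // 109226

    int64_t n = 10000000;
    n = std::min(n, int64_t(maximum_allowed_copy_values));  // n is clamped to 109226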
TestStaticCrsGraph::run_test_graph3(1, 0); TestStaticCrsGraph::run_test_graph3(1, 1000); TestStaticCrsGraph::run_test_graph3(1, 10000); diff --git a/lib/kokkos/containers/unit_tests/TestVector.hpp b/lib/kokkos/containers/unit_tests/TestVector.hpp index 296b9a7e64..33b265e077 100644 --- a/lib/kokkos/containers/unit_tests/TestVector.hpp +++ b/lib/kokkos/containers/unit_tests/TestVector.hpp @@ -78,7 +78,7 @@ struct test_vector_insert { // Looks like some std::vector implementations do not have the restriction // right on the overload taking three iterators, and thus the following call // will hit that overload and then fail to compile. -#if defined(KOKKOS_COMPILER_INTEL) && (1700 > KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_COMPILER_INTEL) // And at least GCC 4.8.4 doesn't implement vector insert correct for C++11 // Return type is void ... #if (__GNUC__ < 5) @@ -104,7 +104,7 @@ struct test_vector_insert { // Looks like some std::vector implementations do not have the restriction // right on the overload taking three iterators, and thus the following call // will hit that overload and then fail to compile. -#if defined(KOKKOS_COMPILER_INTEL) && (1700 > KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_COMPILER_INTEL) b.insert(b.begin(), typename Vector::size_type(7), 9); #else b.insert(b.begin(), 7, 9); @@ -125,7 +125,7 @@ struct test_vector_insert { // Testing insert at end via all three function interfaces a.insert(a.end(), 11); -#if defined(KOKKOS_COMPILER_INTEL) && (1700 > KOKKOS_COMPILER_INTEL) +#if defined(KOKKOS_COMPILER_INTEL) a.insert(a.end(), typename Vector::size_type(2), 12); #else a.insert(a.end(), 2, 12); diff --git a/lib/kokkos/core/cmake/KokkosCore_config.h.in b/lib/kokkos/core/cmake/KokkosCore_config.h.in index e930f6a05e..f0835772b8 100644 --- a/lib/kokkos/core/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/core/cmake/KokkosCore_config.h.in @@ -100,6 +100,5 @@ // TODO: No longer options in Kokkos. Need to be removed. #cmakedefine KOKKOS_USING_DEPRECATED_VIEW -#cmakedefine KOKKOS_ENABLE_CXX11 #endif // !defined(KOKKOS_FOR_SIERRA) diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index f55721e04a..b7b817c910 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -48,17 +48,10 @@ SET(SOURCES PerfTest_ViewResize_8.cpp ) -IF(Kokkos_ENABLE_HIP) -# FIXME HIP requires TeamPolicy - LIST(REMOVE_ITEM SOURCES - PerfTest_CustomReduction.cpp - PerfTest_ExecSpacePartitioning.cpp - ) -ENDIF() - IF(Kokkos_ENABLE_OPENMPTARGET) # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction LIST(REMOVE_ITEM SOURCES + PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp ) @@ -75,7 +68,8 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) # This test currently times out for MSVC -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") +# FIXME_SYCL these tests don't compile yet (require parallel_for). 
+IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT Kokkos_ENABLE_SYCL) KOKKOS_ADD_EXECUTABLE_AND_TEST( PerfTestExec SOURCES ${SOURCES} @@ -83,17 +77,28 @@ IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") ) ENDIF() -KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_Atomic - SOURCES test_atomic.cpp - CATEGORIES PERFORMANCE -) +# FIXME_SYCL +IF(NOT Kokkos_ENABLE_SYCL) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_Atomic + SOURCES test_atomic.cpp + CATEGORIES PERFORMANCE + ) + +IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_Atomic_MinMax + SOURCES test_atomic_minmax_simple.cpp + CATEGORIES PERFORMANCE + ) +ENDIF() KOKKOS_ADD_EXECUTABLE_AND_TEST( PerformanceTest_Mempool SOURCES test_mempool.cpp CATEGORIES PERFORMANCE ) +ENDIF() IF(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME OPENMPTARGET needs tasking diff --git a/lib/kokkos/core/perf_test/Makefile b/lib/kokkos/core/perf_test/Makefile index 6d619dc573..ac06c89757 100644 --- a/lib/kokkos/core/perf_test/Makefile +++ b/lib/kokkos/core/perf_test/Makefile @@ -65,6 +65,12 @@ TEST_TARGETS += test-taskdag # +OBJ_ATOMICS_MINMAX = test_atomic_minmax_simple.o +TARGETS += KokkosCore_PerformanceTest_Atomics_MinMax +TEST_TARGETS += test-atomic-minmax + +# + KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest @@ -77,6 +83,9 @@ KokkosCore_PerformanceTest_Mempool: $(OBJ_MEMPOOL) $(KOKKOS_LINK_DEPENDS) KokkosCore_PerformanceTest_TaskDAG: $(OBJ_TASKDAG) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_TASKDAG) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_TaskDAG +KokkosCore_PerformanceTest_Atomics_MinMax: $(OBJ_ATOMICS_MINMAX) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_ATOMICS_MINMAX) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest_Atomics_MinMax + test-performance: KokkosCore_PerformanceTest ./KokkosCore_PerformanceTest @@ -89,6 +98,9 @@ test-mempool: KokkosCore_PerformanceTest_Mempool test-taskdag: KokkosCore_PerformanceTest_TaskDAG ./KokkosCore_PerformanceTest_TaskDAG +test-atomic-minmax: KokkosCore_PerformanceTest_Atomics_MinMax + ./KokkosCore_PerformanceTest_Atomics_MinMax + build_all: $(TARGETS) test: $(TEST_TARGETS) diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewResize.hpp b/lib/kokkos/core/perf_test/PerfTest_ViewResize.hpp index 2ea81b5046..66a631e389 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewResize.hpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewResize.hpp @@ -120,7 +120,7 @@ void run_resizeview_tests123(int N, int R) { Kokkos::Timer timer; for (int r = 0; r < R; r++) { Kokkos::View a1( - Kokkos::ViewAllocateWithoutInitializing("A1"), int(N8 * 1.1)); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); double* a1_ptr = a1.data(); Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); @@ -201,7 +201,7 @@ void run_resizeview_tests45(int N, int R) { Kokkos::Timer timer; for (int r = 0; r < R; r++) { Kokkos::View a1( - Kokkos::ViewAllocateWithoutInitializing("A1"), int(N8 * 1.1)); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); double* a1_ptr = a1.data(); Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); @@ -258,7 +258,7 @@ void run_resizeview_tests6(int N, int R) { Kokkos::Timer timer; for (int r = 0; r < R; r++) { Kokkos::View a1( - 
Kokkos::ViewAllocateWithoutInitializing("A1"), int(N8 * 1.1)); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); double* a1_ptr = a1.data(); Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); @@ -311,7 +311,7 @@ void run_resizeview_tests7(int N, int R) { Kokkos::Timer timer; for (int r = 0; r < R; r++) { Kokkos::View a1( - Kokkos::ViewAllocateWithoutInitializing("A1"), int(N8 * 1.1)); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); double* a1_ptr = a1.data(); Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); @@ -366,7 +366,7 @@ void run_resizeview_tests8(int N, int R) { Kokkos::Timer timer; for (int r = 0; r < R; r++) { Kokkos::View a1( - Kokkos::ViewAllocateWithoutInitializing("A1"), int(N8 * 1.1)); + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); double* a1_ptr = a1.data(); Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); diff --git a/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp b/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp new file mode 100644 index 0000000000..eec1c8eacc --- /dev/null +++ b/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp @@ -0,0 +1,244 @@ +// export OMP_PROC_BIND=spread ; export OMP_PLACES=threads +// c++ -O2 -g -DNDEBUG -fopenmp +// ../core/perf_test/test_atomic_minmax_simple.cpp -I../core/src/ -I. -o +// test_atomic_minmax_simple.x containers/src/libkokkoscontainers.a +// core/src/libkokkoscore.a -ldl && OMP_NUM_THREADS=1 +// ./test_atomic_minmax_simple.x 10000000 + +#include +#include + +#include +#include + +#include +#include + +using exec_space = Kokkos::DefaultExecutionSpace; + +template +void test(const int length) { + Kokkos::Impl::Timer timer; + + using vector = Kokkos::View; + + vector inp("input", length); + T max = std::numeric_limits::max(); + T min = std::numeric_limits::lowest(); + + // input is max values - all min atomics will replace + { + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_fetch_min(&(inp(i)), (T)i); + }); + Kokkos::fence(); + double time = timer.seconds(); + + int errors(0); + Kokkos::parallel_reduce( + length, + KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); }, + errors); + Kokkos::fence(); + + if (errors) { + std::cerr << "Error in 100% min replacements: " << errors << std::endl; + std::cerr << "inp(0)=" << inp(0) << std::endl; + } + std::cout << "Time for 100% min replacements: " << time << std::endl; + } + + // input is min values - all max atomics will replace + { + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i); + }); + Kokkos::fence(); + double time = timer.seconds(); + + int errors(0); + Kokkos::parallel_reduce( + length, + KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); }, + errors); + Kokkos::fence(); + + if (errors) { + std::cerr << "Error in 100% max replacements: " << errors << std::endl; + std::cerr << "inp(0)=" << inp(0) << std::endl; + } + std::cout << "Time for 100% max replacements: " << time << std::endl; + } + + // input is max values - all max atomics will early exit + { + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { inp(i) = max; 
}); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i); + }); + Kokkos::fence(); + double time = timer.seconds(); + + int errors(0); + Kokkos::parallel_reduce( + length, + KOKKOS_LAMBDA(const int i, int& inner) { + T ref = max; + inner += (inp(i) != ref); + }, + errors); + Kokkos::fence(); + + if (errors) { + std::cerr << "Error in 100% max early exits: " << errors << std::endl; + std::cerr << "inp(0)=" << inp(0) << std::endl; + } + std::cout << "Time for 100% max early exits: " << time << std::endl; + } + + // input is min values - all min atomics will early exit + { + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_min_fetch(&(inp(i)), (T)i); + }); + Kokkos::fence(); + double time = timer.seconds(); + + int errors(0); + Kokkos::parallel_reduce( + length, + KOKKOS_LAMBDA(const int i, int& inner) { + T ref = min; + inner += (inp(i) != ref); + }, + errors); + Kokkos::fence(); + + if (errors) { + std::cerr << "Error in 100% min early exits: " << errors << std::endl; + std::cerr << "inp(0)=" << inp(0) << std::endl; + if (length > 9) std::cout << "inp(9)=" << inp(9) << std::endl; + } + std::cout << "Time for 100% min early exits: " << time << std::endl; + } + + // limit iterations for contentious test, takes ~50x longer for same length + auto con_length = length / 5; + // input is min values - some max atomics will replace + { + Kokkos::parallel_for( + 1, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); + Kokkos::fence(); + + T current(0); + timer.reset(); + Kokkos::parallel_reduce( + con_length, + KOKKOS_LAMBDA(const int i, T& inner) { + inner = Kokkos::atomic_max_fetch(&(inp(0)), inner + 1); + if (i == con_length - 1) { + Kokkos::atomic_max_fetch(&(inp(0)), max); + inner = max; + } + }, + Kokkos::Max(current)); + Kokkos::fence(); + double time = timer.seconds(); + + if (current < max) { + std::cerr << "Error in contentious max replacements: " << std::endl; + std::cerr << "final=" << current << " inp(0)=" << inp(0) << " max=" << max + << std::endl; + } + std::cout << "Time for contentious max " << con_length + << " replacements: " << time << std::endl; + } + + // input is max values - some min atomics will replace + { + Kokkos::parallel_for( + 1, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); + Kokkos::fence(); + + timer.reset(); + T current(100000000); + Kokkos::parallel_reduce( + con_length, + KOKKOS_LAMBDA(const int i, T& inner) { + inner = Kokkos::atomic_min_fetch(&(inp(0)), inner - 1); + if (i == con_length - 1) { + Kokkos::atomic_min_fetch(&(inp(0)), min); + inner = min; + } + }, + Kokkos::Min(current)); + Kokkos::fence(); + double time = timer.seconds(); + + if (current > min) { + std::cerr << "Error in contentious min replacements: " << std::endl; + std::cerr << "final=" << current << " inp(0)=" << inp(0) << " min=" << min + << std::endl; + } + std::cout << "Time for contentious min " << con_length + << " replacements: " << time << std::endl; + } +} + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + int length = 1000000; + if (argc == 2) { + length = std::stoi(argv[1]); + } + + if (length < 1) { + throw std::invalid_argument(""); + } + + std::cout << "================ int" << std::endl; + test(length); + std::cout << "================ long" << std::endl; + test(length); + std::cout << "================ long 
long" << std::endl; + test(length); + + std::cout << "================ unsigned int" << std::endl; + test(length); + std::cout << "================ unsigned long" << std::endl; + test(length); + std::cout << "================ unsigned long long" << std::endl; + test(length); + + std::cout << "================ float" << std::endl; + test(length); + std::cout << "================ double" << std::endl; + test(length); + } + Kokkos::finalize(); + return 0; +} diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt index b4051dc57f..e0590a78a4 100644 --- a/lib/kokkos/core/src/CMakeLists.txt +++ b/lib/kokkos/core/src/CMakeLists.txt @@ -19,10 +19,6 @@ SET(KOKKOS_CORE_HEADERS) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) -IF (KOKKOS_ENABLE_ROCM) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/ROCm/*.cpp) -ENDIF() - IF (KOKKOS_ENABLE_CUDA) APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) @@ -64,6 +60,11 @@ ELSE() LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_Serial_task.cpp) ENDIF() +IF (KOKKOS_ENABLE_SYCL) + APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) +ENDIF() + KOKKOS_ADD_LIBRARY( kokkoscore SOURCES ${KOKKOS_CORE_SRCS} diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp deleted file mode 100644 index 6feaed80e1..0000000000 --- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp +++ /dev/null @@ -1,1397 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_HPP -#define KOKKOS_CUDA_EXP_ITERATE_TILE_HPP - -#include -#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) - -#include -#include - -#include - -//#include -// Including the file above, leads to following type of errors: -// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete -// type is not allowed As a result, recreate cuda_parallel_launch and associated -// code - -#include -#include - -namespace Kokkos { -namespace Impl { - -// ------------------------------------------------------------------ // - -template -__global__ static void cuda_parallel_launch(const DriverType driver) { - driver(); -} - -template -struct CudaLaunch { - inline CudaLaunch(const DriverType& driver, const dim3& grid, - const dim3& block) { - cuda_parallel_launch<<>>(driver); - } -}; - -// ------------------------------------------------------------------ // -template -struct apply_impl; - -// Rank 2 -// Specializations for void tag type -template -struct apply_impl<2, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; - tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0]) { - m_func(offset_0, offset_1); - } - } - } - } - } - // LR - else { - for (index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; - tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - m_func(offset_0, offset_1); - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for tag type -template -struct apply_impl<2, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - inline __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline 
__device__ void exec_range() const { - if (RP::inner_direction == RP::Left) { - // Loop over size maxnumblocks until full range covered - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; - tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1); - } - } - } - } - } else { - for (index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; - tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - m_func(Tag(), offset_0, offset_1); - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 3 -// Specializations for void tag type -template -struct apply_impl<3, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - for (index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; - tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2]) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - threadIdx.x < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - // LR - else { - for (index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; - tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id2 = blockIdx.z; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < 
m_rp.m_upper[2] && - threadIdx.z < m_rp.m_tile[2]) { - m_func(offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for void tag type -template -struct apply_impl<3, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - inline __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - if (RP::inner_direction == RP::Left) { - for (index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; - tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2]) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - threadIdx.x < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2); - } - } - } - } - } - } - } else { - for (index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; - tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; - tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id2 = blockIdx.z; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - threadIdx.z < m_rp.m_tile[2]) { - m_func(Tag(), offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 4 -// Specializations for void tag type -template -struct apply_impl<4, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - const index_type temp0 = m_rp.m_tile_end[0]; - const index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? 
-
-// Rank 4
-// Specializations for void tag type
-template <typename RP, typename Functor>
-struct apply_impl<4, RP, Functor, void> {
-  using index_type = typename RP::index_type;
-
-  __device__ apply_impl(const RP& rp_, const Functor& f_)
-      : m_rp(rp_), m_func(f_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  inline __device__ void exec_range() const {
-    // LL
-    if (RP::inner_direction == RP::Left) {
-      const index_type temp0 = m_rp.m_tile_end[0];
-      const index_type temp1 = m_rp.m_tile_end[1];
-      const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks);
-      const index_type numbl1 =
-          (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl0)
-               : (temp1 <= max_blocks ? temp1 : max_blocks));
-
-      const index_type tile_id0 = blockIdx.x % numbl0;
-      const index_type tile_id1 = blockIdx.x / numbl0;
-      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
-      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
-
-      for (index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3];
-           tile_id3 += gridDim.z) {
-        const index_type offset_3 = tile_id3 * m_rp.m_tile[3] +
-                                    (index_type)threadIdx.z +
-                                    (index_type)m_rp.m_lower[3];
-        if (offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3]) {
-          for (index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2];
-               tile_id2 += gridDim.y) {
-            const index_type offset_2 = tile_id2 * m_rp.m_tile[2] +
-                                        (index_type)threadIdx.y +
-                                        (index_type)m_rp.m_lower[2];
-            if (offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2]) {
-              for (index_type j = tile_id1; j < m_rp.m_tile_end[1];
-                   j += numbl1) {
-                const index_type offset_1 =
-                    j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
-                if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) {
-                  for (index_type i = tile_id0; i < m_rp.m_tile_end[0];
-                       i += numbl0) {
-                    const index_type offset_0 = i * m_rp.m_tile[0] + thr_id0 +
-                                                (index_type)m_rp.m_lower[0];
-                    if (offset_0 < m_rp.m_upper[0] &&
-                        thr_id0 < m_rp.m_tile[0]) {
-                      m_func(offset_0, offset_1, offset_2, offset_3);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-    // LR
-    else {
-      const index_type temp0 = m_rp.m_tile_end[0];
-      const index_type temp1 = m_rp.m_tile_end[1];
-      const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks);
-      const index_type numbl0 =
-          (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl1)
-               : (temp0 <= max_blocks ? temp0 : max_blocks));
-
-      const index_type tile_id0 = blockIdx.x / numbl1;
-      const index_type tile_id1 = blockIdx.x % numbl1;
-      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
-      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
-
-      for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) {
-        const index_type offset_0 =
-            i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
-        if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) {
-          for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) {
-            const index_type offset_1 =
-                j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
-            if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) {
-              for (index_type tile_id2 = blockIdx.y;
-                   tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y) {
-                const index_type offset_2 = tile_id2 * m_rp.m_tile[2] +
-                                            (index_type)threadIdx.y +
-                                            (index_type)m_rp.m_lower[2];
-                if (offset_2 < m_rp.m_upper[2] &&
-                    threadIdx.y < m_rp.m_tile[2]) {
-                  for (index_type tile_id3 = blockIdx.z;
-                       tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z) {
-                    const index_type offset_3 = tile_id3 * m_rp.m_tile[3] +
-                                                (index_type)threadIdx.z +
-                                                (index_type)m_rp.m_lower[3];
-                    if (offset_3 < m_rp.m_upper[3] &&
-                        threadIdx.z < m_rp.m_tile[3]) {
-                      m_func(offset_0, offset_1, offset_2, offset_3);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-  }  // end exec_range
-
- private:
-  const RP& m_rp;
-  const Functor& m_func;
-};
-
-// Specializations for tag type
-template <typename RP, typename Functor, typename Tag>
-struct apply_impl<4, RP, Functor, Tag> {
-  using index_type = typename RP::index_type;
-
-  inline __device__ apply_impl(const RP& rp_, const Functor& f_)
-      : m_rp(rp_), m_func(f_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  inline __device__ void exec_range() const {
-    if (RP::inner_direction == RP::Left) {
-      const index_type temp0 = m_rp.m_tile_end[0];
-      const index_type temp1 =
m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = blockIdx.x % numbl0; - const index_type tile_id1 = blockIdx.x / numbl0; - const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0]; - - for (index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; - tile_id3 += gridDim.z) { - const index_type offset_3 = tile_id3 * m_rp.m_tile[3] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3]) { - for (index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; - tile_id2 += gridDim.y) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2, offset_3); - } - } - } - } - } - } - } - } - } else { - const index_type temp0 = m_rp.m_tile_end[0]; - const index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? 
temp0 : max_blocks));
-
-      const index_type tile_id0 = blockIdx.x / numbl1;
-      const index_type tile_id1 = blockIdx.x % numbl1;
-      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
-      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
-
-      for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) {
-        const index_type offset_0 =
-            i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
-        if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) {
-          for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) {
-            const index_type offset_1 =
-                j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
-            if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) {
-              for (index_type tile_id2 = blockIdx.y;
-                   tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y) {
-                const index_type offset_2 = tile_id2 * m_rp.m_tile[2] +
-                                            (index_type)threadIdx.y +
-                                            (index_type)m_rp.m_lower[2];
-                if (offset_2 < m_rp.m_upper[2] &&
-                    threadIdx.y < m_rp.m_tile[2]) {
-                  for (index_type tile_id3 = blockIdx.z;
-                       tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z) {
-                    const index_type offset_3 = tile_id3 * m_rp.m_tile[3] +
-                                                (index_type)threadIdx.z +
-                                                (index_type)m_rp.m_lower[3];
-                    if (offset_3 < m_rp.m_upper[3] &&
-                        threadIdx.z < m_rp.m_tile[3]) {
-                      m_func(Tag(), offset_0, offset_1, offset_2, offset_3);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-  }  // end exec_range
-
- private:
-  const RP& m_rp;
-  const Functor& m_func;
-};
-
-// Rank 5
-// Specializations for void tag type
-template <typename RP, typename Functor>
-struct apply_impl<5, RP, Functor, void> {
-  using index_type = typename RP::index_type;
-
-  __device__ apply_impl(const RP& rp_, const Functor& f_)
-      : m_rp(rp_), m_func(f_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  inline __device__ void exec_range() const {
-    // LL
-    if (RP::inner_direction == RP::Left) {
-      index_type temp0 = m_rp.m_tile_end[0];
-      index_type temp1 = m_rp.m_tile_end[1];
-      const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks);
-      const index_type numbl1 =
-          (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl0)
-               : (temp1 <= max_blocks ? temp1 : max_blocks));
-
-      const index_type tile_id0 = blockIdx.x % numbl0;
-      const index_type tile_id1 = blockIdx.x / numbl0;
-      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
-      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
-
-      temp0 = m_rp.m_tile_end[2];
-      temp1 = m_rp.m_tile_end[3];
-      const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks);
-      const index_type numbl3 =
-          (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl2)
-               : (temp1 <= max_blocks ?
temp1 : max_blocks)); - - const index_type tile_id2 = blockIdx.y % numbl2; - const index_type tile_id3 = blockIdx.y / numbl2; - const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2]; - - for (index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; - tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + - thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = blockIdx.x / numbl1; - const index_type tile_id1 = blockIdx.x % numbl1; - const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id2 = blockIdx.y / numbl3; - const index_type tile_id3 = blockIdx.y % numbl3; - const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type tile_id4 = blockIdx.z; - tile_id4 < m_rp.m_tile_end[4]; - tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - threadIdx.z < m_rp.m_tile[4]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for tag type -template -struct apply_impl<5, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = blockIdx.x % numbl0; - const index_type tile_id1 = blockIdx.x / numbl0; - const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id2 = blockIdx.y % numbl2; - const index_type tile_id3 = blockIdx.y / numbl2; - const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2]; - - for (index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; - tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + - thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = blockIdx.x / numbl1; - const index_type tile_id1 = blockIdx.x % numbl1; - const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id2 = blockIdx.y / numbl3; - const index_type tile_id3 = blockIdx.y % numbl3; - const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type tile_id4 = blockIdx.z; - tile_id4 < m_rp.m_tile_end[4]; - tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - threadIdx.z < m_rp.m_tile[4]) { - m_func(Tag(), offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 6 -// Specializations for void tag type -template -struct apply_impl<6, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = blockIdx.x % numbl0; - const index_type tile_id1 = blockIdx.x / numbl0; - const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id2 = blockIdx.y % numbl2; - const index_type tile_id3 = blockIdx.y / numbl2; - const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl4 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl5 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl4) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id4 = blockIdx.z % numbl4; - const index_type tile_id5 = blockIdx.z / numbl4; - const index_type thr_id4 = threadIdx.z % m_rp.m_tile[4]; - const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4]; - - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4) { - const index_type offset_4 = - m * m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = k * m_rp.m_tile[2] + thr_id2 + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + - thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = blockIdx.x / numbl1; - const index_type tile_id1 = blockIdx.x % numbl1; - const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id2 = blockIdx.y / numbl3; - const index_type tile_id3 = blockIdx.y % numbl3; - const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl5 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl4 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl5) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id4 = blockIdx.z / numbl5; - const index_type tile_id5 = blockIdx.z % numbl5; - const index_type thr_id4 = threadIdx.z / m_rp.m_tile[5]; - const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; - m += numbl4) { - const index_type offset_4 = m * m_rp.m_tile[4] + - thr_id4 + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - thr_id4 < m_rp.m_tile[4]) { - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; - n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + - (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && - thr_id5 < m_rp.m_tile[5]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for tag type -template -struct apply_impl<6, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - __device__ apply_impl(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = blockIdx.x % numbl0; - const index_type tile_id1 = blockIdx.x / numbl0; - const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id2 = blockIdx.y % numbl2; - const index_type tile_id3 = blockIdx.y / numbl2; - const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl4 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl5 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl4) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id4 = blockIdx.z % numbl4; - const index_type tile_id5 = blockIdx.z / numbl4; - const index_type thr_id4 = threadIdx.z % m_rp.m_tile[4]; - const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4]; - - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4) { - const index_type offset_4 = - m * m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = k * m_rp.m_tile[2] + thr_id2 + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + - thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2, - offset_3, offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = blockIdx.x / numbl1; - const index_type tile_id1 = blockIdx.x % numbl1; - const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id2 = blockIdx.y / numbl3; - const index_type tile_id3 = blockIdx.y % numbl3; - const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl5 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl4 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl5) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id4 = blockIdx.z / numbl5; - const index_type tile_id5 = blockIdx.z % numbl5; - const index_type thr_id4 = threadIdx.z / m_rp.m_tile[5]; - const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; - m += numbl4) { - const index_type offset_4 = m * m_rp.m_tile[4] + - thr_id4 + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - thr_id4 < m_rp.m_tile[4]) { - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; - n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + - (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && - thr_id5 < m_rp.m_tile[5]) { - m_func(Tag(), offset_0, offset_1, offset_2, - offset_3, offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// ---------------------------------------------------------------------------------- - -template -struct DeviceIterateTile { - using index_type = typename RP::index_type; - using array_index_type = typename RP::array_index_type; - using point_type = typename RP::point_type; - - struct VoidDummy {}; - using usable_tag = typename std::conditional::value, - VoidDummy, Tag>::type; - - DeviceIterateTile(const RP& rp, const Functor& func) - : m_rp{rp}, m_func{func} {} - - private: - inline __device__ void apply() const { - apply_impl(m_rp, m_func).exec_range(); - } // end apply - - public: - inline __device__ void operator()(void) const { this->apply(); } - - inline void execute() const { - const array_index_type maxblocks = - 65535; // not true for blockIdx.x for newer archs - if (RP::rank == 2) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); - const dim3 grid( - std::min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, - maxblocks), - std::min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, - maxblocks), - 1); - CudaLaunch(*this, grid, block); - } else if (RP::rank == 3) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); - const dim3 grid( - std::min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, - maxblocks), - std::min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, - maxblocks), - std::min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, - maxblocks)); - CudaLaunch(*this, grid, block); - } else if (RP::rank == 4) { - // id0,id1 encoded within 
threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z - const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2], - m_rp.m_tile[3]); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); - const dim3 grid( - std::min( - static_cast(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]), - static_cast(maxblocks)), - std::min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1) / block.y, - maxblocks), - std::min((m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, - maxblocks)); - CudaLaunch(*this, grid, block); - } else if (RP::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to - // threadIdx.z - const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], - m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]); - KOKKOS_ASSERT(block.z > 0); - const dim3 grid( - std::min( - static_cast(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]), - static_cast(maxblocks)), - std::min( - static_cast(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]), - static_cast(maxblocks)), - std::min((m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, - maxblocks)); - CudaLaunch(*this, grid, block); - } else if (RP::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to - // threadIdx.z - const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], - m_rp.m_tile[2] * m_rp.m_tile[3], - m_rp.m_tile[4] * m_rp.m_tile[5]); - const dim3 grid( - std::min( - static_cast(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]), - static_cast(maxblocks)), - std::min( - static_cast(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]), - static_cast(maxblocks)), - std::min( - static_cast(m_rp.m_tile_end[4] * m_rp.m_tile_end[5]), - static_cast(maxblocks))); - CudaLaunch(*this, grid, block); - } else { - printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); - Kokkos::abort("Aborting"); - } - - } // end execute - - protected: - const RP m_rp; - const Functor m_func; -}; - -} // namespace Impl -} // namespace Kokkos - -#endif -#endif diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp deleted file mode 100644 index 0425fe6ed5..0000000000 --- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp +++ /dev/null @@ -1,3063 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP -#define KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP - -#include -#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) - -#include - -#include - -// #include -// Including the file above leads to following type of errors: -// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete -// type is not allowed use existing Kokkos functionality, e.g. max blocks, once -// resolved - -#include -#include - -namespace Kokkos { -namespace Impl { - -namespace Refactor { - -// ------------------------------------------------------------------ // -// ParallelFor iteration pattern -template -struct DeviceIterateTile; - -// Rank 2 -// Specializations for void tag type -template -struct DeviceIterateTile<2, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - m_func(offset_0, offset_1); - } - } - } - } - } - // LR - else { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - m_func(offset_0, offset_1); - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for tag type -template -struct DeviceIterateTile<2, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - if (RP::inner_direction == RP::Left) { - // Loop over size maxnumblocks until full range 
covered - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1); - } - } - } - } - } else { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - m_func(Tag(), offset_0, offset_1); - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 3 -// Specializations for void tag type -template -struct DeviceIterateTile<3, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - // LL - if (RP::inner_direction == RP::Left) { - for (index_type tile_id2 = (index_type)blockIdx.z; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.z < m_rp.m_tile[2]) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - // LR - else { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id2 = (index_type)blockIdx.z; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z) { - const 
index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.z < m_rp.m_tile[2]) { - m_func(offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for void tag type -template -struct DeviceIterateTile<3, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - inline __device__ void exec_range() const { - if (RP::inner_direction == RP::Left) { - for (index_type tile_id2 = (index_type)blockIdx.z; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.z < m_rp.m_tile[2]) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2); - } - } - } - } - } - } - } else { - for (index_type tile_id0 = (index_type)blockIdx.x; - tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x) { - const index_type offset_0 = tile_id0 * m_rp.m_tile[0] + - (index_type)threadIdx.x + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - (index_type)threadIdx.x < m_rp.m_tile[0]) { - for (index_type tile_id1 = (index_type)blockIdx.y; - tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y) { - const index_type offset_1 = tile_id1 * m_rp.m_tile[1] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - (index_type)threadIdx.y < m_rp.m_tile[1]) { - for (index_type tile_id2 = (index_type)blockIdx.z; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.z < m_rp.m_tile[2]) { - m_func(Tag(), offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 4 -// Specializations for void tag type -template -struct DeviceIterateTile<4, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - // LL - if (RP::inner_direction == RP::Left) { - const index_type temp0 = m_rp.m_tile_end[0]; - const index_type 
temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x % numbl0; - const index_type tile_id1 = (index_type)blockIdx.x / numbl0; - const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; - - for (index_type tile_id3 = (index_type)blockIdx.z; - tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z) { - const index_type offset_3 = tile_id3 * m_rp.m_tile[3] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - (index_type)threadIdx.z < m_rp.m_tile[3]) { - for (index_type tile_id2 = (index_type)blockIdx.y; - tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y) { - const index_type offset_2 = tile_id2 * m_rp.m_tile[2] + - (index_type)threadIdx.y + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - (index_type)threadIdx.y < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2, offset_3); - } - } - } - } - } - } - } - } - } - // LR - else { - const index_type temp0 = m_rp.m_tile_end[0]; - const index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? 
temp0 : max_blocks));
-
-      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
-      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
-      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
-      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
-
-      for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) {
-        const index_type offset_0 =
-            i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
-        if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) {
-          for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) {
-            const index_type offset_1 =
-                j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
-            if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) {
-              for (index_type tile_id2 = (index_type)blockIdx.y;
-                   tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y) {
-                const index_type offset_2 = tile_id2 * m_rp.m_tile[2] +
-                                            (index_type)threadIdx.y +
-                                            (index_type)m_rp.m_lower[2];
-                if (offset_2 < m_rp.m_upper[2] &&
-                    (index_type)threadIdx.y < m_rp.m_tile[2]) {
-                  for (index_type tile_id3 = (index_type)blockIdx.z;
-                       tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z) {
-                    const index_type offset_3 = tile_id3 * m_rp.m_tile[3] +
-                                                (index_type)threadIdx.z +
-                                                (index_type)m_rp.m_lower[3];
-                    if (offset_3 < m_rp.m_upper[3] &&
-                        (index_type)threadIdx.z < m_rp.m_tile[3]) {
-                      m_func(offset_0, offset_1, offset_2, offset_3);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const RP& m_rp;
-  const Functor& m_func;
-};
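Aside (illustration only, not part of the patch): the Tag specialization that follows is identical to the void one above except that it invokes the functor with a default-constructed tag as the first argument, which is how a work tag selects among operator() overloads. A minimal standalone illustration:

  #include <cstdio>

  struct MyTag {};

  struct Functor {
    // Overload used by the void specializations.
    void operator()(int i, int j) const { printf("untagged %d %d\n", i, j); }
    // Overload selected when a tag is passed, as in m_func(Tag(), ...).
    void operator()(MyTag, int i, int j) const { printf("tagged %d %d\n", i, j); }
  };

  int main() {
    Functor f;
    f(0, 1);           // what the void specializations call
    f(MyTag{}, 0, 1);  // what the Tag specializations call
    return 0;
  }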
-
-// Specializations for tag type
-template <typename RP, typename Functor, typename Tag>
-struct DeviceIterateTile<4, RP, Functor, Tag> {
-  using index_type = typename RP::index_type;
-
-  inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_)
-      : m_rp(rp_), m_func(f_) {}
-
-  static constexpr index_type max_blocks = 65535;
-  // static constexpr index_type max_blocks =
-  // static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
-
-  inline __device__ void exec_range() const {
-    // enum { max_blocks =
-    // static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
-    // const index_type max_blocks = static_cast<index_type>(
-    // Kokkos::Impl::cuda_internal_maximum_grid_count() );
-    if (RP::inner_direction == RP::Left) {
-      const index_type temp0 = m_rp.m_tile_end[0];
-      const index_type temp1 = m_rp.m_tile_end[1];
-      const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks);
-      const index_type numbl1 =
-          (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl0)
-               : (temp1 <= max_blocks ? temp1 : max_blocks));
-
-      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
-      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
-      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
-      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
-
-      for (index_type tile_id3 = (index_type)blockIdx.z;
-           tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z) {
-        const index_type offset_3 = tile_id3 * m_rp.m_tile[3] +
-                                    (index_type)threadIdx.z +
-                                    (index_type)m_rp.m_lower[3];
-        if (offset_3 < m_rp.m_upper[3] &&
-            (index_type)threadIdx.z < m_rp.m_tile[3]) {
-          for (index_type tile_id2 = (index_type)blockIdx.y;
-               tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y) {
-            const index_type offset_2 = tile_id2 * m_rp.m_tile[2] +
-                                        (index_type)threadIdx.y +
-                                        (index_type)m_rp.m_lower[2];
-            if (offset_2 < m_rp.m_upper[2] &&
-                (index_type)threadIdx.y < m_rp.m_tile[2]) {
-              for (index_type j = tile_id1; j < m_rp.m_tile_end[1];
-                   j += numbl1) {
-                const index_type offset_1 =
-                    j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
-                if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) {
-                  for (index_type i = tile_id0; i < m_rp.m_tile_end[0];
-                       i += numbl0) {
-                    const index_type offset_0 = i * m_rp.m_tile[0] + thr_id0 +
-                                                (index_type)m_rp.m_lower[0];
-                    if (offset_0 < m_rp.m_upper[0] &&
-                        thr_id0 < m_rp.m_tile[0]) {
-                      m_func(Tag(), offset_0, offset_1, offset_2, offset_3);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    } else {
-      const index_type temp0 = m_rp.m_tile_end[0];
-      const index_type temp1 = m_rp.m_tile_end[1];
-      const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks);
-      const index_type numbl0 =
-          (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl1)
-               : (temp0 <= max_blocks ? temp0 : max_blocks));
-
-      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
-      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
-      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
-      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
-
-      for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) {
-        const index_type offset_0 =
-            i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
-        if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) {
-          for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) {
-            const index_type offset_1 =
-                j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
-            if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) {
-              for (index_type tile_id2 = (index_type)blockIdx.y;
-                   tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y) {
-                const index_type offset_2 = tile_id2 * m_rp.m_tile[2] +
-                                            (index_type)threadIdx.y +
-                                            (index_type)m_rp.m_lower[2];
-                if (offset_2 < m_rp.m_upper[2] &&
-                    (index_type)threadIdx.y < m_rp.m_tile[2]) {
-                  for (index_type tile_id3 = (index_type)blockIdx.z;
-                       tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z) {
-                    const index_type offset_3 = tile_id3 * m_rp.m_tile[3] +
-                                                (index_type)threadIdx.z +
-                                                (index_type)m_rp.m_lower[3];
-                    if (offset_3 < m_rp.m_upper[3] &&
-                        (index_type)threadIdx.z < m_rp.m_tile[3]) {
-                      m_func(Tag(), offset_0, offset_1, offset_2, offset_3);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const RP& m_rp;
-  const Functor& m_func;
-};
-
-// Rank 5
-// Specializations for void tag type
-template <typename RP, typename Functor>
-struct DeviceIterateTile<5, RP, Functor, void> {
-  using index_type = typename RP::index_type;
-
-  __device__ DeviceIterateTile(const RP& rp_, const Functor& f_)
-      : m_rp(rp_),
m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x % numbl0; - const index_type tile_id1 = (index_type)blockIdx.x / numbl0; - const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y % numbl2; - const index_type tile_id3 = (index_type)blockIdx.y / numbl2; - const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; - - for (index_type tile_id4 = (index_type)blockIdx.z; - tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - (index_type)threadIdx.z < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + - thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x / numbl1; - const index_type tile_id1 = (index_type)blockIdx.x % numbl1; - const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y / numbl3; - const index_type tile_id3 = (index_type)blockIdx.y % numbl3; - const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type tile_id4 = (index_type)blockIdx.z; - tile_id4 < m_rp.m_tile_end[4]; - tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - (index_type)threadIdx.z < m_rp.m_tile[4]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for tag type -template -struct DeviceIterateTile<5, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x % numbl0; - const index_type tile_id1 = (index_type)blockIdx.x / numbl0; - const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y % numbl2; - const index_type tile_id3 = (index_type)blockIdx.y / numbl2; - const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; - - for (index_type tile_id4 = (index_type)blockIdx.z; - tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - (index_type)threadIdx.z < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = i * m_rp.m_tile[0] + - thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x / numbl1; - const index_type tile_id1 = (index_type)blockIdx.x % numbl1; - const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y / numbl3; - const index_type tile_id3 = (index_type)blockIdx.y % numbl3; - const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type tile_id4 = (index_type)blockIdx.z; - tile_id4 < m_rp.m_tile_end[4]; - tile_id4 += gridDim.z) { - const index_type offset_4 = tile_id4 * m_rp.m_tile[4] + - (index_type)threadIdx.z + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - (index_type)threadIdx.z < m_rp.m_tile[4]) { - m_func(Tag(), offset_0, offset_1, offset_2, offset_3, - offset_4); - } - } - } - } - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Rank 6 -// Specializations for void tag type -template -struct DeviceIterateTile<6, RP, Functor, void> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x % numbl0; - const index_type tile_id1 = (index_type)blockIdx.x / numbl0; - const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y % numbl2; - const index_type tile_id3 = (index_type)blockIdx.y / numbl2; - const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl4 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl5 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl4) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id4 = (index_type)blockIdx.z % numbl4; - const index_type tile_id5 = (index_type)blockIdx.z / numbl4; - const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4]; - const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; - - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4) { - const index_type offset_4 = - m * m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = k * m_rp.m_tile[2] + thr_id2 + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + - thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x / numbl1; - const index_type tile_id1 = (index_type)blockIdx.x % numbl1; - const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y / numbl3; - const index_type tile_id3 = (index_type)blockIdx.y % numbl3; - const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl5 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl4 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl5) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id4 = (index_type)blockIdx.z / numbl5; - const index_type tile_id5 = (index_type)blockIdx.z % numbl5; - const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5]; - const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; - m += numbl4) { - const index_type offset_4 = m * m_rp.m_tile[4] + - thr_id4 + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - thr_id4 < m_rp.m_tile[4]) { - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; - n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + - (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && - thr_id5 < m_rp.m_tile[5]) { - m_func(offset_0, offset_1, offset_2, offset_3, - offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -// Specializations for tag type -template -struct DeviceIterateTile<6, RP, Functor, Tag> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_) - : m_rp(rp_), m_func(f_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - // LL - if (RP::inner_direction == RP::Left) { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl1 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl0) - : (temp1 <= max_blocks ? 
temp1 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x % numbl0; - const index_type tile_id1 = (index_type)blockIdx.x / numbl0; - const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0]; - const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl3 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl2) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y % numbl2; - const index_type tile_id3 = (index_type)blockIdx.y / numbl2; - const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2]; - const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl4 = (temp0 <= max_blocks ? temp0 : max_blocks); - const index_type numbl5 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl4) - : (temp1 <= max_blocks ? temp1 : max_blocks)); - - const index_type tile_id4 = (index_type)blockIdx.z % numbl4; - const index_type tile_id5 = (index_type)blockIdx.z / numbl4; - const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4]; - const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; - - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4) { - const index_type offset_4 = - m * m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = - l * m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = k * m_rp.m_tile[2] + thr_id2 + - (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && - thr_id2 < m_rp.m_tile[2]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; - j += numbl1) { - const index_type offset_1 = j * m_rp.m_tile[1] + - thr_id1 + - (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && - thr_id1 < m_rp.m_tile[1]) { - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; - i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + - (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && - thr_id0 < m_rp.m_tile[0]) { - m_func(Tag(), offset_0, offset_1, offset_2, - offset_3, offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - // LR - else { - index_type temp0 = m_rp.m_tile_end[0]; - index_type temp1 = m_rp.m_tile_end[1]; - const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl0 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl1) - : (temp0 <= max_blocks ? 
temp0 : max_blocks)); - - const index_type tile_id0 = (index_type)blockIdx.x / numbl1; - const index_type tile_id1 = (index_type)blockIdx.x % numbl1; - const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1]; - const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; - - temp0 = m_rp.m_tile_end[2]; - temp1 = m_rp.m_tile_end[3]; - const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl2 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl3) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id2 = (index_type)blockIdx.y / numbl3; - const index_type tile_id3 = (index_type)blockIdx.y % numbl3; - const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3]; - const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; - - temp0 = m_rp.m_tile_end[4]; - temp1 = m_rp.m_tile_end[5]; - const index_type numbl5 = (temp1 <= max_blocks ? temp1 : max_blocks); - const index_type numbl4 = - (temp0 * temp1 > max_blocks - ? index_type(max_blocks / numbl5) - : (temp0 <= max_blocks ? temp0 : max_blocks)); - - const index_type tile_id4 = (index_type)blockIdx.z / numbl5; - const index_type tile_id5 = (index_type)blockIdx.z % numbl5; - const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5]; - const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; - - for (index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0) { - const index_type offset_0 = - i * m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; - if (offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0]) { - for (index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1) { - const index_type offset_1 = - j * m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; - if (offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1]) { - for (index_type k = tile_id2; k < m_rp.m_tile_end[2]; - k += numbl2) { - const index_type offset_2 = - k * m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; - if (offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2]) { - for (index_type l = tile_id3; l < m_rp.m_tile_end[3]; - l += numbl3) { - const index_type offset_3 = l * m_rp.m_tile[3] + thr_id3 + - (index_type)m_rp.m_lower[3]; - if (offset_3 < m_rp.m_upper[3] && - thr_id3 < m_rp.m_tile[3]) { - for (index_type m = tile_id4; m < m_rp.m_tile_end[4]; - m += numbl4) { - const index_type offset_4 = m * m_rp.m_tile[4] + - thr_id4 + - (index_type)m_rp.m_lower[4]; - if (offset_4 < m_rp.m_upper[4] && - thr_id4 < m_rp.m_tile[4]) { - for (index_type n = tile_id5; n < m_rp.m_tile_end[5]; - n += numbl5) { - const index_type offset_5 = - n * m_rp.m_tile[5] + thr_id5 + - (index_type)m_rp.m_lower[5]; - if (offset_5 < m_rp.m_upper[5] && - thr_id5 < m_rp.m_tile[5]) { - m_func(Tag(), offset_0, offset_1, offset_2, - offset_3, offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; -}; - -} // namespace Refactor - -// ---------------------------------------------------------------------------------- - -namespace Reduce { - -template -using is_void = std::is_same; - -template -struct is_array_type : std::false_type { - using value_type = T; -}; - -template -struct is_array_type : std::true_type { - using value_type = T; -}; - -template -struct is_array_type : std::true_type { - using value_type = T; -}; - -// ------------------------------------------------------------------ // -template -struct DeviceIterateTile; - -// ParallelReduce iteration 
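pattern

The is_void / is_array_type helpers declared just above drive the overload selection in every specialization that follows: is_void<Tag> separates tagged from untagged functors, and is_array_type<ValueType> routes T* / T[] reductions to the array-valued variants further below. A minimal standalone sketch of that dispatch, assuming the usual shape of these traits; the double/double* examples are illustrative:

    #include <type_traits>

    // Assumed shape of the helpers above: is_void<Tag> picks the untagged
    // functor path, and is_array_type<ValueType> flags pointer/array value
    // types while exposing the element type through ::value_type.
    template <typename T>
    using is_void = std::is_same<T, void>;

    template <typename T>
    struct is_array_type : std::false_type { using value_type = T; };
    template <typename T>
    struct is_array_type<T*> : std::true_type { using value_type = T; };
    template <typename T>
    struct is_array_type<T[]> : std::true_type { using value_type = T; };

    static_assert(is_void<void>::value, "void tag: untagged operator()");
    static_assert(!is_array_type<double>::value, "scalar reduction path");
    static_assert(is_array_type<double*>::value, "array reduction path");
    static_assert(
        std::is_same<is_array_type<double*>::value_type, double>::value,
        "functor receives a pointer to the element type");

    int main() {}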
// Scalar reductions - - // num_blocks = min( num_tiles, max_num_blocks ); //i.e. determined by number of -// tiles and reduction algorithm constraints; extract n-dim tile offsets (i.e. -// tile's global starting multi-index) from the tileid = blockid using tile -// dimensions; local indices within a tile extracted from (index_type)threadIdx.y -// using tile dims, constrained by blocksize; combine tile and local id info for -// multi-dim global ids - -// Pattern: -// Each block+thread is responsible for a tile+local_id combo (additional when -// striding by num_blocks) -// 1. create offset arrays -// 2. loop over number of tiles, striding by griddim (equal to num tiles, or max -// num blocks) -// 3. temps set for tile_idx and thrd_idx, which will be modified -// 4. if LL vs LR: -// determine tile starting point offsets (multidim) -// determine local index offsets (multidim) -// concatenate tile offset + local offset for global multi-dim index -// if offset within range bounds AND local offset within tile bounds, call -// functor - -// ValueType = T -// Rank 2 -// Specializations for void tag type -template <typename RP, typename Functor, typename ValueType> -struct DeviceIterateTile< - 2, RP, Functor, void, ValueType, - typename std::enable_if<!is_array_type<ValueType>::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // local id offset within the tile - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - // Deduce this block's tile_id - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_v); - } - } - } - } - - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Specializations for tag type -template <typename RP, typename Functor, typename Tag, typename ValueType> -struct DeviceIterateTile< - 2, RP, Functor, Tag, ValueType, - typename std::enable_if<!is_array_type<ValueType>::value && - !is_void<Tag>::value>::type> { - using index_type = typename RP::index_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, - ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if
((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = - (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, - // add to m_offset right away - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Rank 3 -// Specializations for void tag type -template -struct DeviceIterateTile< - 3, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= 
m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = - (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, - // add to m_offset right away - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Specializations for void tag type -template -struct DeviceIterateTile< - 3, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, - ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = - (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, - // add to m_offset right away - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Rank 4 -// Specializations for void tag type -template -struct DeviceIterateTile< - 4, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // 
Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Specializations for void tag type -template -struct DeviceIterateTile< - 4, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, - ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= 
m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Rank 5 -// Specializations for void tag type -template -struct DeviceIterateTile< - 5, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& 
m_v; -}; - -// Specializations for tag type -template -struct DeviceIterateTile< - 5, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Rank 6 -// Specializations for void tag type -template -struct DeviceIterateTile< - 6, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - 
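Every Reduce specialization, whatever its rank, tag, or value type, runs the same decode: one linear tile id (from blockIdx.x) and one linear thread id (from threadIdx.y) are peeled a dimension at a time with % and /, in ascending dimension order for LL and descending for LR, and the in_bounds flag (the original's `in_bounds &= false;` is equivalent to plain `in_bounds = false;`) suppresses the functor call for out-of-range remainders in partial tiles. A host-side sketch for rank 2; all values are illustrative:

    #include <cassert>

    int main() {
      const long tile_end[2] = {10, 8};  // tiles per dimension
      const long tile[2]     = {4, 4};   // tile extents
      const long lower[2]    = {0, 0};
      const long upper[2]    = {37, 30}; // ragged edges leave partial tiles

      long tile_idx = 57;                // stand-in for blockIdx.x
      long thrd_idx = 13;                // stand-in for threadIdx.y
      long offset[2];
      bool in_bounds = true;

      for (int i = 0; i < 2; ++i) {      // LL order: dimension 0 first
        offset[i] = (tile_idx % tile_end[i]) * tile[i] + lower[i];
        tile_idx /= tile_end[i];
        const long local = thrd_idx % tile[i];  // id within the tile
        thrd_idx /= tile[i];
        offset[i] += local;
        if (!(offset[i] < upper[i] && local < tile[i])) in_bounds = false;
      }
      // tile 57 decodes to (7, 5); thread 13 decodes to (1, 3) in a 4x4 tile
      assert(in_bounds && offset[0] == 29 && offset[1] == 23);
    }

The LR branch runs the same loop with i descending, so dimension rank-1 becomes the fastest-varying one.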
index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// Specializations for tag type -template -struct DeviceIterateTile< - 6, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, ValueType& v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - 
m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - ValueType& m_v; -}; - -// ValueType = T[], T* -// Rank 2 -// Specializations for void tag type -template -struct DeviceIterateTile< - 2, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = - (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, - // add to m_offset right away - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Specializations for tag type -template -struct DeviceIterateTile< - 2, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, - value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x 
< m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_v); - } - } - } // end for loop over num_tiles - product of tiles in each direction - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Rank 3 -// Specializations for void tag type -template -struct DeviceIterateTile< - 3, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = - (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, - // add to m_offset right away - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - 
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = - (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, - // add to m_offset right away - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Specializations for void tag type -template -struct DeviceIterateTile< - 3, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, - value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - inline __device__ void exec_range() const { - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Rank 4 -// Specializations for void tag type -template -struct DeviceIterateTile< - 4, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { 
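Half of the specializations in this hunk are Tag variants; they differ from the void ones only in default-constructing the policy's work tag and passing it as the first functor argument. A standalone illustration of that dispatch idiom; StepA, StepB, and TaggedFunctor are invented for the example:

    #include <cassert>

    // Stand-in tag types; the kernels above default-construct the policy's
    // Tag and pass it as the first functor argument, so one functor can
    // carry several tagged operator() overloads.
    struct StepA {};
    struct StepB {};

    struct TaggedFunctor {
      void operator()(StepA, long i, long j, double& sum) const { sum += i * j; }
      void operator()(StepB, long i, long j, double& sum) const { sum -= i + j; }
    };

    int main() {
      double sum = 0.0;
      const TaggedFunctor f{};
      f(StepA(), 2, 3, sum);  // mirrors m_func(Tag(), offset..., m_v)
      f(StepB(), 2, 3, sum);
      assert(sum == 1.0);     // 2*3 - (2+3)
    }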
- // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Specializations for void tag type -template -struct DeviceIterateTile< - 4, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - inline __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, - value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * 
m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Rank 5 -// Specializations for void tag type -template -struct DeviceIterateTile< - 5, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] 
< m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Specializations for tag type -template -struct DeviceIterateTile< - 5, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Rank 6 -// Specializations for void tag type -template -struct DeviceIterateTile< - 6, RP, Functor, void, ValueType, - typename std::enable_if::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - 
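// Editor's sketch (not part of the patch): the control flow every rank
// specialization here repeats, reduced to rank 1 with hypothetical names.
// Blocks walk the tiles grid-stride (tiles can outnumber blocks, hence the
// 65535 max_blocks cap above), threadIdx.y indexes the point within the
// current tile, and an in-bounds mask makes threads on a partial edge tile
// do nothing:
template <class Functor>
__device__ void tile_loop_rank1(Functor const& f, long num_tiles,
                                long tile_size, long upper, float* value) {
  for (long tile = blockIdx.x; tile < num_tiles; tile += gridDim.x) {
    const long offset = tile * tile_size + threadIdx.y;   // global index
    if (offset < upper && (long)threadIdx.y < tile_size)  // mask padding
      f(offset, value);
  }
}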
// enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -// Specializations for tag type -template -struct DeviceIterateTile< - 6, RP, Functor, Tag, ValueType, - typename std::enable_if::value && - !is_void::value>::type> { - using index_type = typename RP::index_type; - using value_type = typename is_array_type::value_type; - - __device__ DeviceIterateTile(const RP& rp_, const Functor& f_, value_type* v_) - : m_rp(rp_), m_func(f_), m_v(v_) {} - - static constexpr index_type max_blocks = 65535; - // static constexpr index_type max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount); - - inline __device__ void exec_range() const { - // enum { max_blocks = - // static_cast(Kokkos::Impl::CudaTraits::UpperBoundGridCount) }; - // const index_type max_blocks = static_cast( - // Kokkos::Impl::cuda_internal_maximum_grid_count() ); - if ((index_type)blockIdx.x < m_rp.m_num_tiles && - (index_type)threadIdx.y < m_rp.m_prod_tile_dims) { - index_type m_offset[RP::rank]; // tile starting global id offset - index_type m_local_offset[RP::rank]; // tile starting global id offset - - for (index_type tileidx = (index_type)blockIdx.x; - tileidx < m_rp.m_num_tiles; tileidx += gridDim.x) { - index_type tile_idx = - tileidx; // temp because tile_idx will be modified while - // determining tile starting point offsets - index_type thrd_idx = (index_type)threadIdx.y; - bool in_bounds = true; - - // LL - if (RP::inner_direction == RP::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = 
(tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - // LR - else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + - m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - - // tile-local indices identified with (index_type)threadIdx.y - m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); - thrd_idx /= m_rp.m_tile[i]; - - m_offset[i] += m_local_offset[i]; - if (!(m_offset[i] < m_rp.m_upper[i] && - m_local_offset[i] < m_rp.m_tile[i])) { - in_bounds &= false; - } - } - if (in_bounds) { - m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], - m_offset[4], m_offset[5], m_v); - } - } - } - } - } // end exec_range - - private: - const RP& m_rp; - const Functor& m_func; - value_type* m_v; -}; - -} // namespace Reduce - -// ---------------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - -#endif -#endif diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index cbe1a7e74a..4a30c914f0 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -146,9 +146,9 @@ void CudaSpace::access_error(const void *const) { bool CudaUVMSpace::available() { #if defined(CUDA_VERSION) && !defined(__APPLE__) - enum { UVM_available = true }; + enum : bool { UVM_available = true }; #else - enum { UVM_available = false }; + enum : bool { UVM_available = false }; #endif return UVM_available; } @@ -201,8 +201,15 @@ CudaHostPinnedSpace::CudaHostPinnedSpace() {} void *CudaSpace::allocate(const size_t arg_alloc_size) const { return allocate("[unlabeled]", arg_alloc_size); } + void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void *CudaSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; auto error_code = cudaMalloc(&ptr, arg_alloc_size); @@ -219,9 +226,7 @@ void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size, if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, ptr, - reported_size); + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); } return ptr; } @@ -231,6 +236,12 @@ void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const { } void *CudaUVMSpace::allocate(const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void *CudaUVMSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; Cuda::impl_static_fence(); @@ -260,19 +271,22 @@ void *CudaUVMSpace::allocate(const char *arg_label, const size_t arg_alloc_size, if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, ptr, - reported_size); + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); } return ptr; } - void *CudaHostPinnedSpace::allocate(const size_t arg_alloc_size) const { return allocate("[unlabeled]", arg_alloc_size); } void *CudaHostPinnedSpace::allocate(const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void *CudaHostPinnedSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; auto error_code = cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault); @@ -288,9 +302,7 @@ void *CudaHostPinnedSpace::allocate(const char *arg_label, if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, ptr, - reported_size); + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); } return ptr; } @@ -304,12 +316,17 @@ void CudaSpace::deallocate(void *const arg_alloc_ptr, void CudaSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} +void CudaSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr, - reported_size); + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); } try { @@ -327,13 +344,21 @@ void CudaUVMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, , const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} +void CudaUVMSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size + + , + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { Cuda::impl_static_fence(); if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr, - reported_size); + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); } try { if (arg_alloc_ptr != nullptr) { @@ -349,17 +374,22 @@ void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const { deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); } - void CudaHostPinnedSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} + +void CudaHostPinnedSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? 
        arg_logical_size : arg_alloc_size;
-    Kokkos::Profiling::deallocateData(
-        Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr,
-        reported_size);
+    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
+                                      reported_size);
   }
   try {
     CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
@@ -375,7 +405,7 @@ void CudaHostPinnedSpace::deallocate(const char *arg_label,

 namespace Kokkos {
 namespace Impl {

-#ifdef KOKKOS_DEBUG
+#ifdef KOKKOS_ENABLE_DEBUG
 SharedAllocationRecord
     SharedAllocationRecord::s_root_record;

@@ -551,7 +581,7 @@ SharedAllocationRecord::SharedAllocationRecord(
       // Pass through allocated [ SharedAllocationHeader , user_memory ]
       // Pass through deallocation function
       : SharedAllocationRecord(
-#ifdef KOKKOS_DEBUG
+#ifdef KOKKOS_ENABLE_DEBUG
             &SharedAllocationRecord::s_root_record,
 #endif
             Impl::checked_allocation_with_header(arg_space, arg_label,
@@ -582,7 +612,7 @@ SharedAllocationRecord::SharedAllocationRecord(
       // Pass through allocated [ SharedAllocationHeader , user_memory ]
       // Pass through deallocation function
       : SharedAllocationRecord(
-#ifdef KOKKOS_DEBUG
+#ifdef KOKKOS_ENABLE_DEBUG
             &SharedAllocationRecord::s_root_record,
 #endif
             Impl::checked_allocation_with_header(arg_space, arg_label,
@@ -610,7 +640,7 @@ SharedAllocationRecord::
       // Pass through allocated [ SharedAllocationHeader , user_memory ]
       // Pass through deallocation function
       : SharedAllocationRecord(
-#ifdef KOKKOS_DEBUG
+#ifdef KOKKOS_ENABLE_DEBUG
             &SharedAllocationRecord::s_root_record,
 #endif

@@ -830,7 +860,7 @@ void SharedAllocationRecord::print_records(
     std::ostream &s, const Kokkos::CudaSpace &, bool detail) {
   (void)s;
   (void)detail;
-#ifdef KOKKOS_DEBUG
+#ifdef KOKKOS_ENABLE_DEBUG
   SharedAllocationRecord *r = &s_root_record;

   char buffer[256];
@@ -896,7 +926,7 @@ void SharedAllocationRecord::print_records(
 #else
   Kokkos::Impl::throw_runtime_exception(
       "SharedAllocationHeader::print_records only works with "
-      "KOKKOS_DEBUG enabled");
+      "KOKKOS_ENABLE_DEBUG enabled");
 #endif
 }

@@ -904,13 +934,13 @@ void SharedAllocationRecord::print_records(
     std::ostream &s, const Kokkos::CudaUVMSpace &, bool detail) {
   (void)s;
   (void)detail;
-#ifdef KOKKOS_DEBUG
+#ifdef KOKKOS_ENABLE_DEBUG
   SharedAllocationRecord::print_host_accessible_records(
       s, "CudaUVM", &s_root_record, detail);
 #else
   Kokkos::Impl::throw_runtime_exception(
       "SharedAllocationHeader::print_records only works with "
-      "KOKKOS_DEBUG enabled");
+      "KOKKOS_ENABLE_DEBUG enabled");
 #endif
 }

@@ -918,13 +948,13 @@ void SharedAllocationRecord::print_records(
     std::ostream &s, const Kokkos::CudaHostPinnedSpace &, bool detail) {
   (void)s;
   (void)detail;
-#ifdef KOKKOS_DEBUG
+#ifdef KOKKOS_ENABLE_DEBUG
   SharedAllocationRecord::print_host_accessible_records(
       s, "CudaHostPinned", &s_root_record, detail);
 #else
   Kokkos::Impl::throw_runtime_exception(
       "SharedAllocationHeader::print_records only works with "
-      "KOKKOS_DEBUG enabled");
+      "KOKKOS_ENABLE_DEBUG enabled");
 #endif
 }
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 5a143fd267..0d6d3bdb3a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -198,6 +198,39 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
                             LaunchBounds{});
 }

+// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
+// NOTE these numbers can be obtained several ways:
+// * One option is to download the CUDA Occupancy Calculator
spreadsheet, select +// "Compute Capability" first and check what is the smallest "Shared Memory +// Size Config" that is available. The "Shared Memory Per Multiprocessor" in +// bytes is then to be found below in the summary. +// * Another option would be to look for the information in the "Tuning +// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in +// the "Shared Memory" section (more tedious) +inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { + int const compute_capability = properties.major * 10 + properties.minor; + return [compute_capability]() { + switch (compute_capability) { + case 30: + case 32: + case 35: return 16; + case 37: return 80; + case 50: + case 53: + case 60: + case 62: return 64; + case 52: + case 61: return 96; + case 70: + case 80: return 8; + case 75: return 32; + default: + Kokkos::Impl::throw_runtime_exception( + "Unknown device in cuda block size deduction"); + } + return 0; + }() * 1024; +} } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp new file mode 100644 index 0000000000..d6fadd82c0 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -0,0 +1,210 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_CUDA_GRAPHNODEKERNEL_IMPL_HPP +#define KOKKOS_KOKKOS_CUDA_GRAPHNODEKERNEL_IMPL_HPP + +#include + +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_CUDA_ENABLE_GRAPHS) + +#include + +#include // GraphAccess needs to be complete +#include // SharedAllocationRecord + +#include +#include +#include + +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class GraphNodeKernelImpl + : public PatternImplSpecializationFromTag::type { + private: + using base_t = + typename PatternImplSpecializationFromTag::type; + using size_type = Kokkos::Cuda::size_type; + // These are really functioning as optional references, though I'm not sure + // that the cudaGraph_t one needs to be since it's a pointer under the + // covers and we're not modifying it + Kokkos::ObservingRawPtr m_graph_ptr = nullptr; + Kokkos::ObservingRawPtr m_graph_node_ptr = nullptr; + // Note: owned pointer to CudaSpace memory (used for global memory launches), + // which we're responsible for deallocating, but not responsible for calling + // its destructor. + using Record = Kokkos::Impl::SharedAllocationRecord; + // Basically, we have to make this mutable for the same reasons that the + // global kernel buffers in the Cuda instance are mutable... + mutable Kokkos::OwningRawPtr m_driver_storage = nullptr; + + public: + using Policy = PolicyType; + using graph_kernel = GraphNodeKernelImpl; + + // TODO Ensure the execution space of the graph is the same as the one + // attached to the policy? + // TODO @graph kernel name info propagation + template + GraphNodeKernelImpl(std::string, Kokkos::Cuda const&, Functor arg_functor, + PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + // This is super ugly, but it works for now and is the most minimal change + // to the codebase for now... + : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, + (ArgsDeduced &&) args...) 
{}
+
+  // FIXME @graph Forward through the instance once that works in the backends
+  template <class PolicyDeduced>
+  GraphNodeKernelImpl(Kokkos::Cuda const& ex, Functor arg_functor,
+                      PolicyDeduced&& arg_policy)
+      : GraphNodeKernelImpl("", ex, std::move(arg_functor),
+                            (PolicyDeduced &&) arg_policy) {}
+
+  ~GraphNodeKernelImpl() {
+    if (m_driver_storage) {
+      // We should be the only owner, but this is still the easiest way to
+      // allocate and deallocate aligned memory for these sorts of things
+      Record::decrement(Record::get_record(m_driver_storage));
+    }
+  }
+
+  void set_cuda_graph_ptr(cudaGraph_t* arg_graph_ptr) {
+    m_graph_ptr = arg_graph_ptr;
+  }
+  void set_cuda_graph_node_ptr(cudaGraphNode_t* arg_node_ptr) {
+    m_graph_node_ptr = arg_node_ptr;
+  }
+  cudaGraphNode_t* get_cuda_graph_node_ptr() const { return m_graph_node_ptr; }
+  cudaGraph_t const* get_cuda_graph_ptr() const { return m_graph_ptr; }
+
+  Kokkos::ObservingRawPtr<base_t> allocate_driver_memory_buffer() const {
+    KOKKOS_EXPECTS(m_driver_storage == nullptr)
+
+    auto* record = Record::allocate(
+        Kokkos::CudaSpace{}, "GraphNodeKernel global memory functor storage",
+        sizeof(base_t));
+
+    Record::increment(record);
+    m_driver_storage = reinterpret_cast<base_t*>(record->data());
+    KOKKOS_ENSURES(m_driver_storage != nullptr)
+    return m_driver_storage;
+  }
+};
+
+struct CudaGraphNodeAggregateKernel {
+  using graph_kernel = CudaGraphNodeAggregateKernel;
+
+  // Aggregates don't need a policy, but for the purposes of checking the
+  // static assertions about graph kernels, a trivial one is provided anyway.
+  struct Policy {
+    using is_graph_kernel = std::true_type;
+  };
+};
+
+template ::type>
+struct get_graph_node_kernel_type
+    : identity> {};
+template
+struct get_graph_node_kernel_type
+    : identity> {};
+
+//==============================================================================
+// <editor-fold desc="get_cuda_graph_*() helper functions"> {{{1
+
+template <class KernelType>
+auto* allocate_driver_storage_for_kernel(KernelType const& kernel) {
+  using graph_node_kernel_t =
+      typename get_graph_node_kernel_type<KernelType>::type;
+  auto const& kernel_as_graph_kernel =
+      static_cast<graph_node_kernel_t const&>(kernel);
+  // TODO @graphs we need to somehow indicate the need for a fence in the
+  //              destructor of the GraphImpl object (so that we don't have to
+  //              just always do it)
+  return kernel_as_graph_kernel.allocate_driver_memory_buffer();
+}
+
+template <class KernelType>
+auto const& get_cuda_graph_from_kernel(KernelType const& kernel) {
+  using graph_node_kernel_t =
+      typename get_graph_node_kernel_type<KernelType>::type;
+  auto const& kernel_as_graph_kernel =
+      static_cast<graph_node_kernel_t const&>(kernel);
+  cudaGraph_t const* graph_ptr = kernel_as_graph_kernel.get_cuda_graph_ptr();
+  KOKKOS_EXPECTS(graph_ptr != nullptr);
+  return *graph_ptr;
+}
+
+template <class KernelType>
+auto& get_cuda_graph_node_from_kernel(KernelType const& kernel) {
+  using graph_node_kernel_t =
+      typename get_graph_node_kernel_type<KernelType>::type;
+  auto const& kernel_as_graph_kernel =
+      static_cast<graph_node_kernel_t const&>(kernel);
+  auto* graph_node_ptr = kernel_as_graph_kernel.get_cuda_graph_node_ptr();
+  KOKKOS_EXPECTS(graph_node_ptr != nullptr);
+  return *graph_node_ptr;
+}
+
+// end get_cuda_graph_*() helper functions }}}1
+//==============================================================================
+
+} // end namespace Impl
+} // end namespace Kokkos
+
+#endif // defined(KOKKOS_ENABLE_CUDA)
+#endif // KOKKOS_KOKKOS_CUDA_GRAPHNODEKERNEL_IMPL_HPP
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp
similarity index 52%
rename from lib/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp
rename to lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp
index
989a4aec90..f4539cd2ca 100644 --- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp @@ -42,85 +42,62 @@ //@HEADER */ -#include +#ifndef KOKKOS_KOKKOS_CUDA_GRAPHNODE_IMPL_HPP +#define KOKKOS_KOKKOS_CUDA_GRAPHNODE_IMPL_HPP + #include -#if !defined(KOKKOS_ROCM_INVOKE_H) -#define KOKKOS_ROCM_INVOKE_H +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_CUDA_ENABLE_GRAPHS) + +#include + +#include // GraphAccess needs to be complete + +#include +#include namespace Kokkos { namespace Impl { -template ()), int>::type = 0> -KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs) { - f(Tag(), static_cast(xs)...); -} +template <> +struct GraphNodeBackendSpecificDetails { + cudaGraphNode_t node = nullptr; -template ()), int>::type = 0> -KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs) { - f(static_cast(xs)...); -} + //---------------------------------------------------------------------------- + // {{{2 -template -struct rocm_invoke_fn { - F* f; - rocm_invoke_fn(F& f_) : f(&f_) {} + explicit GraphNodeBackendSpecificDetails() = default; - template - KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const { - rocm_invoke(*f, static_cast(xs)...); - } + explicit GraphNodeBackendSpecificDetails( + _graph_node_is_root_ctor_tag) noexcept {} + + // end Ctors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- }; -template -KOKKOS_INLINE_FUNCTION rocm_invoke_fn make_rocm_invoke_fn(F& f) { - return {f}; -} +template +struct GraphNodeBackendDetailsBeforeTypeErasure { + protected: + //---------------------------------------------------------------------------- + // {{{2 -template -KOKKOS_INLINE_FUNCTION T& rocm_unwrap(T& x) { - return x; -} + GraphNodeBackendDetailsBeforeTypeErasure( + Kokkos::Cuda const&, Kernel&, PredecessorRef const&, + GraphNodeBackendSpecificDetails&) noexcept {} -template -KOKKOS_INLINE_FUNCTION T& rocm_unwrap(std::reference_wrapper x) { - return x; -} + GraphNodeBackendDetailsBeforeTypeErasure( + Kokkos::Cuda const&, _graph_node_is_root_ctor_tag, + GraphNodeBackendSpecificDetails&) noexcept {} -template -struct rocm_capture_fn { - F f; - T data; - - KOKKOS_INLINE_FUNCTION rocm_capture_fn(F f_, T x) : f(f_), data(x) {} - - template - KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const { - f(rocm_unwrap(data), static_cast(xs)...); - } + // end ctors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- }; -template -KOKKOS_INLINE_FUNCTION rocm_capture_fn rocm_capture(F f, T x) { - return {f, x}; -} +} // end namespace Impl +} // end namespace Kokkos -template -KOKKOS_INLINE_FUNCTION auto rocm_capture(F f, T x, U y, Ts... xs) - -> decltype(rocm_capture(rocm_capture(f, x), y, xs...)) { - return rocm_capture(rocm_capture(f, x), y, xs...); -} +#include -struct rocm_apply_op { - template - KOKKOS_INLINE_FUNCTION void operator()(F&& f, Ts&&... xs) const { - f(static_cast(xs)...); - } -}; - -} // namespace Impl -} // namespace Kokkos - -#endif +#endif // defined(KOKKOS_ENABLE_CUDA) +#endif // KOKKOS_KOKKOS_CUDA_GRAPHNODE_IMPL_HPP diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp new file mode 100644 index 0000000000..3de7a69916 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -0,0 +1,219 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_CUDA_GRAPH_IMPL_HPP +#define KOKKOS_KOKKOS_CUDA_GRAPH_IMPL_HPP + +#include + +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_CUDA_ENABLE_GRAPHS) + +#include + +#include // GraphAccess needs to be complete + +// GraphNodeImpl needs to be complete because GraphImpl here is a full +// specialization and not just a partial one +#include +#include + +#include +#include + +namespace Kokkos { +namespace Impl { + +template <> +struct GraphImpl { + public: + using execution_space = Kokkos::Cuda; + + private: + execution_space m_execution_space; + cudaGraph_t m_graph = nullptr; + cudaGraphExec_t m_graph_exec = nullptr; + + using cuda_graph_flags_t = unsigned int; + + using node_details_t = GraphNodeBackendSpecificDetails; + + void _instantiate_graph() { + constexpr size_t error_log_size = 256; + cudaGraphNode_t error_node = nullptr; + char error_log[error_log_size]; + CUDA_SAFE_CALL(cudaGraphInstantiate(&m_graph_exec, m_graph, &error_node, + error_log, error_log_size)); + // TODO @graphs print out errors + } + + public: + using root_node_impl_t = + GraphNodeImpl; + using aggregate_kernel_impl_t = CudaGraphNodeAggregateKernel; + using aggregate_node_impl_t = + GraphNodeImpl; + + // Not moveable or copyable; it spends its whole life as a shared_ptr in the + // Graph object + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl const&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; + ~GraphImpl() { + // TODO @graphs we need to somehow indicate the need for a fence in the + // destructor of the GraphImpl object (so that we don't have to + // just always do 
it)
+    m_execution_space.fence();
+    KOKKOS_EXPECTS(bool(m_graph))
+    if (bool(m_graph_exec)) {
+      CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec));
+    }
+    CUDA_SAFE_CALL(cudaGraphDestroy(m_graph));
+  };
+
+  explicit GraphImpl(Kokkos::Cuda arg_instance)
+      : m_execution_space(std::move(arg_instance)) {
+    CUDA_SAFE_CALL(cudaGraphCreate(&m_graph, cuda_graph_flags_t{0}));
+  }
+
+  void add_node(std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
+    // All of the predecessors are just added as normal, so all we need to
+    // do here is add an empty node
+    CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node),
+                                         m_graph,
+                                         /* dependencies = */ nullptr,
+                                         /* numDependencies = */ 0));
+  }
+
+  template <class NodeImpl>
+  // requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl
+  // Also requires that the kernel has the graph node tag in its policy
+  void add_node(std::shared_ptr<NodeImpl> const& arg_node_ptr) {
+    static_assert(
+        NodeImpl::kernel_type::Policy::is_graph_kernel::value,
+        "Something has gone horribly wrong, but it's too complicated to "
+        "explain here. Buy Daisy a coffee and she'll explain it to you.");
+    KOKKOS_EXPECTS(bool(arg_node_ptr));
+    // The Kernel launch from the execute() method has been shimmed to insert
+    // the node into the graph
+    auto& kernel = arg_node_ptr->get_kernel();
+    // note: using arg_node_ptr->node_details_t::node caused an ICE in NVCC 10.1
+    auto& cuda_node = static_cast<node_details_t*>(arg_node_ptr.get())->node;
+    KOKKOS_EXPECTS(!bool(cuda_node));
+    kernel.set_cuda_graph_ptr(&m_graph);
+    kernel.set_cuda_graph_node_ptr(&cuda_node);
+    kernel.execute();
+    KOKKOS_ENSURES(bool(cuda_node));
+  }
+
+  template <class NodeImplPtr, class PredecessorRef>
+  // requires PredecessorRef is a specialization of GraphNodeRef that has
+  // already been added to this graph and NodeImpl is a specialization of
+  // GraphNodeImpl that has already been added to this graph.
+  void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) {
+    KOKKOS_EXPECTS(bool(arg_node_ptr))
+    auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref);
+    KOKKOS_EXPECTS(bool(pred_ptr))
+
+    // clang-format off
+    // NOTE const-qualifiers below are commented out because of an API break
+    // from CUDA 10.0 to CUDA 10.1
+    // cudaGraphAddDependencies(cudaGraph_t, cudaGraphNode_t*, cudaGraphNode_t*, size_t)
+    // cudaGraphAddDependencies(cudaGraph_t, const cudaGraphNode_t*, const cudaGraphNode_t*, size_t)
+    // clang-format on
+    auto /*const*/& pred_cuda_node = pred_ptr->node_details_t::node;
+    KOKKOS_EXPECTS(bool(pred_cuda_node))
+
+    auto /*const*/& cuda_node = arg_node_ptr->node_details_t::node;
+    KOKKOS_EXPECTS(bool(cuda_node))
+
+    CUDA_SAFE_CALL(
+        cudaGraphAddDependencies(m_graph, &pred_cuda_node, &cuda_node, 1));
+  }
+
+  void submit() {
+    if (!bool(m_graph_exec)) {
+      _instantiate_graph();
+    }
+    CUDA_SAFE_CALL(
+        cudaGraphLaunch(m_graph_exec, m_execution_space.cuda_stream()));
+  }
+
+  execution_space const& get_execution_space() const noexcept {
+    return m_execution_space;
+  }
+
+  auto create_root_node_ptr() {
+    KOKKOS_EXPECTS(bool(m_graph))
+    KOKKOS_EXPECTS(!bool(m_graph_exec))
+    auto rv = std::make_shared<root_node_impl_t>(
+        get_execution_space(), _graph_node_is_root_ctor_tag{});
+    CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph,
+                                         /* dependencies = */ nullptr,
+                                         /* numDependencies = */ 0));
+    KOKKOS_ENSURES(bool(rv->node_details_t::node))
+    return rv;
+  }
+
+  template <class... PredecessorRefs>
+  // See requirements/expectations in GraphBuilder
+  auto create_aggregate_ptr(PredecessorRefs&&...)
{ + // The attachment to predecessors, which is all we really need, happens + // in the generic layer, which calls through to add_predecessor for + // each predecessor ref, so all we need to do here is create the (trivial) + // aggregate node. + return std::make_shared( + m_execution_space, _graph_node_kernel_ctor_tag{}, + aggregate_kernel_impl_t{}); + } +}; + +} // end namespace Impl +} // end namespace Kokkos + +#endif // defined(KOKKOS_ENABLE_CUDA) +#endif // KOKKOS_KOKKOS_CUDA_GRAPH_IMPL_HPP diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp new file mode 100644 index 0000000000..a9a62380e5 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp @@ -0,0 +1,710 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_HALF_HPP_ +#define KOKKOS_CUDA_HALF_HPP_ + +#include +#ifdef KOKKOS_ENABLE_CUDA +#if !(defined(KOKKOS_COMPILER_CLANG) && KOKKOS_COMPILER_CLANG < 900) && \ + !(defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL50) || \ + defined(KOKKOS_ARCH_MAXWELL52)) +#include + +#ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED +// Make sure no one else tries to define half_t +#define KOKKOS_IMPL_HALF_TYPE_DEFINED + +namespace Kokkos { +namespace Impl { +struct half_impl_t { + using type = __half; +}; +} // namespace Impl +namespace Experimental { + +// Forward declarations +class half_t; + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(float val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(bool val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(double val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(short val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(int val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned short val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned int val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long long val); + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t); +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t::value, T> + cast_from_half(half_t); + +class half_t { + public: + using impl_type = Kokkos::Impl::half_impl_t::type; + + private: + impl_type val; + + public: + KOKKOS_FUNCTION + half_t() : val(0.0F) {} + + // Don't support implicit conversion back to impl_type. + // impl_type is a storage only type on host. 
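// Editor's sketch (usage, not part of the patch): because every conversion in
// and out of half_t below is explicit, no narrowing ever happens silently.
// Assuming a build where this header defines half_t:
//
//   using Kokkos::Experimental::half_t;
//   using Kokkos::Experimental::cast_from_half;
//
//   half_t a(1.5f);                      // explicit ctor from float
//   half_t b = a + a;                    // __half math on device,
//                                        // float round-trip on host
//   float f = cast_from_half<float>(b);  // explicit back-conversion
//   // float g = b;                      // ill-formed: operator float()
//                                        // is explicit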
+ KOKKOS_FUNCTION + explicit operator impl_type() const { return val; } + KOKKOS_FUNCTION + explicit operator float() const { return cast_from_half(*this); } + KOKKOS_FUNCTION + explicit operator bool() const { return cast_from_half(*this); } + KOKKOS_FUNCTION + explicit operator double() const { return cast_from_half(*this); } + KOKKOS_FUNCTION + explicit operator short() const { return cast_from_half(*this); } + KOKKOS_FUNCTION + explicit operator int() const { return cast_from_half(*this); } + KOKKOS_FUNCTION + explicit operator long() const { return cast_from_half(*this); } + KOKKOS_FUNCTION + explicit operator long long() const { + return cast_from_half(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned short() const { + return cast_from_half(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned int() const { + return cast_from_half(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned long() const { + return cast_from_half(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned long long() const { + return cast_from_half(*this); + } + + KOKKOS_FUNCTION + half_t(impl_type rhs) : val(rhs) {} + KOKKOS_FUNCTION + explicit half_t(float rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(bool rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(double rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(short rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(int rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(long rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(long long rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(unsigned short rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(unsigned int rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(unsigned long rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(unsigned long long rhs) : val(cast_to_half(rhs).val) {} + + // Unary operators + KOKKOS_FUNCTION + half_t operator+() const { + half_t tmp = *this; +#ifdef __CUDA_ARCH__ + tmp.val = +tmp.val; +#else + tmp.val = __float2half(+__half2float(tmp.val)); +#endif + return tmp; + } + + KOKKOS_FUNCTION + half_t operator-() const { + half_t tmp = *this; +#ifdef __CUDA_ARCH__ + tmp.val = -tmp.val; +#else + tmp.val = __float2half(-__half2float(tmp.val)); +#endif + return tmp; + } + + // Prefix operators + KOKKOS_FUNCTION + half_t& operator++() { +#ifdef __CUDA_ARCH__ + ++val; +#else + float tmp = __half2float(val); + ++tmp; + val = __float2half(tmp); +#endif + return *this; + } + + KOKKOS_FUNCTION + half_t& operator--() { +#ifdef __CUDA_ARCH__ + --val; +#else + float tmp = __half2float(val); + --tmp; + val = __float2half(tmp); +#endif + return *this; + } + + // Postfix operators + KOKKOS_FUNCTION + half_t operator++(int) { + half_t tmp = *this; + operator++(); + return tmp; + } + + KOKKOS_FUNCTION + half_t operator--(int) { + half_t tmp = *this; + operator--(); + return tmp; + } + + // Binary operators + KOKKOS_FUNCTION + half_t& operator=(impl_type rhs) { + val = rhs; + return *this; + } + + template + KOKKOS_FUNCTION half_t& operator=(T rhs) { + val = cast_to_half(rhs).val; + return *this; + } + + // Compound operators + KOKKOS_FUNCTION + half_t& operator+=(half_t rhs) { +#ifdef __CUDA_ARCH__ + val += rhs.val; +#else + val = __float2half(__half2float(val) + __half2float(rhs.val)); +#endif + return *this; + } + + KOKKOS_FUNCTION + half_t& 
operator-=(half_t rhs) { +#ifdef __CUDA_ARCH__ + val -= rhs.val; +#else + val = __float2half(__half2float(val) - __half2float(rhs.val)); +#endif + return *this; + } + + KOKKOS_FUNCTION + half_t& operator*=(half_t rhs) { +#ifdef __CUDA_ARCH__ + val *= rhs.val; +#else + val = __float2half(__half2float(val) * __half2float(rhs.val)); +#endif + return *this; + } + + KOKKOS_FUNCTION + half_t& operator/=(half_t rhs) { +#ifdef __CUDA_ARCH__ + val /= rhs.val; +#else + val = __float2half(__half2float(val) / __half2float(rhs.val)); +#endif + return *this; + } + + // Binary Arithmetic + KOKKOS_FUNCTION + half_t friend operator+(half_t lhs, half_t rhs) { +#ifdef __CUDA_ARCH__ + lhs.val += rhs.val; +#else + lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); +#endif + return lhs; + } + + KOKKOS_FUNCTION + half_t friend operator-(half_t lhs, half_t rhs) { +#ifdef __CUDA_ARCH__ + lhs.val -= rhs.val; +#else + lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); +#endif + return lhs; + } + + KOKKOS_FUNCTION + half_t friend operator*(half_t lhs, half_t rhs) { +#ifdef __CUDA_ARCH__ + lhs.val *= rhs.val; +#else + lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); +#endif + return lhs; + } + + KOKKOS_FUNCTION + half_t friend operator/(half_t lhs, half_t rhs) { +#ifdef __CUDA_ARCH__ + lhs.val /= rhs.val; +#else + lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); +#endif + return lhs; + } + + // Logical operators + KOKKOS_FUNCTION + bool operator!() const { +#ifdef __CUDA_ARCH__ + return static_cast(!val); +#else + return !__half2float(val); +#endif + } + + // NOTE: Loses short-circuit evaluation + KOKKOS_FUNCTION + bool operator&&(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val && rhs.val); +#else + return __half2float(val) && __half2float(rhs.val); +#endif + } + + // NOTE: Loses short-circuit evaluation + KOKKOS_FUNCTION + bool operator||(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val || rhs.val); +#else + return __half2float(val) || __half2float(rhs.val); +#endif + } + + // Comparison operators + KOKKOS_FUNCTION + bool operator==(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val == rhs.val); +#else + return __half2float(val) == __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator!=(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val != rhs.val); +#else + return __half2float(val) != __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator<(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val < rhs.val); +#else + return __half2float(val) < __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator>(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val > rhs.val); +#else + return __half2float(val) > __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator<=(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val <= rhs.val); +#else + return __half2float(val) <= __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator>=(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast(val >= rhs.val); +#else + return __half2float(val) >= __half2float(rhs.val); +#endif + } +}; + +// CUDA before 11.1 only has the half <-> float conversions marked host device +// So we will largely convert to float on the host for conversion +// But still call the correct functions on the device +#if (CUDA_VERSION < 11100) + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(half_t val) { 
+
+// CUDA before 11.1 only has the half <-> float conversions marked host device
+// So we will largely convert to float on the host for conversion
+// But still call the correct functions on the device
+#if (CUDA_VERSION < 11010)
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(half_t val) { return val; }
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(float val) { return half_t(__float2half(val)); }
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(bool val) { return cast_to_half(static_cast<float>(val)); }
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(double val) {
+  // double2half was only introduced in CUDA 11 too
+  return half_t(__float2half(static_cast<float>(val)));
+}
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(short val) {
+#ifdef __CUDA_ARCH__
+  return half_t(__short2half_rn(val));
+#else
+  return half_t(__float2half(static_cast<float>(val)));
+#endif
+}
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned short val) {
+#ifdef __CUDA_ARCH__
+  return half_t(__ushort2half_rn(val));
+#else
+  return half_t(__float2half(static_cast<float>(val)));
+#endif
+}
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(int val) {
+#ifdef __CUDA_ARCH__
+  return half_t(__int2half_rn(val));
+#else
+  return half_t(__float2half(static_cast<float>(val)));
+#endif
+}
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned int val) {
+#ifdef __CUDA_ARCH__
+  return half_t(__uint2half_rn(val));
+#else
+  return half_t(__float2half(static_cast<float>(val)));
+#endif
+}
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(long long val) {
+#ifdef __CUDA_ARCH__
+  return half_t(__ll2half_rn(val));
+#else
+  return half_t(__float2half(static_cast<float>(val)));
+#endif
+}
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned long long val) {
+#ifdef __CUDA_ARCH__
+  return half_t(__ull2half_rn(val));
+#else
+  return half_t(__float2half(static_cast<float>(val)));
+#endif
+}
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(long val) {
+  return cast_to_half(static_cast<long long>(val));
+}
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned long val) {
+  return cast_to_half(static_cast<unsigned long long>(val));
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, float>::value, T>
+cast_from_half(half_t val) {
+  return __half2float(half_t::impl_type(val));
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, bool>::value, T>
+cast_from_half(half_t val) {
+  return static_cast<T>(cast_from_half<float>(val));
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, double>::value, T>
+cast_from_half(half_t val) {
+  return static_cast<T>(__half2float(half_t::impl_type(val)));
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, short>::value, T>
+cast_from_half(half_t val) {
+#ifdef __CUDA_ARCH__
+  return __half2short_rz(half_t::impl_type(val));
+#else
+  return static_cast<T>(__half2float(half_t::impl_type(val)));
+#endif
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned short>::value, T>
+    cast_from_half(half_t val) {
+#ifdef __CUDA_ARCH__
+  return __half2ushort_rz(half_t::impl_type(val));
+#else
+  return static_cast<T>(__half2float(half_t::impl_type(val)));
+#endif
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, int>::value, T>
+cast_from_half(half_t val) {
+#ifdef __CUDA_ARCH__
+  return __half2int_rz(half_t::impl_type(val));
+#else
+  return static_cast<T>(__half2float(half_t::impl_type(val)));
+#endif
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, unsigned int>::value, T>
+cast_from_half(half_t val) {
+#ifdef __CUDA_ARCH__
+  return __half2uint_rz(half_t::impl_type(val));
+#else
+  return static_cast<T>(__half2float(half_t::impl_type(val)));
+#endif
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long long>::value, T>
+cast_from_half(half_t val) {
+#ifdef __CUDA_ARCH__
+  return __half2ll_rz(half_t::impl_type(val));
+#else
+  return static_cast<T>(__half2float(half_t::impl_type(val)));
+#endif
+}
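The intrinsic suffixes above encode the rounding mode: _rn converts into half with round-to-nearest, while _rz converts out of half truncating toward zero; the host fallbacks agree because C++'s float-to-integer conversion also truncates. A small illustrative sketch, assuming the casts defined above:

    using Kokkos::Experimental::cast_to_half;
    using Kokkos::Experimental::cast_from_half;

    auto  h = cast_to_half(2.75f);        // 2.75 is exactly representable in half
    int   i = cast_from_half<int>(h);     // _rz / static_cast both yield 2
    float f = cast_from_half<float>(h);   // 2.75f again, no rounding needed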
+
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
+    cast_from_half(half_t val) {
+#ifdef __CUDA_ARCH__
+  return __half2ull_rz(half_t::impl_type(val));
+#else
+  return static_cast<T>(__half2float(half_t::impl_type(val)));
+#endif
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long>::value, T>
+cast_from_half(half_t val) {
+  return static_cast<T>(cast_from_half<long long>(val));
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned long>::value, T>
+    cast_from_half(half_t val) {
+  return static_cast<T>(cast_from_half<unsigned long long>(val));
+}
+
+#else  // CUDA 11.1 versions follow
+
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(float val) { return __float2half(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(double val) { return __double2half(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(short val) { return __short2half_rn(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned short val) { return __ushort2half_rn(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(int val) { return __int2half_rn(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned int val) { return __uint2half_rn(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(long long val) { return __ll2half_rn(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned long long val) { return __ull2half_rn(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(long val) {
+  return cast_to_half(static_cast<long long>(val));
+}
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned long val) {
+  return cast_to_half(static_cast<unsigned long long>(val));
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, float>::value, T>
+cast_from_half(half_t val) {
+  return __half2float(val);
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, double>::value, T>
+cast_from_half(half_t val) {
+  return __half2double(val);
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, short>::value, T>
+cast_from_half(half_t val) {
+  return __half2short_rz(val);
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned short>::value, T>
+    cast_from_half(half_t val) {
+  return __half2ushort_rz(val);
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, int>::value, T>
+cast_from_half(half_t val) {
+  return __half2int_rz(val);
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, unsigned int>::value, T>
+cast_from_half(half_t val) {
+  return __half2uint_rz(val);
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long long>::value, T>
+cast_from_half(half_t val) {
+  return __half2ll_rz(val);
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
+    cast_from_half(half_t val) {
+  return __half2ull_rz(val);
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long>::value, T>
+cast_from_half(half_t val) {
+  return static_cast<T>(cast_from_half<long long>(val));
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned long>::value, T>
+    cast_from_half(half_t val) {
+  return static_cast<T>(cast_from_half<unsigned long long>(val));
+}
+#endif
+}  // namespace Experimental
+}  // namespace Kokkos
+#endif  // KOKKOS_IMPL_HALF_TYPE_DEFINED
+#endif  // KOKKOS_ENABLE_CUDA
+#endif  // Disables for half_t on cuda:
+        // Clang/8||KEPLER30||KEPLER32||KEPLER37||MAXWELL50||MAXWELL52
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 3e5042a593..b8e8163458 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -132,7 +132,7 @@ int cuda_kernel_arch() {
 bool cuda_launch_blocking() {
   const char *env = getenv("CUDA_LAUNCH_BLOCKING");
 
-  if (env == 0) return false;
+  if (env == nullptr) return false;
 
   return std::stoi(env);
 }
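The nullptr comparisons adopted in this hunk follow the usual safe pattern for integer-valued environment switches: check the result of getenv before handing it to std::stoi, which cannot take a null pointer. A standalone sketch with a hypothetical variable name:

    #include <cstdlib>
    #include <string>

    // Treat an unset MY_SWITCH (hypothetical name) as false; otherwise
    // parse it as an integer and use its truth value.
    bool switch_from_env() {
      const char* env = std::getenv("MY_SWITCH");
      if (env == nullptr) return false;
      return std::stoi(env) != 0;
    }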
getenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC"); bool force_device_alloc; - if (env_force_device_alloc == 0) + if (env_force_device_alloc == nullptr) force_device_alloc = false; else force_device_alloc = std::stoi(env_force_device_alloc) != 0; const char *env_visible_devices = getenv("CUDA_VISIBLE_DEVICES"); bool visible_devices_one = true; - if (env_visible_devices == 0) visible_devices_one = false; + if (env_visible_devices == nullptr) visible_devices_one = false; if (Kokkos::show_warnings() && (!visible_devices_one && !force_device_alloc)) { @@ -893,6 +893,92 @@ const cudaDeviceProp &Cuda::cuda_device_prop() const { return m_space_instance->m_deviceProp; } +namespace Impl { + +int get_gpu(const InitArguments &args); + +int g_cuda_space_factory_initialized = + initialize_space_factory("150_Cuda"); + +void CudaSpaceInitializer::initialize(const InitArguments &args) { + int use_gpu = get_gpu(args); + if (std::is_same::value || + 0 < use_gpu) { + if (use_gpu > -1) { + Kokkos::Cuda::impl_initialize(Kokkos::Cuda::SelectDevice(use_gpu)); + } else { + Kokkos::Cuda::impl_initialize(); + } + } +} + +void CudaSpaceInitializer::finalize(bool all_spaces) { + if ((std::is_same::value || + all_spaces) && + Kokkos::Cuda::impl_is_initialized()) { + Kokkos::Cuda::impl_finalize(); + } +} + +void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); } + +void CudaSpaceInitializer::print_configuration(std::ostream &msg, + const bool detail) { + msg << "Device Execution Space:" << std::endl; + msg << " KOKKOS_ENABLE_CUDA: "; + msg << "yes" << std::endl; + + msg << "Cuda Atomics:" << std::endl; + msg << " KOKKOS_ENABLE_CUDA_ATOMICS: "; +#ifdef KOKKOS_ENABLE_CUDA_ATOMICS + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + + msg << "Cuda Options:" << std::endl; + msg << " KOKKOS_ENABLE_CUDA_LAMBDA: "; +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + msg << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: "; +#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + msg << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: "; +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + msg << " KOKKOS_ENABLE_CUDA_UVM: "; +#ifdef KOKKOS_ENABLE_CUDA_UVM + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + msg << " KOKKOS_ENABLE_CUSPARSE: "; +#ifdef KOKKOS_ENABLE_CUSPARSE + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; +#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + + msg << "\nCuda Runtime Configuration:" << std::endl; + Cuda::print_configuration(msg, detail); +} +} // namespace Impl + } // namespace Kokkos namespace Kokkos { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 56f3f71794..13773d70c5 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -34,7 +34,9 @@ struct CudaTraits { enum : CudaSpace::size_type { KernelArgumentLimit = 0x001000 /* 4k bytes */ }; - + enum : CudaSpace::size_type { + MaxHierarchicalParallelism = 1024 /* team_size * vector_length */ + }; using ConstantGlobalBufferType = unsigned long[ConstantMemoryUsage / sizeof(unsigned long)]; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index dfd179c79c..3ac2edf732 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -48,20 +48,23 @@
 #include 
 
 #ifdef KOKKOS_ENABLE_CUDA
 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-#if defined(__CUDACC__)
-
 /** \brief Access to constant memory on the device */
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
@@ -140,29 +143,85 @@ __global__ __launch_bounds__(
       driver->operator()();
 }
 
-template <class DriverType>
-__global__ static void cuda_parallel_launch_constant_or_global_memory(
-    const DriverType* driver_ptr) {
-  const DriverType& driver =
-      driver_ptr != nullptr
-          ? *driver_ptr
-          : *((const DriverType*)kokkos_impl_cuda_constant_memory_buffer);
+//==============================================================================
+// <editor-fold desc="Some helper functions for launch code readability"> {{{1
 
-  driver();
+inline bool is_empty_launch(dim3 const& grid, dim3 const& block) {
+  return (grid.x == 0) || ((block.x * block.y * block.z) == 0);
 }
 
-template <class DriverType, unsigned int maxTperB, unsigned int minBperSM>
-__global__
-__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_constant_or_global_memory(
-    const DriverType* driver_ptr) {
-  const DriverType& driver =
-      driver_ptr != nullptr
-          ? *driver_ptr
-          : *((const DriverType*)kokkos_impl_cuda_constant_memory_buffer);
-
-  driver();
+inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
+  if (cuda_instance->m_maxShmemPerBlock < shmem) {
+    Kokkos::Impl::throw_runtime_exception(
+        std::string("CudaParallelLaunch (or graph node creation) FAILED: shared"
+                    " memory request is too large"));
+  }
 }
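These helpers factor out the guards that every launch path used to repeat inline: skip degenerate launches, and fail loudly when the dynamic shared-memory request exceeds the device limit. A sketch of the calling pattern used below, with illustrative values and cuda_instance standing in for a CudaInternal pointer obtained elsewhere:

    dim3 grid(128, 1, 1), block(256, 1, 1);
    int shmem = 32 * 1024;  // bytes of dynamic shared memory
    if (!Kokkos::Impl::is_empty_launch(grid, block)) {
      // Throws if shmem exceeds cuda_instance->m_maxShmemPerBlock
      Kokkos::Impl::check_shmem_request(cuda_instance, shmem);
      // ... set the cache preference, then launch or add a graph node ...
    }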
+
+template <class KernelFuncPtr>
+inline void configure_shmem_preference(KernelFuncPtr const& func,
+                                       bool prefer_shmem) {
+#ifndef KOKKOS_ARCH_KEPLER
+  // On Kepler the L1 has no benefit since it doesn't cache reads
+  auto set_cache_config = [&] {
+    CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
+        func,
+        (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
+    return prefer_shmem;
+  };
+  static bool cache_config_preference_cached = set_cache_config();
+  if (cache_config_preference_cached != prefer_shmem) {
+    cache_config_preference_cached = set_cache_config();
+  }
+#else
+  // Use the parameters so we don't get a warning
+  (void)func;
+  (void)prefer_shmem;
+#endif
+}
+
+template <class Policy>
+std::enable_if_t<Policy::experimental_contains_desired_occupancy>
+modify_launch_configuration_if_desired_occupancy_is_specified(
+    Policy const& policy, cudaDeviceProp const& properties,
+    cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
+    bool& prefer_shmem) {
+  int const block_size        = block.x * block.y * block.z;
+  int const desired_occupancy = policy.impl_get_desired_occupancy().value();
+
+  size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
+  size_t const static_shmem           = attributes.sharedSizeBytes;
+
+  // round to nearest integer and avoid division by zero
+  int active_blocks = std::max(
+      1, static_cast<int>(std::round(
+             static_cast<double>(properties.maxThreadsPerMultiProcessor) /
+             block_size * desired_occupancy / 100)));
+  int const dynamic_shmem =
+      shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
+
+  if (dynamic_shmem > shmem) {
+    shmem        = dynamic_shmem;
+    prefer_shmem = false;
+  }
+}
+
+template <class Policy>
+std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
+modify_launch_configuration_if_desired_occupancy_is_specified(
+    Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
+    dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
+
+// end Some helper functions for launch code readability }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="DeduceCudaLaunchMechanism"> {{{2
+
+// Use local memory up to ConstantMemoryUseThreshold
+// Use global memory above ConstantMemoryUsage
+// In between use ConstantMemory
+
 template <class DriverType>
 struct DeduceCudaLaunchMechanism {
   constexpr static const Kokkos::Experimental::WorkItemProperty::
@@ -217,408 +276,362 @@ struct DeduceCudaLaunchMechanism {
                  : Experimental::CudaLaunchMechanism::GlobalMemory)
              : (default_launch_mechanism));
 };
-// Use local memory up to ConstantMemoryUseThreshold
-// Use global memory above ConstantMemoryUsage
-// In between use ConstantMemory
-template <class DriverType, class LaunchBounds = Kokkos::LaunchBounds<>,
-          Experimental::CudaLaunchMechanism LaunchMechanism =
-              DeduceCudaLaunchMechanism<DriverType>::launch_mechanism>
-struct CudaParallelLaunch;
+
+// end DeduceCudaLaunchMechanism }}}2
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="CudaParallelLaunchKernelInvoker"> {{{1
+
+// Base classes that summarize the differences between the different launch
+// mechanisms
+
+template <class DriverType, class LaunchBounds,
+          Experimental::CudaLaunchMechanism LaunchMechanism>
+struct CudaParallelLaunchKernelFunc;
+
+template <class DriverType, class LaunchBounds,
+          Experimental::CudaLaunchMechanism LaunchMechanism>
+struct CudaParallelLaunchKernelInvoker;
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="local memory"> {{{2
+
 template <class DriverType, unsigned int MaxThreadsPerBlock,
           unsigned int MinBlocksPerSM>
-struct CudaParallelLaunch<
-    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-    Experimental::CudaLaunchMechanism::ConstantMemory> {
-  static_assert(sizeof(DriverType) < CudaTraits::ConstantMemoryUsage,
-                "Kokkos Error: Requested CudaLaunchConstantMemory with a "
-                "Functor larger than 32kB.");
-  inline CudaParallelLaunch(const DriverType& driver, const dim3& grid,
-                            const dim3& block, const int shmem,
-                            const CudaInternal* cuda_instance,
-                            const bool prefer_shmem) {
-    if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) {
-      if (cuda_instance->m_maxShmemPerBlock < shmem) {
-
Kokkos::Impl::throw_runtime_exception(std::string( - "CudaParallelLaunch FAILED: shared memory request is too large")); - } -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - else { - static bool cache_config_set = false; - if (!cache_config_set) { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - cuda_parallel_launch_constant_memory< - DriverType, MaxThreadsPerBlock, MinBlocksPerSM>, - (prefer_shmem ? cudaFuncCachePreferShared - : cudaFuncCachePreferL1))); - cache_config_set = true; - } - } -#else - (void)prefer_shmem; -#endif - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - - // Wait until the previous kernel that uses the constant buffer is done - CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable)); - - // Copy functor (synchronously) to staging buffer in pinned host memory - unsigned long* staging = cuda_instance->constantMemHostStaging; - memcpy(staging, &driver, sizeof(DriverType)); - - // Copy functor asynchronously from there to constant memory on the device - cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging, - sizeof(DriverType), 0, cudaMemcpyHostToDevice, - cudaStream_t(cuda_instance->m_stream)); - - // Invoke the driver function on the device - cuda_parallel_launch_constant_memory - <<m_stream>>>(); - - // Record an event that says when the constant buffer can be reused - CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable, - cudaStream_t(cuda_instance->m_stream))); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - CUDA_SAFE_CALL(cudaGetLastError()); - Kokkos::Cuda().fence(); -#endif - } - } - - static cudaFuncAttributes get_cuda_func_attributes() { - // Race condition inside of cudaFuncGetAttributes if the same address is - // given requires using a local variable as input instead of a static Rely - // on static variable initialization to make sure only one thread executes - // the code and the result is visible. - auto wrap_get_attributes = []() -> cudaFuncAttributes { - cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL(cudaFuncGetAttributes( - &attr_tmp, - cuda_parallel_launch_constant_memory)); - return attr_tmp; - }; - static cudaFuncAttributes attr = wrap_get_attributes(); - return attr; - } -}; - -template -struct CudaParallelLaunch, - Experimental::CudaLaunchMechanism::ConstantMemory> { - static_assert(sizeof(DriverType) < CudaTraits::ConstantMemoryUsage, - "Kokkos Error: Requested CudaLaunchConstantMemory with a " - "Functor larger than 32kB."); - inline CudaParallelLaunch(const DriverType& driver, const dim3& grid, - const dim3& block, const int shmem, - const CudaInternal* cuda_instance, - const bool prefer_shmem) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "CudaParallelLaunch FAILED: shared memory request is too large")); - } -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - else { - static bool cache_config_set = false; - if (!cache_config_set) { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - cuda_parallel_launch_constant_memory, - (prefer_shmem ? 
cudaFuncCachePreferShared - : cudaFuncCachePreferL1))); - cache_config_set = true; - } - } -#else - (void)prefer_shmem; -#endif - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - - // Wait until the previous kernel that uses the constant buffer is done - CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable)); - - // Copy functor (synchronously) to staging buffer in pinned host memory - unsigned long* staging = cuda_instance->constantMemHostStaging; - memcpy(staging, &driver, sizeof(DriverType)); - - // Copy functor asynchronously from there to constant memory on the device - cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging, - sizeof(DriverType), 0, cudaMemcpyHostToDevice, - cudaStream_t(cuda_instance->m_stream)); - - // Invoke the driver function on the device - cuda_parallel_launch_constant_memory - <<m_stream>>>(); - - // Record an event that says when the constant buffer can be reused - CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable, - cudaStream_t(cuda_instance->m_stream))); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - CUDA_SAFE_CALL(cudaGetLastError()); - Kokkos::Cuda().fence(); -#endif - } - } - - static cudaFuncAttributes get_cuda_func_attributes() { - // Race condition inside of cudaFuncGetAttributes if the same address is - // given requires using a local variable as input instead of a static Rely - // on static variable initialization to make sure only one thread executes - // the code and the result is visible. - auto wrap_get_attributes = []() -> cudaFuncAttributes { - cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL(cudaFuncGetAttributes( - &attr_tmp, cuda_parallel_launch_constant_memory)); - return attr_tmp; - }; - static cudaFuncAttributes attr = wrap_get_attributes(); - return attr; - } -}; - -template -struct CudaParallelLaunch< +struct CudaParallelLaunchKernelFunc< DriverType, Kokkos::LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory> { - static_assert(sizeof(DriverType) < CudaTraits::KernelArgumentLimit, - "Kokkos Error: Requested CudaLaunchLocalMemory with a Functor " - "larger than 4096 bytes."); - inline CudaParallelLaunch(const DriverType& driver, const dim3& grid, - const dim3& block, const int shmem, - const CudaInternal* cuda_instance, - const bool prefer_shmem) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "CudaParallelLaunch FAILED: shared memory request is too large")); - } -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - else { - static bool cache_config_set = false; - if (!cache_config_set) { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - cuda_parallel_launch_local_memory, - (prefer_shmem ? cudaFuncCachePreferShared - : cudaFuncCachePreferL1))); - cache_config_set = true; - } - } -#else - (void)prefer_shmem; -#endif - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - - // Invoke the driver function on the device - cuda_parallel_launch_local_memory - <<m_stream>>>(driver); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - CUDA_SAFE_CALL(cudaGetLastError()); - Kokkos::Cuda().fence(); -#endif - } - } - - static cudaFuncAttributes get_cuda_func_attributes() { - // Race condition inside of cudaFuncGetAttributes if the same address is - // given requires using a local variable as input instead of a static Rely - // on static variable initialization to make sure only one thread executes - // the code and the result is visible. 
- auto wrap_get_attributes = []() -> cudaFuncAttributes { - cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL(cudaFuncGetAttributes( - &attr_tmp, - cuda_parallel_launch_local_memory)); - return attr_tmp; - }; - static cudaFuncAttributes attr = wrap_get_attributes(); - return attr; + static std::decay_t)> + get_kernel_func() { + return cuda_parallel_launch_local_memory; } }; template -struct CudaParallelLaunch, - Experimental::CudaLaunchMechanism::LocalMemory> { +struct CudaParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds<0, 0>, + Experimental::CudaLaunchMechanism::LocalMemory> { + static std::decay_t)> + get_kernel_func() { + return cuda_parallel_launch_local_memory; + } +}; + +//------------------------------------------------------------------------------ + +template +struct CudaParallelLaunchKernelInvoker< + DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory> + : CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::LocalMemory> { + using base_t = CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory>; static_assert(sizeof(DriverType) < CudaTraits::KernelArgumentLimit, "Kokkos Error: Requested CudaLaunchLocalMemory with a Functor " "larger than 4096 bytes."); - inline CudaParallelLaunch(const DriverType& driver, const dim3& grid, - const dim3& block, const int shmem, - const CudaInternal* cuda_instance, - const bool prefer_shmem) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "CudaParallelLaunch FAILED: shared memory request is too large")); - } -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - else { - static bool cache_config_set = false; - if (!cache_config_set) { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - cuda_parallel_launch_local_memory, - (prefer_shmem ? 
cudaFuncCachePreferShared - : cudaFuncCachePreferL1))); - cache_config_set = true; - } - } -#else - (void)prefer_shmem; -#endif - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); + static void invoke_kernel(DriverType const& driver, dim3 const& grid, + dim3 const& block, int shmem, + CudaInternal const* cuda_instance) { + (base_t:: + get_kernel_func())<<m_stream>>>( + driver); + } - // Invoke the driver function on the device - cuda_parallel_launch_local_memory - <<m_stream>>>(driver); +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + inline static void create_parallel_launch_graph_node( + DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, + CudaInternal const* cuda_instance, bool prefer_shmem) { + //---------------------------------------- + auto const& graph = Impl::get_cuda_graph_from_kernel(driver); + KOKKOS_EXPECTS(bool(graph)); + auto& graph_node = Impl::get_cuda_graph_node_from_kernel(driver); + // Expect node not yet initialized + KOKKOS_EXPECTS(!bool(graph_node)); -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - CUDA_SAFE_CALL(cudaGetLastError()); - Kokkos::Cuda().fence(); -#endif + if (!Impl::is_empty_launch(grid, block)) { + Impl::check_shmem_request(cuda_instance, shmem); + Impl::configure_shmem_preference(base_t::get_kernel_func(), prefer_shmem); + + void const* args[] = {&driver}; + + cudaKernelNodeParams params = {}; + + params.blockDim = block; + params.gridDim = grid; + params.sharedMemBytes = shmem; + params.func = (void*)base_t::get_kernel_func(); + params.kernelParams = (void**)args; + params.extra = nullptr; + + CUDA_SAFE_CALL(cudaGraphAddKernelNode( + &graph_node, graph, /* dependencies = */ nullptr, + /* numDependencies = */ 0, ¶ms)); + } else { + // We still need an empty node for the dependency structure + CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); } + KOKKOS_ENSURES(bool(graph_node)) } - - static cudaFuncAttributes get_cuda_func_attributes() { - // Race condition inside of cudaFuncGetAttributes if the same address is - // given requires using a local variable as input instead of a static Rely - // on static variable initialization to make sure only one thread executes - // the code and the result is visible. 
- auto wrap_get_attributes = []() -> cudaFuncAttributes { - cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL(cudaFuncGetAttributes( - &attr_tmp, cuda_parallel_launch_local_memory)); - return attr_tmp; - }; - static cudaFuncAttributes attr = wrap_get_attributes(); - return attr; - } +#endif }; +// end local memory }}}2 +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// {{{2 + template -struct CudaParallelLaunch< +struct CudaParallelLaunchKernelFunc< DriverType, Kokkos::LaunchBounds, Experimental::CudaLaunchMechanism::GlobalMemory> { - inline CudaParallelLaunch(const DriverType& driver, const dim3& grid, - const dim3& block, const int shmem, - CudaInternal* cuda_instance, - const bool prefer_shmem) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "CudaParallelLaunch FAILED: shared memory request is too large")); - } -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - else { - static bool cache_config_set = false; - if (!cache_config_set) { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - cuda_parallel_launch_global_memory, - (prefer_shmem ? cudaFuncCachePreferShared - : cudaFuncCachePreferL1))); - cache_config_set = true; - } - } -#else - (void)prefer_shmem; -#endif - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - - DriverType* driver_ptr = nullptr; - driver_ptr = reinterpret_cast( - cuda_instance->scratch_functor(sizeof(DriverType))); - cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), - cudaMemcpyDefault, cuda_instance->m_stream); - - // Invoke the driver function on the device - cuda_parallel_launch_global_memory - <<m_stream>>>(driver_ptr); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - CUDA_SAFE_CALL(cudaGetLastError()); - Kokkos::Cuda().fence(); -#endif - } - } - static cudaFuncAttributes get_cuda_func_attributes() { - // Race condition inside of cudaFuncGetAttributes if the same address is - // given requires using a local variable as input instead of a static Rely - // on static variable initialization to make sure only one thread executes - // the code and the result is visible. - auto wrap_get_attributes = []() -> cudaFuncAttributes { - cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL(cudaFuncGetAttributes( - &attr_tmp, - cuda_parallel_launch_global_memory)); - return attr_tmp; - }; - static cudaFuncAttributes attr = wrap_get_attributes(); - return attr; + static void* get_kernel_func() { + return cuda_parallel_launch_global_memory; } }; template -struct CudaParallelLaunch, - Experimental::CudaLaunchMechanism::GlobalMemory> { - inline CudaParallelLaunch(const DriverType& driver, const dim3& grid, - const dim3& block, const int shmem, - CudaInternal* cuda_instance, - const bool prefer_shmem) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "CudaParallelLaunch FAILED: shared memory request is too large")); - } -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - else { - static bool cache_config_set = false; - if (!cache_config_set) { - CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - cuda_parallel_launch_global_memory, - (prefer_shmem ? 
cudaFuncCachePreferShared - : cudaFuncCachePreferL1))); - cache_config_set = true; - } - } -#else - (void)prefer_shmem; -#endif +struct CudaParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds<0, 0>, + Experimental::CudaLaunchMechanism::GlobalMemory> { + static std::decay_t)> + get_kernel_func() { + return cuda_parallel_launch_global_memory; + } +}; - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); +//------------------------------------------------------------------------------ - DriverType* driver_ptr = nullptr; - driver_ptr = reinterpret_cast( - cuda_instance->scratch_functor(sizeof(DriverType))); +template +struct CudaParallelLaunchKernelInvoker< + DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::GlobalMemory> + : CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::GlobalMemory> { + using base_t = CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::GlobalMemory>; + + static void invoke_kernel(DriverType const& driver, dim3 const& grid, + dim3 const& block, int shmem, + CudaInternal const* cuda_instance) { + DriverType* driver_ptr = reinterpret_cast( + cuda_instance->scratch_functor(sizeof(DriverType))); + + cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), cudaMemcpyDefault, + cuda_instance->m_stream); + (base_t:: + get_kernel_func())<<m_stream>>>( + driver_ptr); + } + +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + inline static void create_parallel_launch_graph_node( + DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, + CudaInternal const* cuda_instance, bool prefer_shmem) { + //---------------------------------------- + auto const& graph = Impl::get_cuda_graph_from_kernel(driver); + KOKKOS_EXPECTS(bool(graph)); + auto& graph_node = Impl::get_cuda_graph_node_from_kernel(driver); + // Expect node not yet initialized + KOKKOS_EXPECTS(!bool(graph_node)); + + if (!Impl::is_empty_launch(grid, block)) { + Impl::check_shmem_request(cuda_instance, shmem); + Impl::configure_shmem_preference(base_t::get_kernel_func(), prefer_shmem); + + auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); + + // Unlike in the non-graph case, we can get away with doing an async copy + // here because the `DriverType` instance is held in the GraphNodeImpl + // which is guaranteed to be alive until the graph instance itself is + // destroyed, where there should be a fence ensuring that the allocation + // associated with this kernel on the device side isn't deleted. 
cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), cudaMemcpyDefault,
+                      cuda_instance->m_stream);
-      cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
-                                         MinBlocksPerSM>
-          <<<grid, block, shmem, cuda_instance->m_stream>>>(driver_ptr);
+      void const* args[] = {&driver_ptr};
+
+      cudaKernelNodeParams params = {};
+
+      params.blockDim       = block;
+      params.gridDim        = grid;
+      params.sharedMemBytes = shmem;
+      params.func           = (void*)base_t::get_kernel_func();
+      params.kernelParams   = (void**)args;
+      params.extra          = nullptr;
+
+      CUDA_SAFE_CALL(cudaGraphAddKernelNode(
+          &graph_node, graph, /* dependencies = */ nullptr,
+          /* numDependencies = */ 0, &params));
+    } else {
+      // We still need an empty node for the dependency structure
+      CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
+                                           /* dependencies = */ nullptr,
+                                           /* numDependencies = */ 0));
+    }
+    KOKKOS_ENSURES(bool(graph_node))
+  }
+#endif
+};
+
+// end Global Memory }}}2
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="Constant Memory"> {{{2
+
+template <class DriverType, unsigned int MaxThreadsPerBlock,
+          unsigned int MinBlocksPerSM>
+struct CudaParallelLaunchKernelFunc<
+    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+    Experimental::CudaLaunchMechanism::ConstantMemory> {
+  static std::decay_t<decltype(cuda_parallel_launch_constant_memory<
+                               DriverType, MaxThreadsPerBlock,
+                               MinBlocksPerSM>)>
+  get_kernel_func() {
+    return cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
+                                                MinBlocksPerSM>;
+  }
+};
+
+template <class DriverType>
+struct CudaParallelLaunchKernelFunc<
+    DriverType, Kokkos::LaunchBounds<0, 0>,
+    Experimental::CudaLaunchMechanism::ConstantMemory> {
+  static std::decay_t<
+      decltype(cuda_parallel_launch_constant_memory<DriverType>)>
+  get_kernel_func() {
+    return cuda_parallel_launch_constant_memory<DriverType>;
+  }
+};
+
+//------------------------------------------------------------------------------
+
+template <class DriverType, class LaunchBounds>
+struct CudaParallelLaunchKernelInvoker<
+    DriverType, LaunchBounds,
+    Experimental::CudaLaunchMechanism::ConstantMemory>
+    : CudaParallelLaunchKernelFunc<
+          DriverType, LaunchBounds,
+          Experimental::CudaLaunchMechanism::ConstantMemory> {
+  using base_t = CudaParallelLaunchKernelFunc<
+      DriverType, LaunchBounds,
+      Experimental::CudaLaunchMechanism::ConstantMemory>;
+  static_assert(sizeof(DriverType) < CudaTraits::ConstantMemoryUsage,
+                "Kokkos Error: Requested CudaLaunchConstantMemory with a "
+                "Functor larger than 32kB.");
+
+  static void invoke_kernel(DriverType const& driver, dim3 const& grid,
+                            dim3 const& block, int shmem,
+                            CudaInternal const* cuda_instance) {
+    // Wait until the previous kernel that uses the constant buffer is done
+    CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable));
+
+    // Copy functor (synchronously) to staging buffer in pinned host memory
+    unsigned long* staging = cuda_instance->constantMemHostStaging;
+    memcpy(staging, &driver, sizeof(DriverType));
+
+    // Copy functor asynchronously from there to constant memory on the device
+    cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging,
+                            sizeof(DriverType), 0, cudaMemcpyHostToDevice,
+                            cudaStream_t(cuda_instance->m_stream));
+
+    // Invoke the driver function on the device
+    (base_t::get_kernel_func())<<<grid, block, shmem,
+                                  cuda_instance->m_stream>>>();
+
+    // Record an event that says when the constant buffer can be reused
+    CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable,
+                                   cudaStream_t(cuda_instance->m_stream)));
+  }
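invoke_kernel above serializes all constant-memory launches through one staging buffer and one event. Stripped down to plain CUDA runtime calls, the protocol looks like the sketch below, where staging, reuse_event, and stream are hypothetical stand-ins for the CudaInternal members used above:

    cudaEventSynchronize(reuse_event);          // previous user of the buffer done?
    memcpy(staging, &functor, sizeof functor);  // synchronous copy to pinned host memory
    cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging,
                            sizeof functor, 0, cudaMemcpyHostToDevice, stream);
    kernel<<<grid, block, shmem, stream>>>();   // reads the functor from constant memory
    cudaEventRecord(reuse_event, stream);       // buffer reusable once the kernel is done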
+
+#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
+  inline static void create_parallel_launch_graph_node(
+      DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
+      CudaInternal const* cuda_instance, bool prefer_shmem) {
+    // Just use global memory; coordinating through events to share constant
+    // memory with the non-graph interface is not really reasonable since
+    // events don't work with Graphs directly, and this would anyway require
+    // a much more complicated structure that finds previous nodes in the
+    // dependency structure of the graph and creates an implicit dependence
+    // based on the need for constant memory (which we would then have to
+    // somehow go and prove was not creating a dependency cycle, and I don't
+    // even know if there's an efficient way to do that, let alone in the
+    // structure we currently have).
+    using global_launch_impl_t = CudaParallelLaunchKernelInvoker<
+        DriverType, LaunchBounds,
+        Experimental::CudaLaunchMechanism::GlobalMemory>;
+    global_launch_impl_t::create_parallel_launch_graph_node(
+        driver, grid, block, shmem, cuda_instance, prefer_shmem);
+  }
+#endif
+};
+
+// end Constant Memory }}}2
+//------------------------------------------------------------------------------
+
+// end CudaParallelLaunchKernelInvoker }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="CudaParallelLaunchImpl"> {{{1
+
+template <class DriverType, class LaunchBounds,
+          Experimental::CudaLaunchMechanism LaunchMechanism>
+struct CudaParallelLaunchImpl;
+
+template <class DriverType, unsigned int MaxThreadsPerBlock,
+          unsigned int MinBlocksPerSM,
+          Experimental::CudaLaunchMechanism LaunchMechanism>
+struct CudaParallelLaunchImpl<
+    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+    LaunchMechanism>
+    : CudaParallelLaunchKernelInvoker<
+          DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+          LaunchMechanism> {
+  using base_t = CudaParallelLaunchKernelInvoker<
+      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+      LaunchMechanism>;
+
+  inline static void launch_kernel(const DriverType& driver, const dim3& grid,
+                                   const dim3& block, int shmem,
+                                   const CudaInternal* cuda_instance,
+                                   bool prefer_shmem) {
+    if (!Impl::is_empty_launch(grid, block)) {
+      // Prevent multiple threads to simultaneously set the cache configuration
+      // preference and launch the same kernel
+      static std::mutex mutex;
+      std::lock_guard<std::mutex> lock(mutex);
+
+      Impl::check_shmem_request(cuda_instance, shmem);
+
+      // If a desired occupancy is specified, we compute how much shared memory
+      // to ask for to achieve that occupancy, assuming that the cache
+      // configuration is `cudaFuncCachePreferL1`.  If the amount of dynamic
+      // shared memory computed is actually smaller than `shmem` we overwrite
+      // `shmem` and set `prefer_shmem` to `false`.
+      modify_launch_configuration_if_desired_occupancy_is_specified(
+          driver.get_policy(), cuda_instance->m_deviceProp,
+          get_cuda_func_attributes(), block, shmem, prefer_shmem);
+
+      Impl::configure_shmem_preference(base_t::get_kernel_func(),
+                                       prefer_shmem);
+
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+
+      // Invoke the driver function on the device
+      base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
 
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
       CUDA_SAFE_CALL(cudaGetLastError());
-      Kokkos::Cuda().fence();
+      cuda_instance->fence();
 #endif
+    }
+  }
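The desired-occupancy path in launch_kernel turns a percentage into a dynamic shared-memory request. A worked example with hypothetical numbers (2048 threads per SM, 256-thread blocks, 50% desired occupancy, 96 KiB of shared memory per SM in PreferL1 mode, no static shared memory):

    active_blocks = max(1, round(2048.0 / 256 * 50 / 100));  // = 4 blocks per SM
    dynamic_shmem = 96 * 1024 / active_blocks - 0;           // = 24 KiB per block
    // If 24 KiB exceeds the kernel's own shmem request, shmem is raised to
    // 24 KiB and prefer_shmem is cleared, so the extra shared memory caps
    // residency at the requested occupancy.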
@@ -630,15 +643,63 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
     // the code and the result is visible.
     auto wrap_get_attributes = []() -> cudaFuncAttributes {
       cudaFuncAttributes attr_tmp;
-      CUDA_SAFE_CALL(cudaFuncGetAttributes(
-          &attr_tmp, cuda_parallel_launch_global_memory<DriverType>));
+      CUDA_SAFE_CALL(
+          cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
       return attr_tmp;
     };
     static cudaFuncAttributes attr = wrap_get_attributes();
     return attr;
   }
 };
-//----------------------------------------------------------------------------
+
+// end CudaParallelLaunchImpl }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="CudaParallelLaunch"> {{{1
+
+template <class DriverType, class LaunchBounds = Kokkos::LaunchBounds<>,
+          Experimental::CudaLaunchMechanism LaunchMechanism =
+              DeduceCudaLaunchMechanism<DriverType>::launch_mechanism,
+          bool DoGraph = DriverType::Policy::is_graph_kernel::value
+#ifndef KOKKOS_CUDA_ENABLE_GRAPHS
+                         && false
+#endif
+          >
+struct CudaParallelLaunch;
+
+// General launch mechanism
+template <class DriverType, class LaunchBounds,
+          Experimental::CudaLaunchMechanism LaunchMechanism>
+struct CudaParallelLaunch<DriverType, LaunchBounds, LaunchMechanism,
+                          /* DoGraph = */ false>
+    : CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism> {
+  using base_t =
+      CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism>;
+  template <class... Args>
+  CudaParallelLaunch(Args&&... args) {
+    base_t::launch_kernel((Args &&) args...);
+  }
+};
+
+#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
+// Launch mechanism for creating graph nodes
+template <class DriverType, class LaunchBounds,
+          Experimental::CudaLaunchMechanism LaunchMechanism>
+struct CudaParallelLaunch<DriverType, LaunchBounds, LaunchMechanism,
+                          /* DoGraph = */ true>
+    : CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism> {
+  using base_t =
+      CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism>;
+  template <class... Args>
+  CudaParallelLaunch(Args&&... args) {
+    base_t::create_parallel_launch_graph_node((Args &&) args...);
+  }
+};
+#endif
+
+// end CudaParallelLaunch }}}1
+//==============================================================================
 
 }  // namespace Impl
 }  // namespace Kokkos
@@ -646,6 +707,5 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-#endif /* defined( __CUDACC__ ) */
 #endif /* defined( KOKKOS_ENABLE_CUDA ) */
 #endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
index 07dadb3c16..ff31649544 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
@@ -42,13 +42,10 @@
 //@HEADER
 */
 
-#include 
-
+#include 
 #ifdef KOKKOS_ENABLE_CUDA
-
 #include 
 #include 
-#include 
 
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
 
 namespace Kokkos {
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
index a4b5d08ccf..7640b8084d 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
@@ -81,8 +81,6 @@ void finalize_host_cuda_lock_arrays();
 }  // namespace Impl
 }  // namespace Kokkos
 
-#if defined(__CUDACC__)
-
 namespace Kokkos {
 namespace Impl {
@@ -173,8 +171,6 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
 #endif
 
-#endif /* defined( __CUDACC__ ) */
-
 #endif /* defined( KOKKOS_ENABLE_CUDA ) */
 #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
index 5dd644746b..131d180980 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -46,7 +46,7 @@
 #define KOKKOS_CUDA_PARALLEL_HPP
 
 #include 
-#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA)
+#if defined(KOKKOS_ENABLE_CUDA)
 
 #include 
 #include 
@@ -99,6 +99,8 @@ class TeamPolicyInternal
   int
m_team_scratch_size[2]; int m_thread_scratch_size[2]; int m_chunk_size; + bool m_tune_team; + bool m_tune_vector; public: //! Execution space of this execution policy @@ -115,6 +117,8 @@ class TeamPolicyInternal m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; m_chunk_size = p.m_chunk_size; m_space = p.m_space; + m_tune_team = p.m_tune_team; + m_tune_vector = p.m_tune_vector; } //---------------------------------------- @@ -130,10 +134,10 @@ class TeamPolicyInternal Kokkos::Impl::cuda_get_max_block_size( space().impl_internal_space_instance(), attr, f, - (size_t)vector_length(), + (size_t)impl_vector_length(), (size_t)team_scratch_size(0) + 2 * sizeof(double), (size_t)thread_scratch_size(0) + sizeof(double)); - return block_size / vector_length(); + return block_size / impl_vector_length(); } template @@ -171,10 +175,10 @@ class TeamPolicyInternal Kokkos::Impl::cuda_get_opt_block_size( space().impl_internal_space_instance(), attr, f, - (size_t)vector_length(), + (size_t)impl_vector_length(), (size_t)team_scratch_size(0) + 2 * sizeof(double), (size_t)thread_scratch_size(0) + sizeof(double)); - return block_size / vector_length(); + return block_size / impl_vector_length(); } template @@ -234,9 +238,18 @@ class TeamPolicyInternal //---------------------------------------- - inline int vector_length() const { return m_vector_length; } + KOKKOS_DEPRECATED inline int vector_length() const { + return impl_vector_length(); + } + inline int impl_vector_length() const { return m_vector_length; } inline int team_size() const { return m_team_size; } inline int league_size() const { return m_league_size; } + inline bool impl_auto_team_size() const { return m_tune_team; } + inline bool impl_auto_vector_length() const { return m_tune_vector; } + inline void impl_set_team_size(size_t team_size) { m_team_size = team_size; } + inline void impl_set_vector_length(size_t vector_length) { + m_vector_length = vector_length; + } inline int scratch_size(int level, int team_size_ = -1) const { if (team_size_ < 0) team_size_ = m_team_size; return m_team_scratch_size[level] + @@ -258,18 +271,25 @@ class TeamPolicyInternal m_vector_length(0), m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, - m_chunk_size(32) {} + m_chunk_size(Impl::CudaTraits::WarpSize), + m_tune_team(false), + m_tune_vector(false) {} - /** \brief Specify league size, request team size */ + /** \brief Specify league size, specify team size, specify vector length */ TeamPolicyInternal(const execution_space space_, int league_size_, int team_size_request, int vector_length_request = 1) : m_space(space_), m_league_size(league_size_), m_team_size(team_size_request), - m_vector_length(verify_requested_vector_length(vector_length_request)), + m_vector_length( + (vector_length_request > 0) + ? 
verify_requested_vector_length(vector_length_request) + : verify_requested_vector_length(1)), m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, - m_chunk_size(32) { + m_chunk_size(Impl::CudaTraits::WarpSize), + m_tune_team(bool(team_size_request <= 0)), + m_tune_vector(bool(vector_length_request <= 0)) { // Make sure league size is permissible if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count())) Impl::throw_runtime_exception( @@ -277,72 +297,56 @@ class TeamPolicyInternal "space."); // Make sure total block size is permissible - if (m_team_size * m_vector_length > 1024) { + if (m_team_size * m_vector_length > + int(Impl::CudaTraits::MaxHierarchicalParallelism)) { Impl::throw_runtime_exception( std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. " "Team size x vector length must be smaller than 1024.")); } } - /** \brief Specify league size, request team size */ + /** \brief Specify league size, request team size, specify vector length */ TeamPolicyInternal(const execution_space space_, int league_size_, const Kokkos::AUTO_t& /* team_size_request */ , int vector_length_request = 1) - : m_space(space_), - m_league_size(league_size_), - m_team_size(-1), - m_vector_length(verify_requested_vector_length(vector_length_request)), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(32) { - // Make sure league size is permissible - if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count())) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on Cuda execution " - "space."); - } + : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} + + /** \brief Specify league size, request team size and vector length */ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + ) + : TeamPolicyInternal(space_, league_size_, -1, -1) {} + + /** \brief Specify league size, specify team size, request vector length */ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, const Kokkos::AUTO_t&) + : TeamPolicyInternal(space_, league_size_, team_size_request, -1) {} TeamPolicyInternal(int league_size_, int team_size_request, int vector_length_request = 1) - : m_space(typename traits::execution_space()), - m_league_size(league_size_), - m_team_size(team_size_request), - m_vector_length(verify_requested_vector_length(vector_length_request)), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(32) { - // Make sure league size is permissible - if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count())) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on Cuda execution " - "space."); + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} - // Make sure total block size is permissible - if (m_team_size * m_vector_length > 1024) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. 
" - "Team size x vector length must be smaller than 1024.")); - } - } - - TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */ - , + TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request, int vector_length_request = 1) - : m_space(typename traits::execution_space()), - m_league_size(league_size_), - m_team_size(-1), - m_vector_length(verify_requested_vector_length(vector_length_request)), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(32) { - // Make sure league size is permissible - if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count())) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on Cuda execution " - "space."); - } + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) + + {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request, + const Kokkos::AUTO_t& vector_length_request) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(int league_size_, int team_size_request, + const Kokkos::AUTO_t& vector_length_request) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} inline int chunk_size() const { return m_chunk_size; } @@ -394,7 +398,7 @@ class TeamPolicyInternal get_cuda_func_attributes(); const int block_size = std::forward(block_size_callable)( space().impl_internal_space_instance(), attr, f, - (size_t)vector_length(), + (size_t)impl_vector_length(), (size_t)team_scratch_size(0) + 2 * sizeof(double), (size_t)thread_scratch_size(0) + sizeof(double) + ((functor_value_traits::StaticValueSize != 0) @@ -406,7 +410,7 @@ class TeamPolicyInternal int p2 = 1; while (p2 <= block_size) p2 *= 2; p2 /= 2; - return p2 / vector_length(); + return p2 / impl_vector_length(); } template @@ -468,6 +472,8 @@ class ParallelFor, Kokkos::Cuda> { public: using functor_type = FunctorType; + Policy const& get_policy() const { return m_policy; } + inline __device__ void operator()(void) const { const Member work_stride = blockDim.y * gridDim.x; const Member work_end = m_policy.end(); @@ -518,7 +524,8 @@ class ParallelFor, Kokkos::Cuda> { template class ParallelFor, Kokkos::Cuda> { public: - using Policy = Kokkos::MDRangePolicy; + using Policy = Kokkos::MDRangePolicy; + using functor_type = FunctorType; private: using RP = Policy; @@ -530,10 +537,11 @@ class ParallelFor, Kokkos::Cuda> { const Policy m_rp; public: + Policy const& get_policy() const { return m_rp; } + inline __device__ void operator()(void) const { - Kokkos::Impl::Refactor::DeviceIterateTile( - m_rp, m_functor) + Kokkos::Impl::DeviceIterateTile(m_rp, m_functor) .exec_range(); } @@ -621,8 +629,7 @@ class ParallelFor, Kokkos::Cuda> { *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), false); } else { - printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); - Kokkos::abort("Aborting"); + Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); } } // end execute @@ -636,7 +643,7 @@ template class ParallelFor, Kokkos::Cuda> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; private: using Member = typename Policy::member_type; @@ -680,6 +687,8 @@ class ParallelFor, } public: + Policy const& 
get_policy() const { return m_policy; } + __device__ inline void operator()(void) const { // Iterate this block through the league int64_t threadid = 0; @@ -749,7 +758,7 @@ class ParallelFor, m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.vector_length()) { + m_vector_size(arg_policy.impl_vector_length()) { cudaFuncAttributes attr = CudaParallelLaunch::get_cuda_func_attributes(); @@ -796,10 +805,10 @@ class ParallelFor, if (int(m_team_size) > int(Kokkos::Impl::cuda_get_max_block_size( m_policy.space().impl_internal_space_instance(), attr, - arg_functor, arg_policy.vector_length(), + arg_functor, arg_policy.impl_vector_length(), arg_policy.team_scratch_size(0), arg_policy.thread_scratch_size(0)) / - arg_policy.vector_length())) { + arg_policy.impl_vector_length())) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< Cuda > requested too large team size.")); } @@ -847,6 +856,7 @@ class ParallelReduce, ReducerType, using functor_type = FunctorType; using size_type = Kokkos::Cuda::size_type; using index_type = typename Policy::index_type; + using reducer_type = ReducerType; // Algorithmic constraints: blockSize is a power of two AND blockDim.y == // blockDim.z == 1 @@ -873,6 +883,8 @@ class ParallelReduce, ReducerType, using DummySHMEMReductionType = int; public: + Policy const& get_policy() const { return m_policy; } + // Make the exec_range calls call to Reduce::DeviceIterateTile template __device__ inline @@ -949,36 +961,44 @@ class ParallelReduce, ReducerType, for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { global[i] = shared[i]; } - } else if (cuda_single_inter_block_reduce_scan( - ReducerConditional::select(m_functor, m_reducer), blockIdx.x, - gridDim.x, kokkos_impl_cuda_shared_memory(), - m_scratch_space, m_scratch_flags)) { - // This is the final block with the final result at the final threads' - // location + // return ; + } - size_type* const shared = kokkos_impl_cuda_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = - m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : (m_unified_space ? m_unified_space : m_scratch_space); + if (m_policy.begin() != m_policy.end()) { + { + if (cuda_single_inter_block_reduce_scan( + ReducerConditional::select(m_functor, m_reducer), blockIdx.x, + gridDim.x, kokkos_impl_cuda_shared_memory(), + m_scratch_space, m_scratch_flags)) { + // This is the final block with the final result at the final threads' + // location - if (threadIdx.y == 0) { - Kokkos::Impl::FunctorFinal::final( - ReducerConditional::select(m_functor, m_reducer), shared); - } + size_type* const shared = + kokkos_impl_cuda_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : (m_unified_space ? 
m_unified_space : m_scratch_space); - if (CudaTraits::WarpSize < word_count.value) { - __syncthreads(); - } + if (threadIdx.y == 0) { + Kokkos::Impl::FunctorFinal::final( + ReducerConditional::select(m_functor, m_reducer), shared); + } - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; + if (CudaTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; + i += blockDim.y) { + global[i] = shared[i]; + } + } } } } - /* __device__ inline void run(const DummyShflReductionType&) const { @@ -1055,6 +1075,9 @@ class ParallelReduce, ReducerType, const bool need_device_set = ReduceFunctorHasInit::value || ReduceFunctorHasFinal::value || !m_result_ptr_host_accessible || +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + Policy::is_graph_kernel::value || +#endif !std::is_same::value; if ((nwork > 0) || need_device_set) { const int block_size = local_block_size(m_functor); @@ -1077,6 +1100,7 @@ class ParallelReduce, ReducerType, dim3 grid(std::min(int(block.y), int((nwork + block.y - 1) / block.y)), 1, 1); + // TODO @graph We need to effectively insert this in to the graph const int shmem = UseShflReduction ? 0 @@ -1117,6 +1141,7 @@ class ParallelReduce, ReducerType, } } else { if (m_result_ptr) { + // TODO @graph We need to effectively insert this in to the graph ValueInit::init(ReducerConditional::select(m_functor, m_reducer), m_result_ptr); } @@ -1195,6 +1220,7 @@ class ParallelReduce, ReducerType, using reference_type = typename ValueTraits::reference_type; using functor_type = FunctorType; using size_type = Cuda::size_type; + using reducer_type = ReducerType; // Algorithmic constraints: blockSize is a power of two AND blockDim.y == // blockDim.z == 1 @@ -1214,16 +1240,16 @@ class ParallelReduce, ReducerType, // Shall we use the shfl based reduction or not (only use it for static sized // types of more than 128bit - enum { - UseShflReduction = ((sizeof(value_type) > 2 * sizeof(double)) && - (ValueTraits::StaticValueSize != 0)) - }; + static constexpr bool UseShflReduction = false; + //((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) // Some crutch to do function overloading private: using DummyShflReductionType = double; using DummySHMEMReductionType = int; public: + Policy const& get_policy() const { return m_policy; } + inline __device__ void exec_range(reference_type update) const { Kokkos::Impl::Reduce::DeviceIterateTile, ReducerType, // Required grid.x <= block.y const dim3 grid(std::min(int(block.y), int(nwork)), 1, 1); + // TODO @graph We need to effectively insert this in to the graph const int shmem = UseShflReduction ? 
0 @@ -1403,7 +1430,7 @@ class ParallelReduce, ReducerType, false); // copy to device and execute if (!m_result_ptr_device_accessible) { - Cuda().fence(); + m_policy.space().fence(); if (m_result_ptr) { if (m_unified_space) { @@ -1421,6 +1448,7 @@ class ParallelReduce, ReducerType, } } else { if (m_result_ptr) { + // TODO @graph We need to effectively insert this into the graph ValueInit::init(ReducerConditional::select(m_functor, m_reducer), m_result_ptr); } @@ -1464,7 +1492,7 @@ template class ParallelReduce, ReducerType, Kokkos::Cuda> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; private: using Member = typename Policy::member_type; @@ -1491,8 +1519,11 @@ class ParallelReduce, public: using functor_type = FunctorType; using size_type = Cuda::size_type; + using reducer_type = ReducerType; - enum { UseShflReduction = (true && (ValueTraits::StaticValueSize != 0)) }; + enum : bool { + UseShflReduction = (true && (ValueTraits::StaticValueSize != 0)) + }; private: using DummyShflReductionType = double; @@ -1539,6 +1570,8 @@ class ParallelReduce, } public: + Policy const& get_policy() const { return m_policy; } + __device__ inline void operator()() const { int64_t threadid = 0; if (m_scratch_size[1] > 0) { @@ -1631,31 +1664,35 @@ class ParallelReduce, for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { global[i] = shared[i]; } - } else if (cuda_single_inter_block_reduce_scan( - ReducerConditional::select(m_functor, m_reducer), blockIdx.x, - gridDim.x, kokkos_impl_cuda_shared_memory(), - m_scratch_space, m_scratch_flags)) { - // This is the final block with the final result at the final threads' - // location + } - size_type* const shared = kokkos_impl_cuda_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = - m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : (m_unified_space ? m_unified_space : m_scratch_space); + if (m_league_size != 0) { + if (cuda_single_inter_block_reduce_scan( + ReducerConditional::select(m_functor, m_reducer), blockIdx.x, + gridDim.x, kokkos_impl_cuda_shared_memory(), + m_scratch_space, m_scratch_flags)) { + // This is the final block with the final result at the final threads' + // location - if (threadIdx.y == 0) { - Kokkos::Impl::FunctorFinal::final( - ReducerConditional::select(m_functor, m_reducer), shared); - } + size_type* const shared = kokkos_impl_cuda_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : (m_unified_space ?
m_unified_space : m_scratch_space); - if (CudaTraits::WarpSize < word_count.value) { - __syncthreads(); - } + if (threadIdx.y == 0) { + Kokkos::Impl::FunctorFinal::final( + ReducerConditional::select(m_functor, m_reducer), shared); + } - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; + if (CudaTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } } } } @@ -1717,6 +1754,9 @@ class ParallelReduce, const bool need_device_set = ReduceFunctorHasInit::value || ReduceFunctorHasFinal::value || !m_result_ptr_host_accessible || +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + Policy::is_graph_kernel::value || +#endif !std::is_same::value; if ((nwork > 0) || need_device_set) { const int block_count = @@ -1770,6 +1810,7 @@ class ParallelReduce, } } else { if (m_result_ptr) { + // TODO @graph We need to effectively insert this into the graph ValueInit::init(ReducerConditional::select(m_functor, m_reducer), m_result_ptr); } @@ -1800,7 +1841,7 @@ class ParallelReduce, m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.vector_length()) { + m_vector_size(arg_policy.impl_vector_length()) { cudaFuncAttributes attr = CudaParallelLaunch::get_cuda_func_attributes(); @@ -1838,7 +1879,7 @@ class ParallelReduce, // The global parallel_reduce does not support vector_length other than 1 at // the moment - if ((arg_policy.vector_length() > 1) && !UseShflReduction) + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " "greater than 1 is not currently supported for CUDA for dynamic " @@ -1899,7 +1940,7 @@ class ParallelReduce, m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.vector_length()) { + m_vector_size(arg_policy.impl_vector_length()) { cudaFuncAttributes attr = CudaParallelLaunch::get_cuda_func_attributes(); @@ -1936,7 +1977,7 @@ class ParallelReduce, // The global parallel_reduce does not support vector_length other than 1 at // the moment - if ((arg_policy.vector_length() > 1) && !UseShflReduction) + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " "greater than 1 is not currently supported for CUDA for dynamic " @@ -2150,6 +2191,8 @@ class ParallelScan, Kokkos::Cuda> { } public: + Policy const& get_policy() const { return m_policy; } + //---------------------------------------- __device__ inline void operator()(void) const { @@ -2440,6 +2483,8 @@ class ParallelScanWithTotal, } public: + Policy const& get_policy() const { return m_policy; } + //---------------------------------------- __device__ inline void operator()(void) const { @@ -2799,5 +2844,5 @@ struct ParallelReduceFunctorType { } // namespace Kokkos -#endif /* defined( __CUDACC__ ) */ +#endif /* defined(KOKKOS_ENABLE_CUDA) */ #endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 6989431907..fc9fc3770b 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -46,7 +46,7 @@ #define KOKKOS_CUDA_REDUCESCAN_HPP #include -#if
defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include @@ -983,5 +983,5 @@ inline unsigned cuda_single_inter_block_reduce_scan_shmem( //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#endif /* #if defined( __CUDACC__ ) */ +#endif /* #if defined(KOKKOS_ENABLE_CUDA) */ #endif /* KOKKOS_CUDA_REDUCESCAN_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index 6ead5197ee..2004edbeac 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -390,7 +390,7 @@ class TaskQueueSpecializationConstrained< ((int*)&task_ptr)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[0], 0, 32); ((int*)&task_ptr)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[1], 0, 32); -#if defined(KOKKOS_DEBUG) +#if defined(KOKKOS_ENABLE_DEBUG) KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN("TaskQueue CUDA task_ptr"); #endif @@ -799,7 +799,6 @@ namespace Kokkos { * i=0..N-1. * * The range i=0..N-1 is mapped to all threads of the calling thread team. - * This functionality requires C++11 support. */ template KOKKOS_INLINE_FUNCTION void parallel_for( diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index 1160336519..4b472f5d4f 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -50,7 +50,7 @@ #include /* only compile this file if CUDA is enabled for Kokkos */ -#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include #include @@ -290,7 +290,7 @@ class CudaTeamMember { */ template KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const { - return this->template team_scan(value, 0); + return this->template team_scan(value, nullptr); } //---------------------------------------- @@ -935,6 +935,54 @@ KOKKOS_INLINE_FUNCTION //---------------------------------------------------------------------------- +/** \brief Inter-thread parallel exclusive prefix sum. + * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to each rank in the team (whose global rank is + * less than N) and a scan operation is performed. The last call to closure has + * final == true.
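+ *
+ * As an illustration only (the Views `counts` and `offsets` are assumed
+ * here, not part of this interface), an exclusive offsets computation
+ * would pass a closure of the form:
+ *
+ *   parallel_scan(TeamThreadRange(member, n),
+ *                 [=](const int i, int& partial, const bool final) {
+ *                   // on the final pass, partial holds the exclusive prefix
+ *                   if (final) offsets(i) = partial;
+ *                   partial += counts(i);
+ *                 });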
+ */ +// This is the same code as in HIP and largely the same as in OpenMPTarget +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::TeamThreadRangeBoundariesStruct& + loop_bounds, + const FunctorType& lambda) { + // Extract value_type from lambda + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, + FunctorType>::value_type; + + const auto start = loop_bounds.start; + const auto end = loop_bounds.end; + auto& member = loop_bounds.member; + const auto team_size = member.team_size(); + const auto team_rank = member.team_rank(); + const auto nchunk = (end - start + team_size - 1) / team_size; + value_type accum = 0; + // each team has to process one or more chunks of the prefix scan + for (iType i = 0; i < nchunk; ++i) { + auto ii = start + i * team_size + team_rank; + // local accumulation for this chunk + value_type local_accum = 0; + // user updates value with prefix value + if (ii < loop_bounds.end) lambda(ii, local_accum, false); + // perform team scan + local_accum = member.team_scan(local_accum); + // add this block's accum to total accumulation + auto val = accum + local_accum; + // user updates their data with total accumulation + if (ii < loop_bounds.end) lambda(ii, val, true); + // the last value needs to be propagated to next chunk + if (team_rank == team_size - 1) accum = val; + // broadcast last value to rest of the team + member.team_broadcast(accum, team_size - 1); + } +} + +//---------------------------------------------------------------------------- + /** \brief Intra-thread vector parallel exclusive prefix sum. * * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) @@ -1089,6 +1137,6 @@ KOKKOS_INLINE_FUNCTION void single( } // namespace Kokkos -#endif /* defined( __CUDACC__ ) */ +#endif /* defined(KOKKOS_ENABLE_CUDA) */ #endif /* #ifndef KOKKOS_CUDA_TEAM_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index b7c81b92f8..05876a9f02 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -77,6 +77,8 @@ class ParallelFor, } public: + Policy const& get_policy() const { return m_policy; } + __device__ inline void operator()() const noexcept { if (0 == (threadIdx.y % 16)) { // Spin until COMPLETED_TOKEN.
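A minimal host-side sketch (not part of the patch) of how the team-level parallel_scan added above can be called; the Views counts and offsets, the helper name exclusive_offsets, and the single-team launch are assumptions for illustration:

#include <Kokkos_Core.hpp>

// Compute exclusive offsets from per-element counts with one CUDA team.
// The scan itself chunks [0, n) across the team, as in the implementation above.
void exclusive_offsets(Kokkos::View<int*> counts, Kokkos::View<int*> offsets) {
  using policy_type = Kokkos::TeamPolicy<Kokkos::Cuda>;
  const int n = static_cast<int>(counts.extent(0));
  Kokkos::parallel_for(
      policy_type(1, Kokkos::AUTO),
      KOKKOS_LAMBDA(const policy_type::member_type& member) {
        Kokkos::parallel_scan(
            Kokkos::TeamThreadRange(member, n),
            [=](const int i, int& partial, const bool final) {
              if (final) offsets(i) = partial;  // exclusive prefix sum
              partial += counts(i);
            });
      });
}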
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp index f3cf25efef..c0daa274f8 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp @@ -48,7 +48,7 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- #include -#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include @@ -97,5 +97,5 @@ __device__ inline void cuda_abort(const char *const message) { } // namespace Kokkos #else void KOKKOS_CORE_SRC_CUDA_ABORT_PREVENT_LINK_ERROR() {} -#endif /* #if defined(__CUDACC__) && defined( KOKKOS_ENABLE_CUDA ) */ +#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */ diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp index fea5a55f64..263ba97d73 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp @@ -45,6 +45,10 @@ #ifndef KOKKOS_HIP_ATOMIC_HPP #define KOKKOS_HIP_ATOMIC_HPP +#include +#include +#include + #if defined(KOKKOS_ENABLE_HIP_ATOMICS) namespace Kokkos { // HIP can do: @@ -103,19 +107,16 @@ atomic_exchange(volatile T *const dest, typename std::enable_if::type &val) { - // FIXME_HIP - Kokkos::abort("atomic_exchange not implemented for large types.\n"); T return_val; int done = 0; unsigned int active = __ballot(1); unsigned int done_active = 0; while (active != done_active) { if (!done) { - // if (Impl::lock_address_hip_space((void*)dest)) - { + if (Impl::lock_address_hip_space((void *)dest)) { return_val = *dest; *dest = val; - // Impl::unlock_address_hip_space((void*)dest); + Impl::unlock_address_hip_space((void *)dest); done = 1; } } @@ -215,19 +216,16 @@ __inline__ __device__ T atomic_compare_exchange( typename std::enable_if::type &val) { - // FIXME_HIP - Kokkos::abort("atomic_compare_exchange not implemented for large types.\n"); T return_val; int done = 0; unsigned int active = __ballot(1); unsigned int done_active = 0; while (active != done_active) { if (!done) { - // if (Impl::lock_address_hip_space((void*)dest)) - { + if (Impl::lock_address_hip_space((void *)dest)) { return_val = *dest; if (return_val == compare) *dest = val; - // Impl::unlock_address_hip_space((void*)dest); + Impl::unlock_address_hip_space((void *)dest); done = 1; } } @@ -350,19 +348,16 @@ atomic_fetch_add(volatile T *dest, typename std::enable_if::type val) { - // FIXME_HIP - Kokkos::abort("atomic_fetch_add not implemented for large types.\n"); T return_val; int done = 0; unsigned int active = __ballot(1); unsigned int done_active = 0; while (active != done_active) { if (!done) { - // if(Kokkos::Impl::lock_address_hip_space((void *)dest)) - { + if (Kokkos::Impl::lock_address_hip_space((void *)dest)) { return_val = *dest; *dest = return_val + val; - // Kokkos::Impl::unlock_address_hip_space((void *)dest); + Kokkos::Impl::unlock_address_hip_space((void *)dest); done = 1; } } @@ -513,19 +508,16 @@ atomic_fetch_sub(volatile T *const dest, typename std::enable_if::type &val) { - // FIXME_HIP - Kokkos::abort("atomic_fetch_sub not implemented for large types.\n"); T return_val; int done = 0; unsigned int active = __ballot(1); unsigned int done_active = 0; while (active != done_active) { if (!done) { - /*if (Impl::lock_address_hip_space((void*)dest)) */ - { + if (Impl::lock_address_hip_space((void *)dest)) { return_val 
= *dest; *dest = return_val - val; - // Impl::unlock_address_hip_space((void*)dest); + Impl::unlock_address_hip_space((void *)dest); done = 1; } } @@ -569,6 +561,62 @@ __inline__ __device__ unsigned long long int atomic_fetch_and( unsigned long long int const val) { return atomicAnd(const_cast(dest), val); } + +namespace Impl { + +template +__inline__ __device__ void _atomic_store(T *ptr, T val, + memory_order_relaxed_t) { + (void)atomic_exchange(ptr, val); +} + +template +__inline__ __device__ void _atomic_store(T *ptr, T val, + memory_order_seq_cst_t) { + memory_fence(); + atomic_store(ptr, val, memory_order_relaxed); + memory_fence(); +} + +template +__inline__ __device__ void _atomic_store(T *ptr, T val, + memory_order_release_t) { + memory_fence(); + atomic_store(ptr, val, memory_order_relaxed); +} + +template +__inline__ __device__ void _atomic_store(T *ptr, T val) { + atomic_store(ptr, val, memory_order_relaxed); +} + +template +__inline__ __device__ T _atomic_load(T *ptr, memory_order_relaxed_t) { + T dummy{}; + return atomic_compare_exchange(ptr, dummy, dummy); +} + +template +__inline__ __device__ T _atomic_load(T *ptr, memory_order_seq_cst_t) { + memory_fence(); + T rv = atomic_load(ptr, memory_order_relaxed); + memory_fence(); + return rv; +} + +template +__inline__ __device__ T _atomic_load(T *ptr, memory_order_acquire_t) { + T rv = atomic_load(ptr, memory_order_relaxed); + memory_fence(); + return rv; +} + +template +__inline__ __device__ T _atomic_load(T *ptr) { + return atomic_load(ptr, memory_order_relaxed); +} + +} // namespace Impl } // namespace Kokkos #endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index fc4716d2a8..89135b6c45 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -55,6 +55,26 @@ namespace Kokkos { namespace Experimental { namespace Impl { + +template +void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { + // FIXME_HIP - currently the "constant" path is unimplemented. + // we should look at whether it's functional, and + // perform some simple scaling studies to see when / + // if the constant launcher outperforms the current + // pass by pointer shared launcher + HIP_SAFE_CALL(hipOccupancyMaxActiveBlocksPerMultiprocessor( + numBlocks, + hip_parallel_launch_local_memory, + blockSize, sharedmem)); +} + +template +void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { + hipOccupancy( + numBlocks, blockSize, sharedmem); +} template struct HIPGetMaxBlockSize; @@ -78,31 +98,26 @@ int hip_internal_get_block_size(const F &condition_check, const int min_blocks_per_sm = LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM; const int max_threads_per_block = LaunchBounds::maxTperB == 0 - ? hip_instance->m_maxThreadsPerBlock + ? 
HIPTraits::MaxThreadsPerBlock : LaunchBounds::maxTperB; - const int regs_per_wavefront = attr.numRegs; + const int regs_per_wavefront = std::max(attr.numRegs, 1); const int regs_per_sm = hip_instance->m_regsPerSM; const int shmem_per_sm = hip_instance->m_shmemPerSM; const int max_shmem_per_block = hip_instance->m_maxShmemPerBlock; const int max_blocks_per_sm = hip_instance->m_maxBlocksPerSM; const int max_threads_per_sm = hip_instance->m_maxThreadsPerSM; -// FIXME_HIP this is broken in 3.5, but should be in 3.6 -#if (HIP_VERSION_MAJOR > 3 || HIP_VERSION_MINOR > 5 || \ - HIP_VERSION_PATCH >= 20226) - int block_size = std::min(attr.maxThreadsPerBlock, max_threads_per_block); -#else int block_size = max_threads_per_block; -#endif KOKKOS_ASSERT(block_size > 0); + const int blocks_per_warp = + (block_size + HIPTraits::WarpSize - 1) / HIPTraits::WarpSize; int functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize::value( f, block_size / vector_length); int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) + functor_shmem + attr.sharedSizeBytes; - int max_blocks_regs = - regs_per_sm / (regs_per_wavefront * (block_size / HIPTraits::WarpSize)); + int max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp); int max_blocks_shmem = (total_shmem < max_shmem_per_block) ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs) @@ -113,7 +128,8 @@ int hip_internal_get_block_size(const F &condition_check, blocks_per_sm = max_threads_per_sm / block_size; threads_per_sm = blocks_per_sm * block_size; } - int opt_block_size = (blocks_per_sm >= min_blocks_per_sm) ? block_size : 0; + int opt_block_size = + (blocks_per_sm >= min_blocks_per_sm) ? block_size : min_blocks_per_sm; int opt_threads_per_sm = threads_per_sm; // printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i // Achieved: %i %i Opt: %i %i\n",block_size, @@ -126,8 +142,7 @@ int hip_internal_get_block_size(const F &condition_check, f, block_size / vector_length); total_shmem = shmem_block + shmem_thread * (block_size / vector_length) + functor_shmem + attr.sharedSizeBytes; - max_blocks_regs = - regs_per_sm / (regs_per_wavefront * (block_size / HIPTraits::WarpSize)); + max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp); max_blocks_shmem = (total_shmem < max_shmem_per_block) ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs) @@ -163,28 +178,21 @@ int hip_get_max_block_size(const HIPInternal *hip_instance, [](int x) { return x == 0; }, hip_instance, attr, f, vector_length, shmem_block, shmem_thread); } -template -struct HIPGetMaxBlockSize, true> { +template +struct HIPGetMaxBlockSize { static int get_block_size(typename DriverType::functor_type const &f, size_t const vector_length, size_t const shmem_extra_block, size_t const shmem_extra_thread) { -// FIXME_HIP -- remove this once the API change becomes mature -#if !defined(__HIP__) - using blocktype = unsigned int; -#else - using blocktype = int; -#endif - blocktype numBlocks = 0; - int blockSize = 1024; + int numBlocks = 0; + int blockSize = LaunchBounds::maxTperB == 0 ? 
1024 : LaunchBounds::maxTperB; int sharedmem = shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + ::Kokkos::Impl::FunctorTeamShmemSize< typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, hip_parallel_launch_constant_memory, blockSize, - sharedmem); + + hipOccupancy(&numBlocks, blockSize, sharedmem); if (numBlocks > 0) return blockSize; while (blockSize > HIPTraits::WarpSize && numBlocks == 0) { @@ -195,9 +203,7 @@ struct HIPGetMaxBlockSize, true> { typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, hip_parallel_launch_constant_memory, - blockSize, sharedmem); + hipOccupancy(&numBlocks, blockSize, sharedmem); } int blockSizeUpperBound = blockSize * 2; while (blockSize < blockSizeUpperBound && numBlocks > 0) { @@ -208,9 +214,7 @@ struct HIPGetMaxBlockSize, true> { typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, hip_parallel_launch_constant_memory, - blockSize, sharedmem); + hipOccupancy(&numBlocks, blockSize, sharedmem); } return blockSize - HIPTraits::WarpSize; } @@ -255,7 +259,7 @@ struct HIPGetOptBlockSize, true> { int maxOccupancy = 0; int bestBlockSize = 0; - while (blockSize < 1024) { + while (blockSize < HIPTraits::MaxThreadsPerBlock) { blockSize *= 2; // calculate the occupancy with that optBlockSize and check whether its @@ -265,9 +269,7 @@ struct HIPGetOptBlockSize, true> { ::Kokkos::Impl::FunctorTeamShmemSize< typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, hip_parallel_launch_constant_memory, - blockSize, sharedmem); + hipOccupancy(&numBlocks, blockSize, sharedmem); if (maxOccupancy < numBlocks * blockSize) { maxOccupancy = numBlocks * blockSize; bestBlockSize = blockSize; @@ -289,7 +291,7 @@ struct HIPGetOptBlockSize, false> { int maxOccupancy = 0; int bestBlockSize = 0; - while (blockSize < 1024) { + while (blockSize < HIPTraits::MaxThreadsPerBlock) { blockSize *= 2; sharedmem = shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + @@ -297,9 +299,7 @@ struct HIPGetOptBlockSize, false> { typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, hip_parallel_launch_local_memory, blockSize, - sharedmem); + hipOccupancy(&numBlocks, blockSize, sharedmem); if (maxOccupancy < numBlocks * blockSize) { maxOccupancy = numBlocks * blockSize; @@ -340,11 +340,8 @@ struct HIPGetOptBlockSize< ::Kokkos::Impl::FunctorTeamShmemSize< typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, - hip_parallel_launch_constant_memory, - blockSize, sharedmem); + hipOccupancy( + &numBlocks, blockSize, sharedmem); if (numBlocks >= static_cast(MinBlocksPerSM) && blockSize <= static_cast(MaxThreadsPerBlock)) { if (maxOccupancy < numBlocks * blockSize) { @@ -384,11 +381,8 @@ struct HIPGetOptBlockSize< typename DriverType::functor_type>::value(f, blockSize / vector_length); - hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, - hip_parallel_launch_local_memory, - blockSize, sharedmem); + hipOccupancy( + &numBlocks, blockSize, sharedmem); if (numBlocks >= int(MinBlocksPerSM) && blockSize <= int(MaxThreadsPerBlock)) { if (maxOccupancy < numBlocks * blockSize) { diff --git 
a/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp index 2abded0e99..b3480bcad0 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp @@ -56,10 +56,10 @@ namespace Kokkos { namespace Impl { void hip_internal_error_throw(hipError_t e, const char* name, - const char* file = NULL, const int line = 0); + const char* file = nullptr, const int line = 0); inline void hip_internal_safe_call(hipError_t e, const char* name, - const char* file = NULL, + const char* file = nullptr, const int line = 0) { if (hipSuccess != e) { hip_internal_error_throw(e, name, file, line); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index 20af48bf6f..45512038ac 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -114,7 +114,7 @@ void HIPInternal::print_configuration(std::ostream &s) const { << (dev_info.m_hipProp[i].major) << "." << dev_info.m_hipProp[i].minor << ", Total Global Memory: " << ::Kokkos::Impl::human_memory_size(dev_info.m_hipProp[i].totalGlobalMem) - << ", Shared Memory per Wavefront: " + << ", Shared Memory per Block: " << ::Kokkos::Impl::human_memory_size( dev_info.m_hipProp[i].sharedMemPerBlock); if (m_hipDev == i) s << " : Selected"; @@ -140,10 +140,10 @@ HIPInternal::~HIPInternal() { m_maxShmemPerBlock = 0; m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; - m_scratchSpace = 0; - m_scratchFlags = 0; + m_scratchSpace = nullptr; + m_scratchFlags = nullptr; m_scratchConcurrentBitset = nullptr; - m_stream = 0; + m_stream = nullptr; } int HIPInternal::verify_is_initialized(const char *const label) const { @@ -183,7 +183,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { const HIPInternalDevices &dev_info = HIPInternalDevices::singleton(); - const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags; + const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags; // Need at least a GPU device const bool ok_id = @@ -195,9 +195,11 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { m_hipDev = hip_device_id; m_deviceProp = hipProp; - hipSetDevice(m_hipDev); + HIP_SAFE_CALL(hipSetDevice(m_hipDev)); - m_stream = stream; + m_stream = stream; + m_team_scratch_current_size = 0; + m_team_scratch_ptr = nullptr; // number of multiprocessors m_multiProcCount = hipProp.multiProcessorCount; @@ -216,14 +218,19 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { m_maxBlock = hipProp.maxGridSize[0]; // theoretically, we can get 40 WF's / CU, but only can sustain 32 + // see + // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742 m_maxBlocksPerSM = 32; // FIXME_HIP - Nick to implement this upstream - m_regsPerSM = 262144 / 32; - m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; - m_maxShmemPerBlock = hipProp.sharedMemPerBlock; - m_maxThreadsPerSM = m_maxBlocksPerSM * HIPTraits::WarpSize; - m_maxThreadsPerBlock = hipProp.maxThreadsPerBlock; - + // Register count comes from Sec. 2.2. 
"Data Sharing" of the + // Vega 7nm ISA document (see the diagram) + // https://developer.amd.com/wp-content/resources/Vega_7nm_Shader_ISA.pdf + // VGPRS = 4 (SIMD/CU) * 256 VGPR/SIMD * 64 registers / VGPR = + // 65536 VGPR/CU + m_regsPerSM = 65536; + m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; + m_maxShmemPerBlock = hipProp.sharedMemPerBlock; + m_maxThreadsPerSM = m_maxBlocksPerSM * HIPTraits::WarpSize; //---------------------------------- // Multiblock reduction uses scratch flags for counters // and scratch space for partial reduction values. @@ -277,8 +284,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { } // Init the array for used for arbitrarily sized atomics - // FIXME_HIP uncomment this when global variable works - // if (m_stream == 0) ::Kokkos::Impl::initialize_host_hip_lock_arrays(); + if (m_stream == nullptr) ::Kokkos::Impl::initialize_host_hip_lock_arrays(); } //---------------------------------------------------------------------------- @@ -327,18 +333,35 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags( m_scratchFlags = reinterpret_cast(r->data()); - hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain); + HIP_SAFE_CALL( + hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain)); } return m_scratchFlags; } +void *HIPInternal::resize_team_scratch_space(std::int64_t bytes, + bool force_shrink) { + if (m_team_scratch_current_size == 0) { + m_team_scratch_current_size = bytes; + m_team_scratch_ptr = Kokkos::kokkos_malloc( + "HIPSpace::ScratchMemory", m_team_scratch_current_size); + } + if ((bytes > m_team_scratch_current_size) || + ((bytes < m_team_scratch_current_size) && (force_shrink))) { + m_team_scratch_current_size = bytes; + m_team_scratch_ptr = Kokkos::kokkos_realloc( + m_team_scratch_ptr, m_team_scratch_current_size); + } + return m_team_scratch_ptr; +} + //---------------------------------------------------------------------------- void HIPInternal::finalize() { - HIP().fence(); + this->fence(); was_finalized = true; - if (0 != m_scratchSpace || 0 != m_scratchFlags) { + if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { using RecordHIP = Kokkos::Impl::SharedAllocationRecord; @@ -346,19 +369,24 @@ void HIPInternal::finalize() { RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace)); RecordHIP::decrement(RecordHIP::get_record(m_scratchConcurrentBitset)); - m_hipDev = -1; - m_hipArch = -1; - m_multiProcCount = 0; - m_maxWarpCount = 0; - m_maxBlock = 0; - m_maxSharedWords = 0; - m_maxShmemPerBlock = 0; - m_scratchSpaceCount = 0; - m_scratchFlagsCount = 0; - m_scratchSpace = 0; - m_scratchFlags = 0; - m_scratchConcurrentBitset = nullptr; - m_stream = 0; + if (m_team_scratch_current_size > 0) + Kokkos::kokkos_free(m_team_scratch_ptr); + + m_hipDev = -1; + m_hipArch = -1; + m_multiProcCount = 0; + m_maxWarpCount = 0; + m_maxBlock = 0; + m_maxSharedWords = 0; + m_maxShmemPerBlock = 0; + m_scratchSpaceCount = 0; + m_scratchFlagsCount = 0; + m_scratchSpace = nullptr; + m_scratchFlags = nullptr; + m_scratchConcurrentBitset = nullptr; + m_stream = nullptr; + m_team_scratch_current_size = 0; + m_team_scratch_ptr = nullptr; } } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 9688aef350..07ec8625e6 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -57,6 +57,8 @@ struct HIPTraits { static int constexpr WarpSize = 64; static int constexpr 
WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static int constexpr WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ + static int constexpr MaxThreadsPerBlock = + 1024; // FIXME_HIP -- assumed constant for now static int constexpr ConstantMemoryUsage = 0x008000; /* 32k bytes */ static int constexpr ConstantMemoryUseThreshold = 0x000200; /* 512 bytes */ @@ -92,9 +94,11 @@ class HIPInternal { int m_shmemPerSM; int m_maxShmemPerBlock; int m_maxThreadsPerSM; - int m_maxThreadsPerBlock; + + // Scratch Spaces for Reductions size_type m_scratchSpaceCount; size_type m_scratchFlagsCount; + size_type *m_scratchSpace; size_type *m_scratchFlags; uint32_t *m_scratchConcurrentBitset = nullptr; @@ -103,6 +107,10 @@ class HIPInternal { hipStream_t m_stream; + // Team Scratch Level 1 Space + mutable int64_t m_team_scratch_current_size; + mutable void *m_team_scratch_ptr; + bool was_finalized = false; static HIPInternal &singleton(); @@ -113,7 +121,7 @@ class HIPInternal { return m_hipDev >= 0; } // 0 != m_scratchSpace && 0 != m_scratchFlags ; } - void initialize(int hip_device_id, hipStream_t stream = 0); + void initialize(int hip_device_id, hipStream_t stream = nullptr); void finalize(); void print_configuration(std::ostream &) const; @@ -132,15 +140,21 @@ class HIPInternal { m_shmemPerSM(0), m_maxShmemPerBlock(0), m_maxThreadsPerSM(0), - m_maxThreadsPerBlock(0), m_scratchSpaceCount(0), m_scratchFlagsCount(0), - m_scratchSpace(0), - m_scratchFlags(0), - m_stream(0) {} + m_scratchSpace(nullptr), + m_scratchFlags(nullptr), + m_stream(nullptr), + m_team_scratch_current_size(0), + m_team_scratch_ptr(nullptr) {} + // Resizing of reduction related scratch spaces size_type *scratch_space(const size_type size); size_type *scratch_flags(const size_type size); + + // Resizing of team level 1 scratch + void *resize_team_scratch_space(std::int64_t bytes, + bool force_shrink = false); }; } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 34ccd899c3..3e972c7346 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -64,7 +64,7 @@ namespace Kokkos { namespace Experimental { template inline __device__ T *kokkos_impl_hip_shared_memory() { - extern __shared__ HIPSpace::size_type sh[]; + HIP_DYNAMIC_SHARED(HIPSpace::size_type, sh); return (T *)sh; } } // namespace Experimental @@ -74,18 +74,17 @@ namespace Kokkos { namespace Experimental { namespace Impl { -void *hip_resize_scratch_space(std::int64_t bytes, bool force_shrink = false); - template __global__ static void hip_parallel_launch_constant_memory() { -// cannot use global constants in HCC -#ifdef __HCC__ - __device__ __constant__ unsigned long kokkos_impl_hip_constant_memory_buffer - [Kokkos::Experimental::Impl::HIPTraits::ConstantMemoryUsage / - sizeof(unsigned long)]; -#endif + const DriverType &driver = *(reinterpret_cast( + kokkos_impl_hip_constant_memory_buffer)); + driver(); +} - const DriverType *const driver = (reinterpret_cast( +template +__global__ __launch_bounds__( + maxTperB, minBperSM) static void hip_parallel_launch_constant_memory() { + const DriverType &driver = *(reinterpret_cast( kokkos_impl_hip_constant_memory_buffer)); driver->operator()(); @@ -147,6 +146,8 @@ struct HIPParallelLaunch< "HIPParallelLaunch FAILED: shared memory request is too large"); } + KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); + // FIXME_HIP -- there is currently an error copying (some) structs // by value to 
the device in HIP-Clang / VDI // As a workaround, we can malloc the DriverType and explicitly copy over. } static hipFuncAttributes get_hip_func_attributes() { - hipFuncAttributes attr; - hipFuncGetAttributes( - &attr, - reinterpret_cast( - hip_parallel_launch_local_memory)); + static hipFuncAttributes attr = []() { + hipFuncAttributes attr; + HIP_SAFE_CALL(hipFuncGetAttributes( + &attr, + reinterpret_cast( + hip_parallel_launch_local_memory))); + return attr; + }(); return attr; } }; @@ -192,6 +196,8 @@ struct HIPParallelLaunch, "HIPParallelLaunch FAILED: shared memory request is too large")); } + KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); + // Invoke the driver function on the device // FIXME_HIP -- see note about struct copy by value above @@ -212,10 +218,13 @@ struct HIPParallelLaunch, } static hipFuncAttributes get_hip_func_attributes() { - hipFuncAttributes attr; - hipFuncGetAttributes( - &attr, reinterpret_cast( - &hip_parallel_launch_local_memory)); + static hipFuncAttributes attr = []() { + hipFuncAttributes attr; + HIP_SAFE_CALL(hipFuncGetAttributes( + &attr, reinterpret_cast( + hip_parallel_launch_local_memory))); + return attr; + }(); return attr; } }; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp index 3426caafda..4f5271b6f6 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp @@ -52,26 +52,28 @@ #include +namespace Kokkos { + #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE +namespace Impl { __device__ __constant__ HIPLockArrays g_device_hip_lock_arrays = {nullptr, nullptr, 0}; +} #endif -namespace Kokkos { - namespace { __global__ void init_lock_array_kernel_atomic() { unsigned i = blockIdx.x * blockDim.x + threadIdx.x; if (i < KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1) { - g_device_hip_lock_arrays.atomic[i] = 0; + Kokkos::Impl::g_device_hip_lock_arrays.atomic[i] = 0; } } __global__ void init_lock_array_kernel_threadid(int N) { unsigned i = blockIdx.x * blockDim.x + threadIdx.x; if (i < static_cast(N)) { - g_device_hip_lock_arrays.scratch[i] = 0; + Kokkos::Impl::g_device_hip_lock_arrays.scratch[i] = 0; } } @@ -94,17 +96,17 @@ void initialize_host_hip_lock_arrays() { KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE(); init_lock_array_kernel_atomic<<< - (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256, 0, 0>>>(); + (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256, 0, nullptr>>>(); init_lock_array_kernel_threadid<<< - (::Kokkos::Experimental::HIP::concurrency() + 255) / 256, 256, 0, 0>>>( - ::Kokkos::Experimental::HIP::concurrency()); + (::Kokkos::Experimental::HIP::concurrency() + 255) / 256, 256, 0, + nullptr>>>(::Kokkos::Experimental::HIP::concurrency()); } void finalize_host_hip_lock_arrays() { if (g_host_hip_lock_arrays.atomic == nullptr) return; - hipFree(g_host_hip_lock_arrays.atomic); + HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic)); g_host_hip_lock_arrays.atomic = nullptr; - hipFree(g_host_hip_lock_arrays.scratch); + HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch)); g_host_hip_lock_arrays.scratch = nullptr; g_host_hip_lock_arrays.n = 0; #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp index fb6728ea14..f34f85f43b 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp @@ -51,7 +51,8 @@ #include -// FIXME_HIP We cannot use global variables
defined in a namespace +namespace Kokkos { +namespace Impl { struct HIPLockArrays { std::int32_t* atomic; @@ -63,9 +64,6 @@ struct HIPLockArrays { /// of these arrays. extern HIPLockArrays g_host_hip_lock_arrays; -namespace Kokkos { -namespace Impl { - /// \brief After this call, the g_host_hip_lock_arrays variable has /// valid, initialized arrays. /// @@ -78,9 +76,6 @@ void initialize_host_hip_lock_arrays(); /// This call is idempotent. void finalize_host_hip_lock_arrays(); -} // namespace Impl -} // namespace Kokkos - #if defined(__HIPCC__) /// \brief This global variable in HIP space is what kernels use @@ -108,9 +103,6 @@ __device__ #define KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK 0x1FFFF -namespace Kokkos { -namespace Impl { - /// \brief Acquire a lock for the address /// /// This function tries to acquire the lock for the hash value derived @@ -152,14 +144,15 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } /* Dan Ibanez: it is critical that this code be a macro, so that it will capture the right address for g_device_hip_lock_arrays! putting this in an inline function will NOT do the right thing! */ -#define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() \ - { \ - if (::Kokkos::Impl::lock_array_copied == 0) { \ - HIP_SAFE_CALL(hipMemcpyToSymbol(HIP_SYMBOL(g_device_hip_lock_arrays), \ - &g_host_hip_lock_arrays, \ - sizeof(HIPLockArrays))); \ - } \ - lock_array_copied = 1; \ +#define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() \ + { \ + if (::Kokkos::Impl::lock_array_copied == 0) { \ + HIP_SAFE_CALL(hipMemcpyToSymbol( \ + HIP_SYMBOL(::Kokkos::Impl::g_device_hip_lock_arrays), \ + &::Kokkos::Impl::g_host_hip_lock_arrays, \ + sizeof(::Kokkos::Impl::HIPLockArrays))); \ + } \ + ::Kokkos::Impl::lock_array_copied = 1; \ } #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp index c3acc0622d..6b831ff7a3 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -118,9 +118,9 @@ class ParallelFor, dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], m_policy.m_tile[2], m_policy.m_tile[3]); dim3 const grid( - std::min(static_cast(m_policy.m_tile_end[0] * - m_policy.m_tile_end[1]), - static_cast(maxblocks)), + std::min(static_cast(m_policy.m_tile_end[0] * + m_policy.m_tile_end[1]), + static_cast(maxblocks)), std::min((m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / block.y, maxblocks), @@ -168,8 +168,7 @@ class ParallelFor, *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } else { - printf("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); - Kokkos::abort("Aborting"); + Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); } } // end execute @@ -227,17 +226,6 @@ class ParallelReduce, ReducerType, using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile< Policy::rank, Policy, FunctorType, WorkTag, reference_type>; - // Shall we use the shfl based reduction or not (only use it for static sized - // types of more than 128bit - enum { - UseShflReduction = ((sizeof(value_type) > 2 * sizeof(double)) && - (ValueTraits::StaticValueSize != 0)) - }; - // Some crutch to do function overloading - private: - using DummyShflReductionType = double; - using DummySHMEMReductionType = int; - public: inline __device__ void exec_range(reference_type update) const { DeviceIteratePattern(m_policy, m_functor, 
update).exec_range(); @@ -299,7 +287,8 @@ class ParallelReduce, ReducerType, // Determine block size constrained by shared memory: // This is copy/paste from Kokkos_HIP_Parallel_Range inline unsigned local_block_size(const FunctorType& f) { - unsigned n = Experimental::Impl::HIPTraits::WarpSize * 8; + unsigned int n = + ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< false, FunctorType, WorkTag>(f, n); while ( @@ -343,13 +332,13 @@ class ParallelReduce, ReducerType, // REQUIRED ( 1 , N , 1 ) const dim3 block(1, block_size, 1); // Required grid.x <= block.y - const dim3 grid(std::min(int(block.y), int(nwork)), 1, 1); + const dim3 grid(std::min(static_cast(block.y), + static_cast(nwork)), + 1, 1); const int shmem = - UseShflReduction - ? 0 - : ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< - false, FunctorType, WorkTag>(m_functor, block.y); + ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< + false, FunctorType, WorkTag>(m_functor, block.y); Kokkos::Experimental::Impl::HIPParallelLaunch( @@ -358,7 +347,7 @@ class ParallelReduce, ReducerType, false); // copy to device and execute if (!m_result_ptr_device_accessible) { - Experimental::HIP().fence(); + m_policy.space().fence(); if (m_result_ptr) { const int size = ValueTraits::value_size( @@ -379,7 +368,7 @@ class ParallelReduce, ReducerType, ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ViewType& arg_result, typename std::enable_if::value, - void*>::type = NULL) + void*>::type = nullptr) : m_functor(arg_functor), m_policy(arg_policy), m_reducer(InvalidType()), @@ -387,8 +376,8 @@ class ParallelReduce, ReducerType, m_result_ptr_device_accessible( MemorySpaceAccess::accessible), - m_scratch_space(0), - m_scratch_flags(0) {} + m_scratch_space(nullptr), + m_scratch_flags(nullptr) {} ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ReducerType& reducer) @@ -400,8 +389,8 @@ class ParallelReduce, ReducerType, MemorySpaceAccess::accessible), - m_scratch_space(0), - m_scratch_flags(0) {} + m_scratch_space(nullptr), + m_scratch_flags(nullptr) {} }; } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index 6e75e1857f..5607f1c91a 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -108,7 +108,11 @@ class ParallelFor, inline void execute() const { const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); - const int block_size = 256; // FIXME_HIP Choose block_size better + const int block_size = + LaunchBounds::maxTperB + ? 
LaunchBounds::maxTperB + : ::Kokkos::Experimental::Impl::HIPTraits:: + MaxThreadsPerBlock; // FIXME_HIP Choose block_size better const dim3 block(1, block_size, 1); const dim3 grid( typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); @@ -321,8 +325,8 @@ class ParallelReduce, ReducerType, // Determine block size constrained by shared memory: inline unsigned local_block_size(const FunctorType& f) { - // FIXME_HIP I don't know where 8 comes from - unsigned int n = ::Kokkos::Experimental::Impl::HIPTraits::WarpSize * 8; + unsigned int n = + ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; int shmem_size = hip_single_inter_block_reduce_scan_shmem( f, n); @@ -406,7 +410,7 @@ class ParallelReduce, ReducerType, ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ViewType& arg_result, typename std::enable_if::value, - void*>::type = NULL) + void*>::type = nullptr) : m_functor(arg_functor), m_policy(arg_policy), m_reducer(InvalidType()), diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index 56b07f6710..5da83d289e 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -77,6 +77,8 @@ class TeamPolicyInternal int m_team_scratch_size[2]; int m_thread_scratch_size[2]; int m_chunk_size; + bool m_tune_team_size; + bool m_tune_vector_length; public: using execution_space = Kokkos::Experimental::HIP; @@ -92,6 +94,8 @@ class TeamPolicyInternal m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; m_chunk_size = p.m_chunk_size; m_space = p.m_space; + m_tune_team_size = p.m_tune_team_size; + m_tune_vector_length = p.m_tune_vector_length; } template @@ -104,10 +108,10 @@ class TeamPolicyInternal int const block_size = ::Kokkos::Experimental::Impl::hip_get_max_block_size< FunctorType, typename traits::launch_bounds>( space().impl_internal_space_instance(), attr, f, - static_cast(vector_length()), + static_cast(impl_vector_length()), static_cast(team_scratch_size(0)) + 2 * sizeof(double), static_cast(thread_scratch_size(0)) + sizeof(double)); - return block_size / vector_length(); + return block_size / impl_vector_length(); } template @@ -144,10 +148,10 @@ class TeamPolicyInternal int const block_size = ::Kokkos::Experimental::Impl::hip_get_opt_block_size< FunctorType, typename traits::launch_bounds>( space().impl_internal_space_instance(), attr, f, - static_cast(vector_length()), + static_cast(impl_vector_length()), static_cast(team_scratch_size(0)) + 2 * sizeof(double), static_cast(thread_scratch_size(0)) + sizeof(double)); - return block_size / vector_length(); + return block_size / impl_vector_length(); } template @@ -173,7 +177,8 @@ class TeamPolicyInternal ReducerType>; return internal_team_size_recommended(f); } - + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline bool impl_auto_team_size() const { return m_tune_team_size; } static int vector_length_max() { return ::Kokkos::Experimental::Impl::HIPTraits::WarpSize; } @@ -203,8 +208,10 @@ class TeamPolicyInternal level == 0 ? 
1024 * 40 : // FIXME_HIP arbitrarily setting this to 48kB 20 * 1024 * 1024); // FIXME_HIP arbitrarily setting this to 20MB } - - int vector_length() const { return m_vector_length; } + inline void impl_set_vector_length(size_t size) { m_vector_length = size; } + inline void impl_set_team_size(size_t size) { m_team_size = size; } + int impl_vector_length() const { return m_vector_length; } + KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); } int team_size() const { return m_team_size; } @@ -231,7 +238,9 @@ class TeamPolicyInternal m_vector_length(0), m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, - m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize) {} + m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize), + m_tune_team_size(false), + m_tune_vector_length(false) {} /** \brief Specify league size, request team size */ TeamPolicyInternal(const execution_space space_, int league_size_, @@ -239,11 +248,16 @@ class TeamPolicyInternal : m_space(space_), m_league_size(league_size_), m_team_size(team_size_request), - m_vector_length(verify_requested_vector_length(vector_length_request)), + m_vector_length( + (vector_length_request > 0) + ? verify_requested_vector_length(vector_length_request) + : (verify_requested_vector_length(1))), m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, - m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize) { - // Make sure league size is permissable + m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize), + m_tune_team_size(bool(team_size_request <= 0)), + m_tune_vector_length(bool(vector_length_request <= 0)) { + // Make sure league size is permissible if (league_size_ >= static_cast( ::Kokkos::Experimental::Impl::hip_internal_maximum_grid_count())) @@ -251,7 +265,7 @@ class TeamPolicyInternal "Requested too large league_size for TeamPolicy on HIP execution " "space."); - // Make sure total block size is permissable + // Make sure total block size is permissible if (m_team_size * m_vector_length > 1024) { Impl::throw_runtime_exception( std::string("Kokkos::TeamPolicy< HIP > the team size is too large. 
" @@ -263,65 +277,56 @@ class TeamPolicyInternal TeamPolicyInternal(const execution_space space_, int league_size_, const Kokkos::AUTO_t& /* team_size_request */, int vector_length_request = 1) - : m_space(space_), - m_league_size(league_size_), - m_team_size(-1), - m_vector_length(verify_requested_vector_length(vector_length_request)), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize) { - // Make sure league size is permissable - if (league_size_ >= - static_cast( - ::Kokkos::Experimental::Impl::hip_internal_maximum_grid_count())) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on HIP execution " - "space."); - } + : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} + // FLAG + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + ) + : TeamPolicyInternal(space_, league_size_, team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(space_, league_size_, -1, -1) + + {} TeamPolicyInternal(int league_size_, int team_size_request, int vector_length_request = 1) - : m_space(typename traits::execution_space()), - m_league_size(league_size_), - m_team_size(team_size_request), - m_vector_length(verify_requested_vector_length(vector_length_request)), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize) { - // Make sure league size is permissable - if (league_size_ >= - static_cast( - ::Kokkos::Experimental::Impl::hip_internal_maximum_grid_count())) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on HIP execution " - "space."); - - // Make sure total block size is permissable - if (m_team_size * m_vector_length > 1024) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy< HIP > the team size is too large. 
" - "Team size x vector length must be smaller than 1024.")); - } - } + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& /* team_size_request */, int vector_length_request = 1) - : m_space(typename traits::execution_space()), - m_league_size(league_size_), - m_team_size(-1), - m_vector_length(verify_requested_vector_length(vector_length_request)), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize) { - // Make sure league size is permissable - if (league_size_ >= - static_cast( - ::Kokkos::Experimental::Impl::hip_internal_maximum_grid_count())) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on HIP execution " - "space."); - } + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + vector_length_request) {} + + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(int league_size_, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + -1) {} int chunk_size() const { return m_chunk_size; } @@ -370,7 +375,7 @@ class TeamPolicyInternal typename traits::launch_bounds>::get_hip_func_attributes(); const int block_size = std::forward(block_size_callable)( space().impl_internal_space_instance(), attr, f, - static_cast(vector_length()), + static_cast(impl_vector_length()), static_cast(team_scratch_size(0)) + 2 * sizeof(double), static_cast(thread_scratch_size(0)) + sizeof(double) + ((functor_value_traits::StaticValueSize != 0) @@ -382,7 +387,7 @@ class TeamPolicyInternal int p2 = 1; while (p2 <= block_size) p2 *= 2; p2 /= 2; - return p2 / vector_length(); + return p2 / impl_vector_length(); } template @@ -400,12 +405,6 @@ class TeamPolicyInternal } }; -struct HIPLockArrays { - std::int32_t* atomic = nullptr; - std::int32_t* scratch = nullptr; - std::int32_t n = 0; -}; - template class ParallelFor, Kokkos::Experimental::HIP> { @@ -434,7 +433,6 @@ class ParallelFor, int m_shmem_size; void* m_scratch_ptr[2]; int m_scratch_size[2]; - mutable HIPLockArrays hip_lock_arrays; template __device__ inline @@ -458,15 +456,19 @@ class ParallelFor, __shared__ int64_t base_thread_id; if (threadIdx.x == 0 && threadIdx.y == 0) { threadid = (blockIdx.x * blockDim.z + threadIdx.z) % - (hip_lock_arrays.n / (blockDim.x * blockDim.y)); + (Kokkos::Impl::g_device_hip_lock_arrays.n / + (blockDim.x * blockDim.y)); threadid *= blockDim.x * blockDim.y; int done = 0; while (!done) { - done = (0 == atomicCAS(&hip_lock_arrays.scratch[threadid], 0, 1)); + done = (0 == + atomicCAS( + &Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid], + 0, 1)); if (!done) { threadid += blockDim.x * blockDim.y; if (int64_t(threadid + blockDim.x * blockDim.y) >= - int64_t(hip_lock_arrays.n)) + int64_t(Kokkos::Impl::g_device_hip_lock_arrays.n)) threadid = 0; } } @@ -490,22 +492,11 @@ class ParallelFor, if (m_scratch_size[1] > 0) { __syncthreads(); if (threadIdx.x == 0 && threadIdx.y == 0) - hip_lock_arrays.scratch[threadid] = 0; 
+ Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid] = 0; } } inline void execute() const { - HIP_SAFE_CALL(hipMalloc( - &hip_lock_arrays.atomic, - sizeof(std::int32_t) * (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1))); - HIP_SAFE_CALL(hipMalloc( - &hip_lock_arrays.scratch, - sizeof(std::int32_t) * (::Kokkos::Experimental::HIP::concurrency()))); - HIP_SAFE_CALL(hipMemset( - hip_lock_arrays.scratch, 0, - sizeof(std::int32_t) * (::Kokkos::Experimental::HIP::concurrency()))); - hip_lock_arrays.n = ::Kokkos::Experimental::HIP::concurrency(); - int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; dim3 const grid(static_cast(m_league_size), 1, 1); dim3 const block(static_cast(m_vector_size), @@ -515,16 +506,6 @@ class ParallelFor, *this, grid, block, shmem_size_total, m_policy.space().impl_internal_space_instance(), true); // copy to device and execute - - if (hip_lock_arrays.atomic) { - HIP_SAFE_CALL(hipFree(hip_lock_arrays.atomic)); - hip_lock_arrays.atomic = nullptr; - } - if (hip_lock_arrays.scratch) { - HIP_SAFE_CALL(hipFree(hip_lock_arrays.scratch)); - hip_lock_arrays.scratch = nullptr; - } - hip_lock_arrays.n = 0; } ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) @@ -532,7 +513,7 @@ class ParallelFor, m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.vector_length()) { + m_vector_size(arg_policy.impl_vector_length()) { hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelFor, launch_bounds>::get_hip_func_attributes(); m_team_size = @@ -558,11 +539,13 @@ class ParallelFor, m_scratch_ptr[1] = m_team_size <= 0 ? nullptr - : ::Kokkos::Experimental::Impl::hip_resize_scratch_space( - static_cast(m_scratch_size[1]) * - static_cast( - ::Kokkos::Experimental::HIP::concurrency() / - (m_team_size * m_vector_size))); + : m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast(m_scratch_size[1]) * + static_cast( + ::Kokkos::Experimental::HIP::concurrency() / + (m_team_size * m_vector_size))); int const shmem_size_total = m_shmem_begin + m_shmem_size; if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < @@ -580,10 +563,10 @@ class ParallelFor, ::Kokkos::Experimental::Impl::hip_get_max_block_size( m_policy.space().impl_internal_space_instance(), attr, - arg_functor, arg_policy.vector_length(), + arg_functor, arg_policy.impl_vector_length(), arg_policy.team_scratch_size(0), arg_policy.thread_scratch_size(0)) / - arg_policy.vector_length())) { + arg_policy.impl_vector_length())) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); } @@ -630,8 +613,8 @@ class ParallelReduce, static int constexpr UseShflReduction = (value_traits::StaticValueSize != 0); private: - using DummyShflReductionType = double; - using DummySHMEMReductionType = int; + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; // Algorithmic constraints: blockDim.y is a power of two AND // blockDim.y == blockDim.z == 1 shared memory utilization: @@ -672,60 +655,8 @@ class ParallelReduce, m_functor(TagType(), member, update); } - public: - __device__ inline void operator()() const { - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - __shared__ int64_t base_thread_id; - // FIXME_HIP This uses g_device_hip_lock_arrays which is not working - if (threadIdx.x == 0 && threadIdx.y == 0) { - Impl::hip_abort("Error should not be here (not implemented yet)\n"); - threadid = 
(blockIdx.x * blockDim.z + threadIdx.z) % - (g_device_hip_lock_arrays.n / (blockDim.x * blockDim.y)); - threadid *= blockDim.x * blockDim.y; - int done = 0; - while (!done) { - done = (0 == - atomicCAS(&g_device_hip_lock_arrays.scratch[threadid], 0, 1)); - if (!done) { - threadid += blockDim.x * blockDim.y; - if (static_cast(threadid + blockDim.x * blockDim.y) >= - static_cast(g_device_hip_lock_arrays.n)) - threadid = 0; - } - } - base_thread_id = threadid; - } - __syncthreads(); - threadid = base_thread_id; - } - - run(Kokkos::Impl::if_c::select(1, 1.0), - threadid); - if (m_scratch_size[1] > 0) { - __syncthreads(); - if (threadIdx.x == 0 && threadIdx.y == 0) { - Impl::hip_abort("Error should not be here (not implemented yet)\n"); - g_device_hip_lock_arrays.scratch[threadid] = 0; - } - } - } - - __device__ inline void run(DummySHMEMReductionType const&, - int const& threadid) const { - integral_nonzero_constant const - word_count(value_traits::value_size( - reducer_conditional::select(m_functor, m_reducer)) / - sizeof(size_type)); - - reference_type value = value_init::init( - reducer_conditional::select(m_functor, m_reducer), - Kokkos::Experimental::kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); - - // Iterate this block through the league + __device__ inline void iterate_through_league(int const threadid, + reference_type value) const { int const int_league_size = static_cast(m_league_size); for (int league_rank = blockIdx.x; league_rank < int_league_size; league_rank += gridDim.x) { @@ -741,6 +672,63 @@ class ParallelReduce, m_scratch_size[1], league_rank, m_league_size), value); } + } + + public: + __device__ inline void operator()() const { + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + __shared__ int64_t base_thread_id; + if (threadIdx.x == 0 && threadIdx.y == 0) { + threadid = (blockIdx.x * blockDim.z + threadIdx.z) % + (Kokkos::Impl::g_device_hip_lock_arrays.n / + (blockDim.x * blockDim.y)); + threadid *= blockDim.x * blockDim.y; + int done = 0; + while (!done) { + done = (0 == + atomicCAS( + &Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid], + 0, 1)); + if (!done) { + threadid += blockDim.x * blockDim.y; + if (static_cast(threadid + blockDim.x * blockDim.y) >= + static_cast(Kokkos::Impl::g_device_hip_lock_arrays.n)) + threadid = 0; + } + } + base_thread_id = threadid; + } + __syncthreads(); + threadid = base_thread_id; + } + + using ReductionTag = std::conditional_t; + run(ReductionTag{}, threadid); + + if (m_scratch_size[1] > 0) { + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid] = 0; + } + } + } + + __device__ inline void run(SHMEMReductionTag, int const threadid) const { + integral_nonzero_constant const + word_count(value_traits::value_size( + reducer_conditional::select(m_functor, m_reducer)) / + sizeof(size_type)); + + reference_type value = value_init::init( + reducer_conditional::select(m_functor, m_reducer), + Kokkos::Experimental::kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value); + + // Iterate this block through the league + iterate_through_league(threadid, value); // Reduce with final value at blockDim.y - 1 location. 
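// ---------------------------------------------------------------------------
// Editor's note: the hunk above swaps the old DummyShflReductionType /
// DummySHMEMReductionType selection (via if_c::select) for compile-time tag
// dispatch. A minimal self-contained sketch of that pattern, with hypothetical
// names that are not part of the patch:
#include <cstdio>
#include <type_traits>

struct ShflTag {};   // chosen when the warp-shuffle reduction is usable
struct SHMEMTag {};  // fallback that stages values through shared memory

template <bool UseShfl>
struct Reducer {
  void run(ShflTag) const { std::puts("warp-shuffle path"); }
  void run(SHMEMTag) const { std::puts("shared-memory path"); }
  void operator()() const {
    // Resolved at compile time, mirroring run(ReductionTag{}, threadid);
    // only the selected run() body is instantiated for a given Reducer.
    using Tag = std::conditional_t<UseShfl, ShflTag, SHMEMTag>;
    run(Tag{});
  }
};

int main() {
  Reducer<true>{}();   // prints "warp-shuffle path"
  Reducer<false>{}();  // prints "shared-memory path"
}
// ---------------------------------------------------------------------------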
bool do_final_reduce = (m_league_size == 0); @@ -777,28 +765,12 @@ class ParallelReduce, } } - __device__ inline void run(DummyShflReductionType const&, - int const& threadid) const { - // FIXME_HIP implementation close to the function above + __device__ inline void run(ShflReductionTag, int const threadid) const { value_type value; value_init::init(reducer_conditional::select(m_functor, m_reducer), &value); // Iterate this block through the league - int const int_league_size = static_cast(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team( - member_type( - Kokkos::Experimental::kokkos_impl_hip_shared_memory() + - m_team_begin, - m_shmem_begin, m_shmem_size, - reinterpret_cast( - reinterpret_cast(m_scratch_ptr[1]) + - static_cast(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size), - value); - } + iterate_through_league(threadid, value); pointer_type const result = m_result_ptr_device_accessible @@ -807,7 +779,7 @@ class ParallelReduce, value_type init; value_init::init(reducer_conditional::select(m_functor, m_reducer), &init); - if (int_league_size == 0) { + if (m_league_size == 0) { Kokkos::Impl::FunctorFinal::final( reducer_conditional::select(m_functor, m_reducer), reinterpret_cast(&value)); @@ -897,15 +869,15 @@ class ParallelReduce, m_result_ptr_host_accessible( MemorySpaceAccess::accessible), - m_scratch_space(0), - m_scratch_flags(0), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), m_team_begin(0), m_shmem_begin(0), m_shmem_size(0), m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.vector_length()) { + m_vector_size(arg_policy.impl_vector_length()) { hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelReduce, launch_bounds>::get_hip_func_attributes(); m_team_size = @@ -918,17 +890,6 @@ class ParallelReduce, m_policy.thread_scratch_size(0)) / m_vector_size; - // We can't early exit here because the result place might not be accessible - // or the functor/reducer init not callable on the host. But I am not sure - // all the other code below is kosher with zero work length ... - // - // Return Init value if the number of worksets is zero - // if (m_league_size * m_team_size == 0) { - // value_init::init(reducer_conditional::select(m_functor, m_reducer), - // arg_result.data()); - // return; - //} - m_team_begin = UseShflReduction ? 0 @@ -942,16 +903,19 @@ class ParallelReduce, m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); m_scratch_ptr[1] = - m_team_size <= 0 ? nullptr - : Kokkos::Experimental::Impl::hip_resize_scratch_space( - static_cast(m_scratch_size[1]) * - (static_cast( - Kokkos::Experimental::HIP::concurrency() / - (m_team_size * m_vector_size)))); + m_team_size <= 0 + ? 
nullptr + : m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast(m_scratch_size[1]) * + (static_cast( + Kokkos::Experimental::HIP::concurrency() / + (m_team_size * m_vector_size)))); // The global parallel_reduce does not support vector_length other than 1 at // the moment - if ((arg_policy.vector_length() > 1) && !UseShflReduction) + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " "greater than 1 is not currently supported for HIP for dynamic " @@ -1004,15 +968,15 @@ class ParallelReduce, MemorySpaceAccess::accessible), - m_scratch_space(0), - m_scratch_flags(0), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), m_team_begin(0), m_shmem_begin(0), m_shmem_size(0), m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.vector_length()) { + m_vector_size(arg_policy.impl_vector_length()) { hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelReduce, launch_bounds>::get_hip_func_attributes(); m_team_size = @@ -1025,17 +989,6 @@ class ParallelReduce, m_policy.thread_scratch_size(0)) / m_vector_size; - // We can't early exit here because the result place might not be accessible - // or the functor/reducer init not callable on the host. But I am not sure - // all the other code below is kosher with zero work length ... - // - // Return Init value if the number of worksets is zero - // if (arg_policy.league_size() == 0) { - // value_init::init(reducer_conditional::select(m_functor, m_reducer), - // m_result_ptr); - // return; - //} - m_team_begin = UseShflReduction ? 0 @@ -1049,16 +1002,19 @@ class ParallelReduce, m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); m_scratch_ptr[1] = - m_team_size <= 0 ? nullptr - : Kokkos::Experimental::Impl::hip_resize_scratch_space( - static_cast(m_scratch_size[1]) * - static_cast( - Kokkos::Experimental::HIP::concurrency() / - (m_team_size * m_vector_size))); + m_team_size <= 0 + ? nullptr + : m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast(m_scratch_size[1]) * + static_cast( + Kokkos::Experimental::HIP::concurrency() / + (m_team_size * m_vector_size))); // The global parallel_reduce does not support vector_length other than 1 at // the moment - if ((arg_policy.vector_length() > 1) && !UseShflReduction) + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " "greater than 1 is not currently supported for HIP for dynamic " diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp index cdf9cac30d..fe7c34bb80 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp @@ -81,11 +81,6 @@ __device__ inline void hip_intra_warp_shuffle_reduction( join(result, tmp); } shift *= 2; - // Not sure why there is a race condition here but we need to wait for the - // join operation to be finished to perform the next shuffle. 
Note that the - // problem was also found in the CUDA backend with CUDA clang - // (https://github.com/kokkos/kokkos/issues/941) - __syncthreads(); } // Broadcast the result to all the threads in the warp @@ -204,7 +199,6 @@ __device__ inline bool hip_inter_block_shuffle_reduction( value_type tmp = Kokkos::Experimental::shfl_down(value, i, warp_size); if (id + i < gridDim.x) join(value, tmp); } - __syncthreads(); } } } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index a97fb2f7cc..00cef28f82 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -64,8 +64,8 @@ namespace Impl { namespace { hipStream_t get_deep_copy_stream() { - static hipStream_t s = 0; - if (s == 0) { + static hipStream_t s = nullptr; + if (s == nullptr) { HIP_SAFE_CALL(hipStreamCreate(&s)); } return s; @@ -161,7 +161,7 @@ DeepCopy 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, ptr, - reported_size); + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); } return ptr; @@ -233,13 +238,19 @@ void* HIPHostPinnedSpace::allocate(const size_t arg_alloc_size) const { void* HIPHostPinnedSpace::allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void* HIPHostPinnedSpace::impl_allocate( + const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { void* ptr = nullptr; auto const error_code = hipHostMalloc(&ptr, arg_alloc_size); if (error_code != hipSuccess) { - hipGetLastError(); // This is the only way to clear the last error, which - // we should do here since we're turning it into an - // exception here + // This is the only way to clear the last error, which we should do here + // since we're turning it into an exception here + (void)hipGetLastError(); throw HIPRawMemoryAllocationFailure( arg_alloc_size, error_code, RawMemoryAllocationFailure::AllocationMechanism::HIPHostMalloc); @@ -247,9 +258,7 @@ void* HIPHostPinnedSpace::allocate(const char* arg_label, if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, ptr, - reported_size); + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); } return ptr; @@ -261,12 +270,17 @@ void HIPSpace::deallocate(void* const arg_alloc_ptr, void HIPSpace::deallocate(const char* arg_label, void* const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} +void HIPSpace::impl_deallocate( + const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr, - reported_size); + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); } HIP_SAFE_CALL(hipFree(arg_alloc_ptr)); } @@ -280,12 +294,17 @@ void HIPHostPinnedSpace::deallocate(const char* arg_label, void* const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} +void HIPHostPinnedSpace::impl_deallocate( + const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData( - Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr, - reported_size); + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); } HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr)); } @@ -299,7 +318,7 @@ void HIPHostPinnedSpace::deallocate(const char* arg_label, namespace Kokkos { namespace Impl { -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG SharedAllocationRecord SharedAllocationRecord::s_root_record; @@ -375,7 +394,7 @@ SharedAllocationRecord:: // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function : SharedAllocationRecord( -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif @@ -407,7 +426,7 @@ SharedAllocationRecord:: // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function : SharedAllocationRecord( -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif @@ -445,7 +464,7 @@ void* SharedAllocationRecord:: void SharedAllocationRecord::deallocate_tracked(void* const arg_alloc_ptr) { - if (arg_alloc_ptr != 0) { + if (arg_alloc_ptr != nullptr) { SharedAllocationRecord* const r = get_record(arg_alloc_ptr); RecordBase::decrement(r); @@ -521,7 +540,7 @@ SharedAllocationRecord::get_record( Header head; Header const* const head_hip = - alloc_ptr ? Header::get_header(alloc_ptr) : (Header*)0; + alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; if (alloc_ptr) { Kokkos::Impl::DeepCopy( @@ -529,7 +548,7 @@ SharedAllocationRecord::get_record( } RecordHIP* const record = - alloc_ptr ? static_cast(head.m_record) : (RecordHIP*)0; + alloc_ptr ? 
static_cast(head.m_record) : nullptr; if (!alloc_ptr || record->m_alloc_ptr != head_hip) { Kokkos::Impl::throw_runtime_exception(std::string( @@ -561,9 +580,9 @@ SharedAllocationRecord:: - print_records(std::ostream& s, const Kokkos::Experimental::HIPSpace& space, + print_records(std::ostream& s, const Kokkos::Experimental::HIPSpace&, bool detail) { -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG SharedAllocationRecord* r = &s_root_record; char buffer[256]; @@ -598,7 +617,7 @@ void SharedAllocationRecord:: reinterpret_cast(r->m_alloc_ptr), r->m_alloc_size, r->m_count, reinterpret_cast(r->m_dealloc), head.m_label); - std::cout << buffer; + s << buffer; r = r->m_next; } while (r != &s_root_record); } else { @@ -622,51 +641,28 @@ void SharedAllocationRecord:: } else { snprintf(buffer, 256, "HIP [ 0 + 0 ]\n"); } - std::cout << buffer; + s << buffer; r = r->m_next; } while (r != &s_root_record); } #else (void)s; - (void)space; (void)detail; throw_runtime_exception( "Kokkos::Impl::SharedAllocationRecord::print_records" - " only works with KOKKOS_DEBUG enabled"); + " only works with KOKKOS_ENABLE_DEBUG enabled"); #endif } -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -void* hip_resize_scratch_space(size_t bytes, bool force_shrink) { - static void* ptr = NULL; - static size_t current_size = 0; - if (current_size == 0) { - current_size = bytes; - ptr = Kokkos::kokkos_malloc( - "HIPSpace::ScratchMemory", current_size); - } - if (bytes > current_size) { - current_size = bytes; - ptr = Kokkos::kokkos_realloc(ptr, - current_size); - } - if ((bytes < current_size) && (force_shrink)) { - current_size = bytes; - Kokkos::kokkos_free(ptr); - ptr = Kokkos::kokkos_malloc( - "HIPSpace::ScratchMemory", current_size); - } - return ptr; -} - } // namespace Impl } // namespace Kokkos /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ namespace Kokkos { +namespace Impl { +int get_gpu(const InitArguments& args); +} namespace Experimental { int HIP::concurrency() { @@ -760,4 +756,57 @@ hipDeviceProp_t const& HIP::hip_device_prop() { const char* HIP::name() { return "HIP"; } } // namespace Experimental + +namespace Impl { + +int g_hip_space_factory_initialized = + initialize_space_factory("150_HIP"); + +void HIPSpaceInitializer::initialize(const InitArguments& args) { + int use_gpu = Impl::get_gpu(args); + + if (std::is_same::value || + 0 < use_gpu) { + if (use_gpu > -1) { + Kokkos::Experimental::HIP::impl_initialize( + Kokkos::Experimental::HIP::SelectDevice(use_gpu)); + } else { + Kokkos::Experimental::HIP::impl_initialize(); + } + } +} + +void HIPSpaceInitializer::finalize(const bool all_spaces) { + if (std::is_same::value || + all_spaces) { + if (Kokkos::Experimental::HIP::impl_is_initialized()) + Kokkos::Experimental::HIP::impl_finalize(); + } +} + +void HIPSpaceInitializer::fence() { + Kokkos::Experimental::HIP::impl_static_fence(); +} + +void HIPSpaceInitializer::print_configuration(std::ostream& msg, + const bool detail) { + msg << "Devices:" << std::endl; + msg << " KOKKOS_ENABLE_HIP: "; + msg << "yes" << std::endl; + + msg << "HIP Options:" << std::endl; + msg << " KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE: "; +#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + + msg << "\nRuntime Configuration:" << std::endl; + 
Experimental::HIP::print_configuration(msg, detail); +} + +} // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index 577c392a0a..7571510c31 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -270,7 +270,7 @@ class HIPTeamMember { */ template KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const { - return this->template team_scan(value, 0); + return this->template team_scan(value, nullptr); } //---------------------------------------- @@ -755,6 +755,52 @@ KOKKOS_INLINE_FUNCTION #endif } +/** \brief Inter-thread parallel exclusive prefix sum. + * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to each rank in the team (whose global rank is + * less than N) and a scan operation is performed. The last call to closure has + * final == true. + */ +// This is the same code as in CUDA and largely the same as in OpenMPTarget +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::TeamThreadRangeBoundariesStruct& + loop_bounds, + const FunctorType& lambda) { + // Extract value_type from lambda + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, + FunctorType>::value_type; + + const auto start = loop_bounds.start; + const auto end = loop_bounds.end; + auto& member = loop_bounds.member; + const auto team_size = member.team_size(); + const auto team_rank = member.team_rank(); + const auto nchunk = (end - start + team_size - 1) / team_size; + value_type accum = 0; + // each team has to process one or more chunks of the prefix scan + for (iType i = 0; i < nchunk; ++i) { + auto ii = start + i * team_size + team_rank; + // local accumulation for this chunk + value_type local_accum = 0; + // user updates value with prefix value + if (ii < loop_bounds.end) lambda(ii, local_accum, false); + // perform team scan + local_accum = member.team_scan(local_accum); + // add this block's accum to total accumulation + auto val = accum + local_accum; + // user updates their data with total accumulation + if (ii < loop_bounds.end) lambda(ii, val, true); + // the last value needs to be propagated to the next chunk + if (team_rank == team_size - 1) accum = val; + // broadcast last value to rest of the team + member.team_broadcast(accum, team_size - 1); + } +} + template KOKKOS_INLINE_FUNCTION void parallel_for( const Impl::TeamVectorRangeBoundariesStruct& diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp index 045892bb99..c5ca89a9fd 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp @@ -128,7 +128,13 @@ struct in_place_shfl_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(T& val, int lane, int width) const noexcept { - return __shfl(val, lane, width); + // FIXME_HIP Not sure why there is a race condition here. Note that the + // problem was also found in the CUDA backend with CUDA clang + // (https://github.com/kokkos/kokkos/issues/941) but it seems more limited + // in CUDA clang.
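// ---------------------------------------------------------------------------
// Editor's note: a usage sketch for the TeamThreadRange parallel_scan added
// above (exclusive prefix sum over each team's range). The view and kernel
// names are hypothetical; the closure contract (val carries the running
// prefix, `final` is true on the pass that may write results) is taken from
// the doc comment above.
#include <Kokkos_Core.hpp>

// Exclusive prefix sum of each row of `data`, one team per row.
void exclusive_scan_rows(Kokkos::View<int**> data) {
  using policy_type = Kokkos::TeamPolicy<>;
  const int nrows = static_cast<int>(data.extent(0));
  const int ncols = static_cast<int>(data.extent(1));
  Kokkos::parallel_for(
      "scan_rows", policy_type(nrows, Kokkos::AUTO),
      KOKKOS_LAMBDA(const policy_type::member_type& member) {
        const int row = member.league_rank();
        Kokkos::parallel_scan(
            Kokkos::TeamThreadRange(member, ncols),
            [=](const int j, int& partial, const bool final) {
              const int input = data(row, j);     // read before final write
              if (final) data(row, j) = partial;  // partial excludes column j
              partial += input;  // fold own value into the running prefix
            });
      });
}
// ---------------------------------------------------------------------------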
+ auto return_val = __shfl(val, lane, width); + __threadfence(); + return return_val; } }; @@ -141,7 +147,13 @@ struct in_place_shfl_up_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(T& val, int lane, int width) const noexcept { - return __shfl_up(val, lane, width); + // FIXME_HIP Not sure why there is a race condition here. Note that the + // problem was also found in the CUDA backend with CUDA clang + // (https://github.com/kokkos/kokkos/issues/941) but it seems more limited + // in CUDA clang. + auto return_val = __shfl_up(val, lane, width); + __threadfence(); + return return_val; } }; @@ -155,7 +167,13 @@ struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(T& val, int lane, int width) const noexcept { - return __shfl_down(val, lane, width); + // FIXME_HIP Not sure why there is a race condition here. Note that the + // problem was also found in the CUDA backend with CUDA clang + // (https://github.com/kokkos/kokkos/issues/941) but it seems more limited + // in CUDA clang. + auto return_val = __shfl_down(val, lane, width); + __threadfence(); + return return_val; } }; diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp index c7512ff35b..910d5e52e6 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp @@ -42,7 +42,7 @@ //@HEADER */ -#include +#include #ifdef KOKKOS_ENABLE_HPX #include @@ -79,7 +79,7 @@ void HPX::impl_initialize(int thread_count) { if (rt == nullptr) { std::vector config = { "hpx.os_threads=" + std::to_string(thread_count), -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG "--hpx:attach-debugger=exception", #endif }; @@ -110,7 +110,7 @@ void HPX::impl_initialize() { hpx::runtime *rt = hpx::get_runtime_ptr(); if (rt == nullptr) { std::vector config = { -#ifdef KOKKOS_DEBUG +#ifdef KOKKOS_ENABLE_DEBUG "--hpx:attach-debugger=exception", #endif }; @@ -153,6 +153,56 @@ void HPX::impl_finalize() { } } // namespace Experimental + +namespace Impl { + +int g_hpx_space_factory_initialized = + initialize_space_factory("060_HPX"); + +void HPXSpaceInitializer::initialize(const InitArguments &args) { + const int num_threads = args.num_threads; + + if (std::is_same::value || + std::is_same::value) { + if (num_threads > 0) { + Kokkos::Experimental::HPX::impl_initialize(num_threads); + } else { + Kokkos::Experimental::HPX::impl_initialize(); + } + // std::cout << "Kokkos::initialize() fyi: HPX enabled and initialized" << + // std::endl ; + } else { + // std::cout << "Kokkos::initialize() fyi: HPX enabled but not initialized" + // << std::endl ; + } +} + +void HPXSpaceInitializer::finalize(const bool all_spaces) { + if (std::is_same::value || + std::is_same::value || + all_spaces) { + if (Kokkos::Experimental::HPX::impl_is_initialized()) + Kokkos::Experimental::HPX::impl_finalize(); + } +} + +void HPXSpaceInitializer::fence() { Kokkos::Experimental::HPX().fence(); } + +void HPXSpaceInitializer::print_configuration(std::ostream &msg, + const bool detail) { + msg << "HPX Execution Space:" << std::endl; + msg << " KOKKOS_ENABLE_HPX: "; + msg << "yes" << std::endl; + + msg << "\nHPX Runtime Configuration:" << std::endl; + Kokkos::Experimental::HPX::print_configuration(msg, detail); +} + +} // namespace Impl } // namespace Kokkos #else diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index d3ec64368f..140376425c 100644 --- 
a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -52,19 +52,11 @@ #include #include #include +#include -#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA) -#include -#include -#endif - -#if defined(__HCC__) && defined(KOKKOS_ENABLE_ROCM) -//#include -#include -#endif - -#if defined(__HIPCC__) && defined(KOKKOS_ENABLE_HIP) -#include +#if defined(KOKKOS_ENABLE_CUDA) || \ + (defined(__HIPCC__) && defined(KOKKOS_ENABLE_HIP)) +#include #endif namespace Kokkos { @@ -83,8 +75,7 @@ enum class Iterate template struct default_outer_direction { using type = Iterate; -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_ROCM) || \ - defined(KOKKOS_ENABLE_HIP) +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) static constexpr Iterate value = Iterate::Left; #else static constexpr Iterate value = Iterate::Right; @@ -94,8 +85,7 @@ struct default_outer_direction { template struct default_inner_direction { using type = Iterate; -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_ROCM) || \ - defined(KOKKOS_ENABLE_HIP) +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) static constexpr Iterate value = Iterate::Left; #else static constexpr Iterate value = Iterate::Right; @@ -118,6 +108,79 @@ struct Rank { static constexpr Iterate inner_direction = InnerDir; }; +namespace Impl { +// NOTE the comparison below is encapsulated to silence warnings about pointless +// comparison of unsigned integer with zero +template +constexpr std::enable_if_t::value, bool> +is_less_than_value_initialized_variable(T) { + return false; +} + +template +constexpr std::enable_if_t::value, bool> +is_less_than_value_initialized_variable(T arg) { + return arg < T{}; +} + +// Checked narrowing conversion that calls abort if the cast changes the value +template +constexpr To checked_narrow_cast(From arg) { + constexpr const bool is_different_signedness = + (std::is_signed::value != std::is_signed::value); + auto const ret = static_cast(arg); + if (static_cast(ret) != arg || + (is_different_signedness && + is_less_than_value_initialized_variable(arg) != + is_less_than_value_initialized_variable(ret))) { + Kokkos::abort("unsafe narrowing conversion"); + } + return ret; +} +// NOTE prefer C array U[M] to std::initializer_list so that the number of +// elements can be deduced (https://stackoverflow.com/q/40241370) +// NOTE for some unfortunate reason the policy bounds are stored as signed +// integer arrays (point_type which is Kokkos::Array) so we +// specify the index type (actual policy index_type from the traits) and check +// ahead of time that narrowing conversions will be safe. +template +constexpr Array to_array_potentially_narrowing(const U (&init)[M]) { + using T = typename Array::value_type; + Array a{}; + constexpr std::size_t N = a.size(); + static_assert(M <= N, ""); + auto* ptr = a.data(); + // NOTE equivalent to + // std::transform(std::begin(init), std::end(init), a.data(), + // [](U x) { return static_cast(x); }); + // except that std::transform is not constexpr. + for (auto x : init) { + *ptr++ = checked_narrow_cast(x); + (void)checked_narrow_cast(x); // see note above + } + return a; +} + +// NOTE Making a copy even when std::is_same>::value +// is true to reduce code complexity. You may change this if you have a good +// reason to. Intentionally not enabling std::array at this time but this may +// change too.
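// ---------------------------------------------------------------------------
// Editor's note: a standalone sketch of what checked_narrow_cast above guards
// against (demo names are hypothetical, not part of the patch). The
// round-trip test catches truncation; the signedness test catches values
// whose sign flips when converted between signed and unsigned types.
#include <cstdint>
#include <cstdlib>
#include <type_traits>

template <class To, class From>
To checked_narrow_demo(From arg) {
  auto const ret = static_cast<To>(arg);
  bool const sign_flip =
      (std::is_signed<To>::value != std::is_signed<From>::value) &&
      ((arg < From{}) != (ret < To{}));  // warns for unsigned From; see NOTE
  if (static_cast<From>(ret) != arg || sign_flip) std::abort();
  return ret;
}

int main() {
  (void)checked_narrow_demo<std::int8_t>(42);  // fine: value preserved
  // checked_narrow_demo<std::int8_t>(300);    // would abort: truncation
  // checked_narrow_demo<std::uint32_t>(-1);   // would abort: sign flip, even
  //                                           // though the round trip passes
}
// ---------------------------------------------------------------------------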
+template +constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( + Kokkos::Array const& other) { + using T = typename NVCC_WONT_LET_ME_CALL_YOU_Array::value_type; + NVCC_WONT_LET_ME_CALL_YOU_Array a{}; + constexpr std::size_t N = a.size(); + static_assert(M <= N, ""); + for (std::size_t i = 0; i < M; ++i) { + a[i] = checked_narrow_cast(other[i]); + (void)checked_narrow_cast(other[i]); // see note above + } + return a; +} +} // namespace Impl + // multi-dimensional iteration pattern template struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { @@ -148,7 +211,7 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { enum { rank = static_cast(iteration_pattern::rank) }; using index_type = typename traits::index_type; - using array_index_type = long; + using array_index_type = std::int64_t; using point_type = Kokkos::Array; // was index_type using tile_type = Kokkos::Array; // If point_type or tile_type is not templated on a signed integral type (if // this feature is used), the construction of MDRangePolicy is invalid // as template parameter to the MDRangePolicy or static_cast the individual // values - point_type m_lower; - point_type m_upper; - tile_type m_tile; - point_type m_tile_end; - index_type m_num_tiles; - index_type m_prod_tile_dims; + point_type m_lower = {}; + point_type m_upper = {}; + tile_type m_tile = {}; + point_type m_tile_end = {}; + index_type m_num_tiles = 1; + index_type m_prod_tile_dims = 1; /* // NDE enum impl definition alternative - replace static constexpr int ? @@ -203,49 +266,89 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { KOKKOS_INLINE_FUNCTION const typename traits::execution_space& space() const { return m_space; } - template - MDRangePolicy(std::initializer_list const& lower, - std::initializer_list const& upper, - std::initializer_list const& tile = {}) - : m_space() { - init(lower, upper, tile); + + MDRangePolicy() = default; + + template ::value && + std::is_integral::value && + std::is_integral::value>> + MDRangePolicy(const LT (&lower)[LN], const UT (&upper)[UN], + const TT (&tile)[TN] = {}) + : MDRangePolicy( + Impl::to_array_potentially_narrowing( + lower), + Impl::to_array_potentially_narrowing( + upper), + Impl::to_array_potentially_narrowing( + tile)) { + static_assert( + LN == rank && UN == rank && TN <= rank, + "MDRangePolicy: Constructor initializer lists have wrong size"); } - template + template ::value && + std::is_integral::value && + std::is_integral::value>> MDRangePolicy(const typename traits::execution_space& work_space, - std::initializer_list const& lower, - std::initializer_list const& upper, - std::initializer_list const& tile = {}) - : m_space(work_space) { - init(lower, upper, tile); + const LT (&lower)[LN], const UT (&upper)[UN], + const TT (&tile)[TN] = {}) + : MDRangePolicy( + work_space, + Impl::to_array_potentially_narrowing( + lower), + Impl::to_array_potentially_narrowing( + upper), + Impl::to_array_potentially_narrowing( + tile)) { + static_assert( + LN == rank && UN == rank && TN <= rank, + "MDRangePolicy: Constructor initializer lists have wrong size"); } + // NOTE: Keeping these two constructors despite the templated constructors + // from Kokkos arrays for backwards compatibility to allow construction from + // double-braced initializer lists.
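// ---------------------------------------------------------------------------
// Editor's note: a usage sketch of the constructor set above (kernel body
// elided; names hypothetical). C-array / braced-list bounds of any integral
// type now funnel through to_array_potentially_narrowing, which checks the
// conversion, and LN == UN == rank is enforced at compile time.
#include <Kokkos_Core.hpp>

void launch() {
  using policy2d = Kokkos::MDRangePolicy<Kokkos::Rank<2>>;

  policy2d p1({0, 0}, {128, 256});          // bounds only, default tiles
  policy2d p2({0, 0}, {128, 256}, {8, 8});  // optional tile sizes third

  Kokkos::parallel_for(
      "init", p2, KOKKOS_LAMBDA(const int i, const int j) { /* ... */ });
}
// ---------------------------------------------------------------------------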
MDRangePolicy(point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{}) - : m_space(), - m_lower(lower), - m_upper(upper), - m_tile(tile), - m_num_tiles(1), - m_prod_tile_dims(1) { - init(); - } + : MDRangePolicy(typename traits::execution_space(), lower, upper, tile) {} MDRangePolicy(const typename traits::execution_space& work_space, point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{}) - : m_space(work_space), - m_lower(lower), - m_upper(upper), - m_tile(tile), - m_num_tiles(1), - m_prod_tile_dims(1) { + : m_space(work_space), m_lower(lower), m_upper(upper), m_tile(tile) { init(); } + template ::value>> + MDRangePolicy(Kokkos::Array const& lower, + Kokkos::Array const& upper, + Kokkos::Array const& tile = Kokkos::Array{}) + : MDRangePolicy(typename traits::execution_space(), lower, upper, tile) {} + + template ::value>> + MDRangePolicy(const typename traits::execution_space& work_space, + Kokkos::Array const& lower, + Kokkos::Array const& upper, + Kokkos::Array const& tile = Kokkos::Array{}) + : MDRangePolicy( + work_space, + Impl::to_array_potentially_narrowing( + lower), + Impl::to_array_potentially_narrowing( + upper), + Impl::to_array_potentially_narrowing( + tile)) {} + template MDRangePolicy(const MDRangePolicy p) - : m_space(p.m_space), + : traits(p), // base class may contain data such as desired occupancy + m_space(p.m_space), m_lower(p.m_lower), m_upper(p.m_upper), m_tile(p.m_tile), @@ -260,165 +363,6 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { #if defined(KOKKOS_ENABLE_CUDA) && !std::is_same::value #endif -#if defined(KOKKOS_ENABLE_ROCM) - && !std::is_same::value -#endif -#if defined(KOKKOS_ENABLE_HIP) - && !std::is_same::value -#endif - ) { - index_type span; - for (int i = 0; i < rank; ++i) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - m_tile[i] = 2; - } else { - m_tile[i] = (span == 0 ? 1 : span); - } - } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } - } -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - else // Cuda - { - index_type span; - int increment = 1; - int rank_start = 0; - int rank_end = rank; - if ((int)inner_direction == (int)Right) { - increment = -1; - rank_start = rank - 1; - rank_end = -1; - } - bool is_cuda_exec_space = -#if defined(KOKKOS_ENABLE_CUDA) - std::is_same::value; -#else - false; -#endif - for (int i = rank_start; i != rank_end; i += increment) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - // TODO: determine what is a good default tile size for cuda and HIP - // may be rank dependent - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - if (m_prod_tile_dims < 256) { - m_tile[i] = (is_cuda_exec_space) ? 
2 : 4; - } else { - m_tile[i] = 1; - } - } else { - m_tile[i] = 16; - } - } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } - if (m_prod_tile_dims > - 1024) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 - // max per dim (Kepler), but product num_threads < 1024 - if (is_cuda_exec_space) { - printf(" Tile dimensions exceed Cuda limits\n"); - Kokkos::abort( - " Cuda ExecSpace Error: MDRange tile dims exceed maximum number " - "of " - "threads per block - choose smaller tile dims"); - } else { - printf(" Tile dimensions exceed HIP limits\n"); - Kokkos::abort( - "HIP ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); - } - } - } -#endif -#if defined(KOKKOS_ENABLE_ROCM) - else // ROCm - { - index_type span; - int increment = 1; - int rank_start = 0; - int rank_end = rank; - if ((int)inner_direction == (int)Right) { - increment = -1; - rank_start = rank - 1; - rank_end = -1; - } - for (int i = rank_start; i != rank_end; i += increment) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - // TODO: determine what is a good default tile size for rocm - // may be rank dependent - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - if (m_prod_tile_dims < 256) { - m_tile[i] = 4; - } else { - m_tile[i] = 1; - } - } else { - m_tile[i] = 16; - } - } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } - if (m_prod_tile_dims > 1024) { // but product num_threads < 1024 - printf(" Tile dimensions exceed ROCm limits\n"); - Kokkos::abort( - " ROCm ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); - // Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: - // MDRange tile dims exceed maximum number of threads per block - choose - // smaller tile dims"); - } - } -#endif - } - - template - void init(std::initializer_list const& lower, - std::initializer_list const& upper, - std::initializer_list const& tile = {}) { - if (static_cast(m_lower.size()) != rank || - static_cast(m_upper.size()) != rank) - Kokkos::abort( - "MDRangePolicy: Constructor initializer lists have wrong size"); - - for (auto i = 0; i < rank; ++i) { - m_lower[i] = static_cast(lower.begin()[i]); - m_upper[i] = static_cast(upper.begin()[i]); - if (static_cast(tile.size()) == rank) - m_tile[i] = static_cast(tile.begin()[i]); - else - m_tile[i] = 0; - } - - m_num_tiles = 1; - m_prod_tile_dims = 1; - - // Host - if (true -#if defined(KOKKOS_ENABLE_CUDA) - && !std::is_same::value -#endif -#if defined(KOKKOS_ENABLE_ROCM) - && !std::is_same::value -#endif #if defined(KOKKOS_ENABLE_HIP) && !std::is_same::value @@ -453,15 +397,21 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { rank_start = rank - 1; rank_end = -1; } + bool is_cuda_exec_space = +#if defined(KOKKOS_ENABLE_CUDA) + std::is_same::value; +#else + false; +#endif for (int i = rank_start; i != rank_end; i += increment) { span = m_upper[i] - m_lower[i]; if (m_tile[i] <= 0) { - // TODO: determine what is a good default tile size for cuda + // TODO: determine what is a good default tile size for Cuda and HIP // may be rank dependent if (((int)inner_direction == (int)Right && (i < rank - 1)) || ((int)inner_direction == (int)Left && (i > 0))) { if (m_prod_tile_dims < 256) { - m_tile[i] = 2; + 
m_tile[i] = (is_cuda_exec_space) ? 2 : 4; } else { m_tile[i] = 1; } @@ -477,63 +427,17 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { if (m_prod_tile_dims > 1024) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 // max per dim (Kepler), but product num_threads < 1024 -#if defined(KOKKOS_ENABLE_CUDA) - printf(" Tile dimensions exceed Cuda limits\n"); - Kokkos::abort( - " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); -#else - printf(" Tile dimensions exceed HIP limits\n"); - Kokkos::abort( - " HIP ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); -#endif - } - } -#endif -#if defined(KOKKOS_ENABLE_ROCM) - else // ROCm - { - index_type span; - int increment = 1; - int rank_start = 0; - int rank_end = rank; - if ((int)inner_direction == (int)Right) { - increment = -1; - rank_start = rank - 1; - rank_end = -1; - } - for (int i = rank_start; i != rank_end; i += increment) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - // TODO: determine what is a good default tile size for cuda - // may be rank dependent - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - if (m_prod_tile_dims < 256) { - m_tile[i] = 2; - } else { - m_tile[i] = 1; - } - } else { - m_tile[i] = 16; - } + if (is_cuda_exec_space) { + printf(" Tile dimensions exceed Cuda limits\n"); + Kokkos::abort( + "Cuda ExecSpace Error: MDRange tile dims exceed maximum number " + "of threads per block - choose smaller tile dims"); + } else { + printf(" Tile dimensions exceed HIP limits\n"); + Kokkos::abort( + "HIP ExecSpace Error: MDRange tile dims exceed maximum number of " + "threads per block - choose smaller tile dims"); } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } - if (m_prod_tile_dims > - 1024) { // Match ROCm restriction for ParallelReduce; 1024,1024,1024 - // max per dim , but product num_threads < 1024 - printf(" Tile dimensions exceed ROCm limits\n"); - Kokkos::abort( - " ROCm ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); - // Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: - // MDRange tile dims exceed maximum number of threads per block - choose - // smaller tile dims"); } } #endif @@ -550,28 +454,5 @@ using Kokkos::MDRangePolicy; using Kokkos::Rank; } // namespace Experimental } // namespace Kokkos -// ------------------------------------------------------------------ // - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -template -struct PolicyPropertyAdaptor, - MDRangePolicy> { - using policy_in_t = MDRangePolicy; - using policy_out_t = - MDRangePolicy>; -}; - -} // namespace Impl -} // namespace Experimental -} // namespace Kokkos #endif // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP diff --git a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp index d4632596c8..8e226a078d 100644 --- a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp @@ -85,23 +85,23 @@ namespace Impl { template struct MemorySpaceAccess { - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; }; template struct 
MemorySpaceAccess { - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; }; template <> struct MemorySpaceAccess { - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; }; template diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp index 89f84ae7ce..8cd60fa6ba 100644 --- a/lib/kokkos/core/src/Kokkos_Atomic.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp @@ -73,22 +73,25 @@ #include //---------------------------------------------------------------------------- + +// Need to fix this for pure clang on windows #if defined(_WIN32) #define KOKKOS_ENABLE_WINDOWS_ATOMICS + #if defined(KOKKOS_ENABLE_CUDA) #define KOKKOS_ENABLE_CUDA_ATOMICS +#if defined(KOKKOS_COMPILER_CLANG) +#define KOKKOS_ENABLE_GNU_ATOMICS #endif -#else +#endif + +#else // _WIN32 #if defined(KOKKOS_ENABLE_CUDA) // Compiling NVIDIA device code, must use Cuda atomics: #define KOKKOS_ENABLE_CUDA_ATOMICS -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU) - -#define KOKKOS_ENABLE_ROCM_ATOMICS - #elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU) || \ defined(KOKKOS_IMPL_ENABLE_OVERLOAD_HOST_DEVICE) @@ -111,7 +114,7 @@ #define KOKKOS_ENABLE_SERIAL_ATOMICS #elif defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ - (defined(KOKKOS_COMPILER_NVCC)) + (defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_COMPILER_IBM)) #define KOKKOS_ENABLE_GNU_ATOMICS @@ -176,21 +179,11 @@ inline const char* atomic_query_version() { // Implements Strongly-typed analogs of C++ standard memory orders #include "impl/Kokkos_Atomic_Memory_Order.hpp" -#if defined(KOKKOS_ENABLE_ROCM) -namespace Kokkos { -namespace Impl { -extern KOKKOS_INLINE_FUNCTION bool lock_address_rocm_space(void* ptr); - -extern KOKKOS_INLINE_FUNCTION void unlock_address_rocm_space(void* ptr); -} // namespace Impl -} // namespace Kokkos -#include -#endif #if defined(KOKKOS_ENABLE_HIP) #include #endif -#ifdef _WIN32 +#if defined(KOKKOS_ENABLE_WINDOWS_ATOMICS) #include "impl/Kokkos_Atomic_Windows.hpp" #endif //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index 5303b85beb..fb2925a066 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -49,6 +49,10 @@ #include #include +#ifdef KOKKOS_ENABLE_SYCL +#include +#endif + namespace Kokkos { /// \class complex @@ -692,27 +696,60 @@ KOKKOS_INLINE_FUNCTION RealType real(const complex& x) noexcept { //! Absolute value (magnitude) of a complex number. template KOKKOS_INLINE_FUNCTION RealType abs(const complex& x) { - return std::hypot(x.real(), x.imag()); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::hypot; +#else + using std::hypot; +#endif + return hypot(x.real(), x.imag()); } //! 
Power of a complex number template KOKKOS_INLINE_FUNCTION Kokkos::complex pow(const complex& x, const RealType& e) { - RealType r = abs(x); - RealType phi = std::atan(x.imag() / x.real()); - return std::pow(r, e) * - Kokkos::complex(std::cos(phi * e), std::sin(phi * e)); + RealType r = abs(x); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::atan; + using cl::sycl::cos; + using cl::sycl::pow; + using cl::sycl::sin; +#else + using std::atan; + using std::cos; + using std::pow; + using std::sin; +#endif + RealType phi = atan(x.imag() / x.real()); + return pow(r, e) * Kokkos::complex(cos(phi * e), sin(phi * e)); } -//! Square root of a complex number. +//! Square root of a complex number. This is intended to match the stdc++ +//! implementation, which returns sqrt(z*z) = z; where z is complex number. template KOKKOS_INLINE_FUNCTION Kokkos::complex sqrt( const complex& x) { - RealType r = abs(x); - RealType phi = std::atan(x.imag() / x.real()); - return std::sqrt(r) * - Kokkos::complex(std::cos(phi * 0.5), std::sin(phi * 0.5)); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::abs; + using cl::sycl::sqrt; +#else + using std::abs; + using std::sqrt; +#endif + + RealType r = x.real(); + RealType i = x.imag(); + + if (r == RealType()) { + RealType t = sqrt(abs(i) / 2); + return Kokkos::complex(t, i < RealType() ? -t : t); + } else { + RealType t = sqrt(2 * (abs(x) + abs(r))); + RealType u = t / 2; + return r > RealType() + ? Kokkos::complex(u, i / t) + : Kokkos::complex(abs(i) / t, i < RealType() ? -u : u); + } } //! Conjugate of a complex number. @@ -725,8 +762,211 @@ KOKKOS_INLINE_FUNCTION complex conj( //! Exponential of a complex number. template KOKKOS_INLINE_FUNCTION complex exp(const complex& x) { - return std::exp(x.real()) * - complex(std::cos(x.imag()), std::sin(x.imag())); +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::cos; + using cl::sycl::exp; + using cl::sycl::sin; +#else + using std::cos; + using std::exp; + using std::sin; +#endif + return exp(x.real()) * complex(cos(x.imag()), sin(x.imag())); +} + +//! natural log of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex log( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::atan; + using cl::sycl::log; +#else + using std::atan; + using std::log; +#endif + RealType phi = atan(x.imag() / x.real()); + return Kokkos::complex(log(abs(x)), phi); +} + +//! sine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex sin( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::cos; + using cl::sycl::cosh; + using cl::sycl::sin; + using cl::sycl::sinh; +#else + using std::cos; + using std::cosh; + using std::sin; + using std::sinh; +#endif + return Kokkos::complex(sin(x.real()) * cosh(x.imag()), + cos(x.real()) * sinh(x.imag())); +} + +//! cosine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex cos( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::cos; + using cl::sycl::cosh; + using cl::sycl::sin; + using cl::sycl::sinh; +#else + using std::cos; + using std::cosh; + using std::sin; + using std::sinh; +#endif + return Kokkos::complex(cos(x.real()) * cosh(x.imag()), + -sin(x.real()) * sinh(x.imag())); +} + +//! tangent of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex tan( + const complex& x) { + return sin(x) / cos(x); +} + +//! hyperbolic sine of a complex number. 
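// ---------------------------------------------------------------------------
// Editor's note: the same branchy square-root algorithm as above, transcribed
// for plain double (a sketch, not part of the patch) so it can be checked on
// the host against the standard library's principal root.
#include <cmath>
#include <complex>
#include <cstdio>

static std::complex<double> sqrt_like_above(std::complex<double> z) {
  const double r = z.real(), i = z.imag();
  if (r == 0.0) {
    const double t = std::sqrt(std::abs(i) / 2);
    return {t, i < 0 ? -t : t};
  }
  const double t = std::sqrt(2 * (std::abs(z) + std::abs(r)));
  const double u = t / 2;
  return r > 0 ? std::complex<double>(u, i / t)
               : std::complex<double>(std::abs(i) / t, i < 0 ? -u : u);
}

int main() {
  const std::complex<double> z{-3.0, 4.0};
  const auto a = sqrt_like_above(z);  // (1, 2), since (1 + 2i)^2 == -3 + 4i
  const auto b = std::sqrt(z);        // principal root for comparison
  std::printf("%g%+gi vs %g%+gi\n", a.real(), a.imag(), b.real(), b.imag());
}
// ---------------------------------------------------------------------------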
+template +KOKKOS_INLINE_FUNCTION Kokkos::complex sinh( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::cos; + using cl::sycl::cosh; + using cl::sycl::sin; + using cl::sycl::sinh; +#else + using std::cos; + using std::cosh; + using std::sin; + using std::sinh; +#endif + return Kokkos::complex(sinh(x.real()) * cos(x.imag()), + cosh(x.real()) * sin(x.imag())); +} + +//! hyperbolic cosine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex cosh( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::cos; + using cl::sycl::cosh; + using cl::sycl::sin; + using cl::sycl::sinh; +#else + using std::cos; + using std::cosh; + using std::sin; + using std::sinh; +#endif + return Kokkos::complex(cosh(x.real()) * cos(x.imag()), + sinh(x.real()) * sin(x.imag())); +} + +//! hyperbolic tangent of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex tanh( + const complex& x) { + return sinh(x) / cosh(x); +} + +//! inverse hyperbolic sine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex asinh( + const complex& x) { + return log(x + sqrt(x * x + RealType(1.0))); +} + +//! inverse hyperbolic cosine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex acosh( + const complex& x) { + return RealType(2.0) * log(sqrt(RealType(0.5) * (x + RealType(1.0))) + + sqrt(RealType(0.5) * (x - RealType(1.0)))); +} + +//! inverse hyperbolic tangent of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex atanh( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::atan2; + using cl::sycl::log; +#else + using std::atan2; + using std::log; +#endif + + const RealType i2 = x.imag() * x.imag(); + const RealType r = RealType(1.0) - i2 - x.real() * x.real(); + + RealType p = RealType(1.0) + x.real(); + RealType m = RealType(1.0) - x.real(); + + p = i2 + p * p; + m = i2 + m * m; + + RealType phi = atan2(RealType(2.0) * x.imag(), r); + return Kokkos::complex(RealType(0.25) * (log(p) - log(m)), + RealType(0.5) * phi); +} + +//! inverse sine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex asin( + const complex& x) { + Kokkos::complex t = + asinh(Kokkos::complex(-x.imag(), x.real())); + return Kokkos::complex(t.imag(), -t.real()); +} + +//! inverse cosine of a complex number. +template +KOKKOS_INLINE_FUNCTION Kokkos::complex acos( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::acos; + +#else + using std::acos; +#endif + Kokkos::complex t = asin(x); + RealType pi_2 = acos(RealType(0.0)); + return Kokkos::complex(pi_2 - t.real(), -t.imag()); +} + +//! inverse tangent of a complex number. 
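// ---------------------------------------------------------------------------
// Editor's note: the atan implementation below evaluates the principal-value
// identity atan(z) = (i/2) * (log(1 - i*z) - log(1 + i*z)), which for
// z = x + i*y separates into
//   Re atan(z) = (1/2) * atan2(2*x, 1 - x*x - y*y)
//   Im atan(z) = (1/4) * log((x*x + (y+1)*(y+1)) / (x*x + (y-1)*(y-1)))
// matching the p/m bookkeeping in the code.
// ---------------------------------------------------------------------------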
+template +KOKKOS_INLINE_FUNCTION Kokkos::complex atan( + const complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using cl::sycl::atan2; + using cl::sycl::log; +#else + using std::atan2; + using std::log; +#endif + const RealType r2 = x.real() * x.real(); + const RealType i = RealType(1.0) - r2 - x.imag() * x.imag(); + + RealType p = x.imag() + RealType(1.0); + RealType m = x.imag() - RealType(1.0); + + p = r2 + p * p; + m = r2 + m * m; + + return Kokkos::complex( + RealType(0.5) * atan2(RealType(2.0) * x.real(), i), + RealType(0.25) * log(p / m)); } /// This function cannot be called in a CUDA device function, diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp index 4989f2701c..2aba189487 100644 --- a/lib/kokkos/core/src/Kokkos_Concepts.hpp +++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp @@ -196,6 +196,7 @@ KOKKOS_IMPL_IS_CONCEPT(index_type) KOKKOS_IMPL_IS_CONCEPT(launch_bounds) KOKKOS_IMPL_IS_CONCEPT(thread_team_member) KOKKOS_IMPL_IS_CONCEPT(host_thread_team_member) +KOKKOS_IMPL_IS_CONCEPT(graph_kernel) } // namespace Impl diff --git a/lib/kokkos/core/src/Kokkos_CopyViews.hpp b/lib/kokkos/core/src/Kokkos_CopyViews.hpp index 78538dc7df..6be5483269 100644 --- a/lib/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/lib/kokkos/core/src/Kokkos_CopyViews.hpp @@ -292,6 +292,7 @@ struct ViewCopy { ViewTypeB b; using policy_type = Kokkos::RangePolicy>; + using value_type = typename ViewTypeA::value_type; ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) @@ -301,7 +302,9 @@ struct ViewCopy { } KOKKOS_INLINE_FUNCTION - void operator()(const iType& i0) const { a(i0) = b(i0); }; + void operator()(const iType& i0) const { + a(i0) = static_cast(b(i0)); + }; }; template { Kokkos::Rank<2, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = Kokkos::MDRangePolicy>; + using value_type = typename ViewTypeA::value_type; ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) @@ -328,7 +332,7 @@ struct ViewCopy { KOKKOS_INLINE_FUNCTION void operator()(const iType& i0, const iType& i1) const { - a(i0, i1) = b(i0, i1); + a(i0, i1) = static_cast(b(i0, i1)); }; }; @@ -346,6 +350,7 @@ struct ViewCopy { Kokkos::Rank<3, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = Kokkos::MDRangePolicy>; + using value_type = typename ViewTypeA::value_type; ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) @@ -358,7 +363,7 @@ struct ViewCopy { KOKKOS_INLINE_FUNCTION void operator()(const iType& i0, const iType& i1, const iType& i2) const { - a(i0, i1, i2) = b(i0, i1, i2); + a(i0, i1, i2) = static_cast(b(i0, i1, i2)); }; }; @@ -2498,7 +2503,7 @@ inline void deep_copy( typename std::enable_if< Kokkos::Impl::is_execution_space::value && std::is_same::specialize, - void>::value>::type* = 0) { + void>::value>::type* = nullptr) { using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; static_assert(src_traits::rank == 0, @@ -3221,7 +3226,7 @@ create_mirror_view_and_copy( using Mirror = typename Impl::MirrorViewType::view_type; std::string label = name.empty() ? 
src.label() : name; auto mirror = typename Mirror::non_const_type{ - ViewAllocateWithoutInitializing(label), src.layout()}; + view_alloc(WithoutInitializing, label), src.layout()}; deep_copy(mirror, src); return mirror; } @@ -3248,8 +3253,7 @@ typename Impl::MirrorViewType::view_type create_mirror_view( !Impl::MirrorViewType::is_same_memspace>::type* = nullptr) { using Mirror = typename Impl::MirrorViewType::view_type; - return Mirror(Kokkos::ViewAllocateWithoutInitializing(src.label()), - src.layout()); + return Mirror(view_alloc(WithoutInitializing, src.label()), src.layout()); } } /* namespace Kokkos */ diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp index a1669addd6..4dac463a66 100644 --- a/lib/kokkos/core/src/Kokkos_Core.hpp +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -50,39 +50,13 @@ #include -#if defined(KOKKOS_ENABLE_SERIAL) -#include -#endif - -#if defined(KOKKOS_ENABLE_OPENMP) -#include -#endif - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) -#include -#include -#endif - -#if defined(KOKKOS_ENABLE_HPX) -#include -#endif - -#if defined(KOKKOS_ENABLE_THREADS) -#include -#endif - -#if defined(KOKKOS_ENABLE_CUDA) -#include -#endif - -#if defined(KOKKOS_ENABLE_ROCM) -#include -#endif -#if defined(KOKKOS_ENABLE_HIP) -#include -#endif +// Fundamental type description for half precision +// Should not rely on other backend infrastructure +#include +#include #include +#include #include #include #include @@ -91,11 +65,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include //---------------------------------------------------------------------------- @@ -108,16 +85,50 @@ struct InitArguments { int ndevices; int skip_device; bool disable_warnings; - - InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false) + bool tune_internals; + InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false, + bool ti = false) : num_threads{nt}, num_numa{nn}, device_id{dv}, ndevices{-1}, skip_device{9999}, - disable_warnings{dw} {} + disable_warnings{dw}, + tune_internals{ti} {} }; +namespace Impl { + +/* ExecSpaceManager - Responsible for initializing all of the registered + * backends. Backends are registered using the register_space_initializer() + * function which should be called from a global context so that it is called + * prior to initialize_spaces() which is called from Kokkos::initialize() + */ +class ExecSpaceManager { + std::map> + exec_space_factory_list; + + public: + ExecSpaceManager() = default; + + void register_space_factory(std::string name, + std::unique_ptr ptr); + void initialize_spaces(const Kokkos::InitArguments& args); + void finalize_spaces(const bool all_spaces); + void static_fence(); + void print_configuration(std::ostream& msg, const bool detail); + static ExecSpaceManager& get_instance(); +}; + +template +int initialize_space_factory(std::string name) { + auto space_ptr = std::make_unique(); + ExecSpaceManager::get_instance().register_space_factory(name, + std::move(space_ptr)); + return 1; +} + +} // namespace Impl void initialize(int& narg, char* arg[]); void initialize(InitArguments args = InitArguments()); @@ -133,6 +144,7 @@ void post_initialize(const InitArguments& args); bool is_initialized() noexcept; bool show_warnings() noexcept; +bool tune_internals() noexcept; /** \brief Finalize the spaces that were initialized via Kokkos::initialize */ void finalize(); @@ -264,6 +276,8 @@ class ScopeGuard { // implementation of the RAII wrapper is using Kokkos::single. 
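// ---------------------------------------------------------------------------
// Editor's note: a self-contained sketch of the registration idiom that
// ExecSpaceManager / initialize_space_factory above rely on, with hypothetical
// stand-in names. Each backend's translation unit initializes a global int by
// calling the registration function, so registration happens during static
// initialization, before Kokkos::initialize() walks the registered factories.
#include <functional>
#include <iostream>
#include <map>
#include <string>

struct Manager {
  std::map<std::string, std::function<void()>> factories;  // ordered by key
  static Manager& get_instance() {
    static Manager m;
    return m;
  }
};

int register_space_factory(std::string name, std::function<void()> init) {
  Manager::get_instance().factories.emplace(std::move(name), std::move(init));
  return 1;  // the int return exists only to force the call at load time
}

// Runs before main(); the numeric prefix ("150_...") fixes the ordering.
int g_demo_factory_initialized = register_space_factory(
    "150_Demo", [] { std::cout << "initializing Demo backend\n"; });

int main() {
  // Stand-in for initialize_spaces(): visit factories in key order.
  for (auto& entry : Manager::get_instance().factories) entry.second();
}
// ---------------------------------------------------------------------------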
diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
index 7667dde4e6..7502719c73 100644
--- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -87,58 +87,16 @@ namespace Kokkos {
 class HostSpace;  ///< Memory space for main process and CPU execution spaces
 class AnonymousSpace;
 
-#ifdef KOKKOS_ENABLE_HBWSPACE
-namespace Experimental {
-class HBWSpace;  /// Memory space for hbw_malloc from memkind (e.g. for KNL
-                 /// processor)
-}
-#endif
-
-#if defined(KOKKOS_ENABLE_SERIAL)
-class Serial;  ///< Execution space main process on CPU.
-#endif
-
-#if defined(KOKKOS_ENABLE_HPX)
-namespace Experimental {
-class HPX;  ///< Execution space with HPX back-end.
-}
-#endif
-
-#if defined(KOKKOS_ENABLE_THREADS)
-class Threads;  ///< Execution space with pthreads back-end.
-#endif
-
-#if defined(KOKKOS_ENABLE_OPENMP)
-class OpenMP;  ///< OpenMP execution space.
-#endif
-
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-namespace Experimental {
-class OpenMPTarget;  ///< OpenMPTarget execution space.
-class OpenMPTargetSpace;
-}  // namespace Experimental
-#endif
-
-#if defined(KOKKOS_ENABLE_ROCM)
-namespace Experimental {
-class ROCmSpace;  ///< Memory space on ROCm GPU
-class ROCm;       ///< Execution space for ROCm GPU
-}  // namespace Experimental
-#endif
-
-#if defined(KOKKOS_ENABLE_HIP)
-namespace Experimental {
-class HIPSpace;  ///< Memory space on HIP GPU
-class HIP;       ///< Execution space for HIP GPU
-}  // namespace Experimental
-#endif
-
 template <class ExecutionSpace, class MemorySpace>
 struct Device;
 
+// forward declare here so that backend initializer calls can use it.
+struct InitArguments;
+
 }  // namespace Kokkos
 
-#include "Cuda/Kokkos_Cuda_fwd.hpp"
+// Include backend forward statements as determined by build options
+#include <KokkosCore_Config_FwdBackend.hpp>
 
 //----------------------------------------------------------------------------
 // Set the default execution space.
@@ -168,9 +126,9 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION =
 #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP)
 using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION =
     Experimental::HIP;
-#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM)
+#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL)
 using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION =
-    Experimental::ROCm;
+    Experimental::SYCL;
 #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP)
 using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = OpenMP;
 #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS)
@@ -182,7 +140,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Serial;
 #else
 #error \
-    "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::Experimental::HIP, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial."
+    "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::Experimental::HIP, Kokkos::Experimental::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial."
 #endif
 
 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP)
@@ -228,8 +186,8 @@ namespace Impl {
 #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) && \
     defined(KOKKOS_ENABLE_CUDA)
 using ActiveExecutionMemorySpace = Kokkos::CudaSpace;
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU)
-using ActiveExecutionMemorySpace = Kokkos::HostSpace;
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
+using ActiveExecutionMemorySpace = Kokkos::Experimental::SYCLDeviceUSMSpace;
 #elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU)
 using ActiveExecutionMemorySpace = Kokkos::Experimental::HIPSpace;
 #elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
@@ -249,8 +207,17 @@ struct VerifyExecutionCanAccessMemorySpace {
   KOKKOS_INLINE_FUNCTION static void verify(void) {}
   KOKKOS_INLINE_FUNCTION static void verify(const void *) {}
 };
+
+// Base class for exec space initializer factories
+class ExecSpaceInitializerBase;
+
 }  // namespace Impl
 
+namespace Experimental {
+template <class, class, class, bool>
+class LogicalMemorySpace;
+}
+
 }  // namespace Kokkos
 
 #define KOKKOS_RESTRICT_EXECUTION_TO_DATA(DATA_SPACE, DATA_PTR) \
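An illustrative sketch, not part of the patch: the #elif chain above resolves once per build, so applications can inspect which default was chosen, at compile time or at run time.

    #include <Kokkos_Core.hpp>
    #include <iostream>

    int main(int argc, char* argv[]) {
      Kokkos::ScopeGuard guard(argc, argv);
      std::cout << "default execution space:      "
                << Kokkos::DefaultExecutionSpace::name() << '\n'
                << "default host execution space: "
                << Kokkos::DefaultHostExecutionSpace::name() << '\n';
    }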
diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp
index dfb884e514..4a573d82c0 100644
--- a/lib/kokkos/core/src/Kokkos_Crs.hpp
+++ b/lib/kokkos/core/src/Kokkos_Crs.hpp
@@ -287,7 +287,7 @@ void get_crs_transpose_counts(
 template <class OutRowMap, class InCounts>
 typename OutRowMap::value_type get_crs_row_map_from_counts(
     OutRowMap& out, InCounts const& in, std::string const& name) {
-  out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
+  out = OutRowMap(view_alloc(WithoutInitializing, name), in.size() + 1);
   Kokkos::Impl::CrsRowMapFromCounts<OutRowMap, InCounts> functor(in, out);
   return functor.execute();
 }
diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp
index a5b2182469..81e11f3f12 100644
--- a/lib/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp
@@ -62,6 +62,7 @@
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_ExecSpaceInitializer.hpp>
 
 /*--------------------------------------------------------------------------*/
 
@@ -270,6 +271,20 @@ struct DeviceTypeTraits<Cuda> {
 };
 }  // namespace Experimental
 }  // namespace Tools
+
+namespace Impl {
+
+class CudaSpaceInitializer : public ExecSpaceInitializerBase {
+ public:
+  CudaSpaceInitializer()  = default;
+  ~CudaSpaceInitializer() = default;
+  void initialize(const InitArguments& args) final;
+  void finalize(const bool all_spaces) final;
+  void fence() final;
+  void print_configuration(std::ostream& msg, const bool detail) final;
+};
+
+}  // namespace Impl
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -281,9 +296,9 @@ namespace Impl {
 template <>
 struct MemorySpaceAccess<Kokkos::CudaSpace,
                          Kokkos::Cuda::scratch_memory_space> {
-  enum { assignable = false };
-  enum { accessible = true };
-  enum { deepcopy = false };
+  enum : bool { assignable = false };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = false };
 };
 
 #if defined(KOKKOS_ENABLE_CUDA_UVM)
@@ -297,9 +312,9 @@ struct MemorySpaceAccess
 template <>
 struct MemorySpaceAccess<Kokkos::CudaUVMSpace,
                          Kokkos::Cuda::scratch_memory_space> {
-  enum { assignable = false };
-  enum { accessible = true };
-  enum { deepcopy = false };
+  enum : bool { assignable = false };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = false };
 };
 #endif
 
@@ -307,7 +322,7 @@ struct MemorySpaceAccess
 template <>
 struct VerifyExecutionCanAccessMemorySpace<Kokkos::CudaSpace,
                                            Kokkos::Cuda::scratch_memory_space> {
-  enum { value = true };
+  enum : bool { value = true };
   KOKKOS_INLINE_FUNCTION static void verify(void) {}
   KOKKOS_INLINE_FUNCTION static void verify(const void*) {}
 };
 
@@ -315,7 +330,7 @@ struct
 VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,
                                     Kokkos::Cuda::scratch_memory_space> {
-  enum { value = false };
+  enum : bool { value = false };
   inline static void verify(void) { CudaSpace::access_error(); }
   inline static void verify(const void* p) { CudaSpace::access_error(p); }
 };
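An illustrative sketch, not part of the patch: the MemorySpaceAccess metadata rewritten above is surfaced publicly through Kokkos::SpaceAccessibility; these asserts restate two of the relationships and should hold in any CUDA build.

    #include <Kokkos_Core.hpp>

    #ifdef KOKKOS_ENABLE_CUDA
    static_assert(Kokkos::SpaceAccessibility<Kokkos::Cuda,
                                             Kokkos::CudaUVMSpace>::accessible,
                  "CUDA kernels can dereference UVM allocations");
    static_assert(!Kokkos::SpaceAccessibility<Kokkos::Cuda,
                                              Kokkos::HostSpace>::accessible,
                  "CUDA kernels cannot dereference plain HostSpace");
    #endif

    int main() {}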
diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
index 0fb7841889..fc1c0e2f8a 100644
--- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
@@ -100,6 +100,20 @@ class CudaSpace {
                   const size_t arg_alloc_size,
                   const size_t arg_logical_size = 0) const;
 
+ private:
+  template <class, class, class, bool>
+  friend class Kokkos::Experimental::LogicalMemorySpace;
+  void* impl_allocate(const char* arg_label, const size_t arg_alloc_size,
+                      const size_t arg_logical_size = 0,
+                      const Kokkos::Tools::SpaceHandle =
+                          Kokkos::Tools::make_space_handle(name())) const;
+  void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                       const size_t arg_alloc_size,
+                       const size_t arg_logical_size = 0,
+                       const Kokkos::Tools::SpaceHandle =
+                           Kokkos::Tools::make_space_handle(name())) const;
+
+ public:
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name() { return m_name; }
 
@@ -197,6 +211,20 @@ class CudaUVMSpace {
                   const size_t arg_alloc_size,
                   const size_t arg_logical_size = 0) const;
 
+ private:
+  template <class, class, class, bool>
+  friend class Kokkos::Experimental::LogicalMemorySpace;
+  void* impl_allocate(const char* arg_label, const size_t arg_alloc_size,
+                      const size_t arg_logical_size = 0,
+                      const Kokkos::Tools::SpaceHandle =
+                          Kokkos::Tools::make_space_handle(name())) const;
+  void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                       const size_t arg_alloc_size,
+                       const size_t arg_logical_size = 0,
+                       const Kokkos::Tools::SpaceHandle =
+                           Kokkos::Tools::make_space_handle(name())) const;
+
+ public:
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name() { return m_name; }
 
@@ -254,6 +282,20 @@ class CudaHostPinnedSpace {
                   const size_t arg_alloc_size,
                   const size_t arg_logical_size = 0) const;
 
+ private:
+  template <class, class, class, bool>
+  friend class Kokkos::Experimental::LogicalMemorySpace;
+  void* impl_allocate(const char* arg_label, const size_t arg_alloc_size,
+                      const size_t arg_logical_size = 0,
+                      const Kokkos::Tools::SpaceHandle =
+                          Kokkos::Tools::make_space_handle(name())) const;
+  void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                       const size_t arg_alloc_size,
+                       const size_t arg_logical_size = 0,
+                       const Kokkos::Tools::SpaceHandle =
+                           Kokkos::Tools::make_space_handle(name())) const;
+
+ public:
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name() { return m_name; }
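An illustrative sketch, not part of the patch: these spaces are normally reached through View; for instance a host-pinned buffer that stays host-writable while being visible to the CUDA runtime. The allocation label flows into the profiling hooks via the Tools::SpaceHandle defaulted above.

    #include <Kokkos_Core.hpp>

    void pinned_demo() {
    #ifdef KOKKOS_ENABLE_CUDA
      // Backed by pinned host memory (cudaHostAlloc under the hood).
      Kokkos::View<int*, Kokkos::CudaHostPinnedSpace> buf("pinned_buf", 1024);
      buf(0) = 42;  // legal: pinned memory is host accessible
    #endif
    }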
@@ -286,50 +328,50 @@ static_assert(
 
 template <>
 struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaSpace> {
-  enum { assignable = false };
-  enum { accessible = false };
-  enum { deepcopy = true };
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
 };
 
 template <>
 struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaUVMSpace> {
   // HostSpace::execution_space != CudaUVMSpace::execution_space
-  enum { assignable = false };
-  enum { accessible = true };
-  enum { deepcopy = true };
+  enum : bool { assignable = false };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
 };
 
 template <>
 struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace> {
   // HostSpace::execution_space == CudaHostPinnedSpace::execution_space
-  enum { assignable = true };
-  enum { accessible = true };
-  enum { deepcopy = true };
+  enum : bool { assignable = true };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
 };
 
 //----------------------------------------
 
 template <>
 struct MemorySpaceAccess<Kokkos::CudaSpace, Kokkos::HostSpace> {
-  enum { assignable = false };
-  enum { accessible = false };
-  enum { deepcopy = true };
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
 };
 
 template <>
 struct MemorySpaceAccess<Kokkos::CudaSpace, Kokkos::CudaUVMSpace> {
   // CudaSpace::execution_space == CudaUVMSpace::execution_space
-  enum { assignable = true };
-  enum { accessible = true };
-  enum { deepcopy = true };
+  enum : bool { assignable = true };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
 };
 
 template <>
 struct MemorySpaceAccess<Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace> {
   // CudaSpace::execution_space != CudaHostPinnedSpace::execution_space
-  enum { assignable = false };
-  enum { accessible = true };  // CudaSpace::execution_space
-  enum { deepcopy = true };
+  enum : bool { assignable = false };
+  enum : bool { accessible = true };  // CudaSpace::execution_space
+  enum : bool { deepcopy = true };
 };
 
 //----------------------------------------
 
@@ -338,28 +380,28 @@ struct MemorySpaceAccess<Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace> {
 
 template <>
 struct MemorySpaceAccess<Kokkos::CudaUVMSpace, Kokkos::HostSpace> {
-  enum { assignable = false };
-  enum { accessible = false };  // Cuda cannot access HostSpace
-  enum { deepcopy = true };
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };  // Cuda cannot access HostSpace
+  enum : bool { deepcopy = true };
 };
 
 template <>
 struct MemorySpaceAccess<Kokkos::CudaUVMSpace, Kokkos::CudaSpace> {
   // CudaUVMSpace::execution_space == CudaSpace::execution_space
   // Can access CudaUVMSpace from Host but cannot access CudaSpace from Host
-  enum { assignable = false };
+  enum : bool { assignable = false };
 
   // CudaUVMSpace::execution_space can access CudaSpace
-  enum { accessible = true };
-  enum { deepcopy = true };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
 };
 
 template <>
 struct MemorySpaceAccess<Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace> {
   // CudaUVMSpace::execution_space != CudaHostPinnedSpace::execution_space
-  enum { assignable = false };
-  enum { accessible = true };  // CudaUVMSpace::execution_space
-  enum { deepcopy = true };
+  enum : bool { assignable = false };
+  enum : bool { accessible = true };  // CudaUVMSpace::execution_space
+  enum : bool { deepcopy = true };
 };
 
 //----------------------------------------
 
@@ -368,23 +410,23 @@ struct MemorySpaceAccess<Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace> {
 
 template <>
 struct MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace> {
-  enum { assignable = false };  // Cannot access from Cuda
-  enum { accessible = true };   // CudaHostPinnedSpace::execution_space
-  enum { deepcopy = true };
+  enum : bool { assignable = false };  // Cannot access from Cuda
+  enum : bool { accessible = true };   // CudaHostPinnedSpace::execution_space
+  enum : bool { deepcopy = true };
 };
 
 template <>
 struct MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, Kokkos::CudaSpace> {
-  enum { assignable = false };  // Cannot access from Host
-  enum { accessible = false };
-  enum { deepcopy = true };
+  enum : bool { assignable = false };  // Cannot access from Host
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
 };
 
 template <>
 struct MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, Kokkos::CudaUVMSpace> {
-  enum { assignable = false };  // different execution_space
-  enum { accessible = true };   // same accessibility
-  enum { deepcopy = true };
+  enum : bool { assignable = false };  // different execution_space
+  enum : bool { accessible = true };   // same accessibility
+  enum : bool { deepcopy = true };
 };
 
 //----------------------------------------
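An illustrative sketch, not part of the patch: one practical consequence of the accessibility table above is that create_mirror_view only allocates when the host genuinely cannot reach the memory, so a UVM view mirrors to itself.

    #include <Kokkos_Core.hpp>

    void mirror_demo() {
    #ifdef KOKKOS_ENABLE_CUDA
      Kokkos::View<double*, Kokkos::CudaSpace> dev("dev", 10);
      Kokkos::View<double*, Kokkos::CudaUVMSpace> uvm("uvm", 10);

      auto dev_mirror = Kokkos::create_mirror_view(dev);  // new HostSpace copy
      auto uvm_mirror = Kokkos::create_mirror_view(uvm);  // aliases `uvm`
      Kokkos::deep_copy(dev_mirror, dev);
    #endif
    }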
@@ -746,7 +788,7 @@ namespace Impl {
 template <>
 struct VerifyExecutionCanAccessMemorySpace<Kokkos::CudaSpace,
                                            Kokkos::HostSpace> {
-  enum { value = false };
+  enum : bool { value = false };
   KOKKOS_INLINE_FUNCTION static void verify(void) {
     Kokkos::abort("Cuda code attempted to access HostSpace memory");
   }
@@ -760,7 +802,7 @@ struct VerifyExecutionCanAccessMemorySpace
 template <>
 struct VerifyExecutionCanAccessMemorySpace<Kokkos::CudaSpace,
                                            Kokkos::CudaUVMSpace> {
-  enum { value = true };
+  enum : bool { value = true };
   KOKKOS_INLINE_FUNCTION static void verify(void) {}
   KOKKOS_INLINE_FUNCTION static void verify(const void*) {}
 };
@@ -769,7 +811,7 @@ struct VerifyExecutionCanAccessMemorySpace
 template <>
 struct VerifyExecutionCanAccessMemorySpace<Kokkos::CudaSpace,
                                            Kokkos::CudaHostPinnedSpace> {
-  enum { value = true };
+  enum : bool { value = true };
   KOKKOS_INLINE_FUNCTION static void verify(void) {}
   KOKKOS_INLINE_FUNCTION static void verify(const void*) {}
 };
@@ -780,7 +822,7 @@ struct VerifyExecutionCanAccessMemorySpace<
     typename std::enable_if<!std::is_same<Kokkos::CudaSpace, OtherSpace>::value,
                             Kokkos::CudaSpace>::type,
     OtherSpace> {
-  enum { value = false };
+  enum : bool { value = false };
   KOKKOS_INLINE_FUNCTION static void verify(void) {
     Kokkos::abort("Cuda code attempted to access unknown Space memory");
   }
@@ -795,7 +837,7 @@ struct VerifyExecutionCanAccessMemorySpace<
 template <>
 struct VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,
                                            Kokkos::CudaSpace> {
-  enum { value = false };
+  enum : bool { value = false };
   inline static void verify(void) { CudaSpace::access_error(); }
   inline static void verify(const void* p) { CudaSpace::access_error(p); }
 };
@@ -804,7 +846,7 @@ struct VerifyExecutionCanAccessMemorySpace
 template <>
 struct VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,
                                            Kokkos::CudaUVMSpace> {
-  enum { value = true };
+  enum : bool { value = true };
   inline static void verify(void) {}
   inline static void verify(const void*) {}
 };
@@ -813,7 +855,7 @@ struct VerifyExecutionCanAccessMemorySpace
 template <>
 struct VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,
                                            Kokkos::CudaHostPinnedSpace> {
-  enum { value = true };
+  enum : bool { value = true };
   KOKKOS_INLINE_FUNCTION static void verify(void) {}
   KOKKOS_INLINE_FUNCTION static void verify(const void*) {}
 };
@@ -844,7 +886,7 @@ class SharedAllocationRecord<Kokkos::CudaSpace, void>
       const unsigned sizeof_alias, void* const alloc_ptr,
       const size_t alloc_size);
 
-#ifdef KOKKOS_DEBUG
+#ifdef KOKKOS_ENABLE_DEBUG
   static RecordBase s_root_record;
 #endif
 
diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
index 17eef76038..3afe081701 100644
--- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -123,13 +123,19 @@ class RangePolicy : public Impl::PolicyTraits<Properties...> {
 
   template <class... OtherProperties>
   RangePolicy(const RangePolicy<OtherProperties...>& p)
-      : m_space(p.m_space),
+      : traits(p),  // base class may contain data such as desired occupancy
+        m_space(p.m_space),
         m_begin(p.m_begin),
         m_end(p.m_end),
         m_granularity(p.m_granularity),
         m_granularity_mask(p.m_granularity_mask) {}
 
-  inline RangePolicy() : m_space(), m_begin(0), m_end(0) {}
+  inline RangePolicy()
+      : m_space(),
+        m_begin(0),
+        m_end(0),
+        m_granularity(0),
+        m_granularity_mask(0) {}
 
   /** \brief  Total range */
   inline RangePolicy(const typename traits::execution_space& work_space,
@@ -358,6 +364,17 @@ class TeamPolicyInternal : public Impl::PolicyTraits<Properties...> {
    */
   KOKKOS_INLINE_FUNCTION int team_size() const;
 
+  /** \brief Whether the policy has an automatically determined team size
+   */
+  inline bool impl_auto_team_size() const;
+  /** \brief Whether the policy has an automatically determined vector length
+   */
+  inline bool impl_auto_vector_length() const;
+
+  static int vector_length_max();
+
+  KOKKOS_INLINE_FUNCTION int impl_vector_length() const;
+
   inline typename traits::index_type chunk_size() const;
 
   inline TeamPolicyInternal& set_chunk_size(int chunk_size);
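An illustrative sketch, not part of the patch: the impl_auto_* queries above back the new TeamPolicy constructors added in the next hunk, which accept Kokkos::AUTO for the vector length as well as for the team size.

    #include <Kokkos_Core.hpp>

    void team_demo() {
      using policy_t = Kokkos::TeamPolicy<>;
      // 64 teams; team size and vector length both left to Kokkos.
      policy_t policy(64, Kokkos::AUTO, Kokkos::AUTO);
      Kokkos::parallel_for(
          "team_demo", policy,
          KOKKOS_LAMBDA(const policy_t::member_type& team) {
            (void)team.league_rank();  // placeholder body
          });
    }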
@@ -554,6 +571,16 @@ class TeamPolicy
       : internal_policy(space_, league_size_request, Kokkos::AUTO(),
                         vector_length_request) {}
 
+  TeamPolicy(const typename traits::execution_space& space_,
+             int league_size_request, const Kokkos::AUTO_t&,
+             const Kokkos::AUTO_t&)
+      : internal_policy(space_, league_size_request, Kokkos::AUTO(),
+                        Kokkos::AUTO()) {}
+  TeamPolicy(const typename traits::execution_space& space_,
+             int league_size_request, const int team_size_request,
+             const Kokkos::AUTO_t&)
+      : internal_policy(space_, league_size_request, team_size_request,
+                        Kokkos::AUTO()) {}
   /** \brief  Construct policy with the default instance of the execution space
    */
   TeamPolicy(int league_size_request, int team_size_request,
@@ -566,8 +593,20 @@ class TeamPolicy
       : internal_policy(league_size_request, Kokkos::AUTO(),
                         vector_length_request) {}
 
+  TeamPolicy(int league_size_request, const Kokkos::AUTO_t&,
+             const Kokkos::AUTO_t&)
+      : internal_policy(league_size_request, Kokkos::AUTO(), Kokkos::AUTO()) {}
+  TeamPolicy(int league_size_request, const int team_size_request,
+             const Kokkos::AUTO_t&)
+      : internal_policy(league_size_request, team_size_request,
+                        Kokkos::AUTO()) {}
+
   template <class... OtherProperties>
-  TeamPolicy(const TeamPolicy<OtherProperties...> p) : internal_policy(p) {}
+  TeamPolicy(const TeamPolicy<OtherProperties...> p) : internal_policy(p) {
+    // Cannot call converting constructor in the member initializer list because
+    // it is not a direct base.
+    internal_policy::traits::operator=(p);
+  }
 
  private:
   TeamPolicy(const internal_policy& p) : internal_policy(p) {}
@@ -869,32 +908,50 @@ namespace Impl {
 
 template <class Functor, class Policy>
 struct PolicyPropertyAdaptor;
 
-template <unsigned long P, class... Properties>
+template <unsigned long P, template <class...> class Policy,
+          class... Properties>
 struct PolicyPropertyAdaptor<WorkItemProperty::ImplWorkItemProperty<P>,
-                             RangePolicy<Properties...>> {
-  using policy_in_t = RangePolicy<Properties...>;
-  using policy_out_t =
-      RangePolicy<typename policy_in_t::traits::execution_space,
-                  typename policy_in_t::traits::schedule_type,
-                  typename policy_in_t::traits::work_tag,
-                  typename policy_in_t::traits::index_type,
-                  typename policy_in_t::traits::iteration_pattern,
-                  typename policy_in_t::traits::launch_bounds,
-                  WorkItemProperty::ImplWorkItemProperty<P>>;
+                             Policy<Properties...>> {
+  using policy_in_t = Policy<Properties...>;
+  static_assert(is_execution_policy<policy_in_t>::value, "");
+  using policy_out_t = Policy<typename policy_in_t::traits::execution_space,
+                              typename policy_in_t::traits::schedule_type,
+                              typename policy_in_t::traits::work_tag,
+                              typename policy_in_t::traits::index_type,
+                              typename policy_in_t::traits::iteration_pattern,
+                              typename policy_in_t::traits::launch_bounds,
+                              WorkItemProperty::ImplWorkItemProperty<P>,
+                              typename policy_in_t::traits::occupancy_control>;
 };
 
-template <unsigned long P, class... Properties>
-struct PolicyPropertyAdaptor<WorkItemProperty::ImplWorkItemProperty<P>,
-                             TeamPolicy<Properties...>> {
-  using policy_in_t = TeamPolicy<Properties...>;
-  using policy_out_t =
-      TeamPolicy<typename policy_in_t::traits::execution_space,
-                 typename policy_in_t::traits::schedule_type,
-                 typename policy_in_t::traits::work_tag,
-                 typename policy_in_t::traits::index_type,
-                 typename policy_in_t::traits::iteration_pattern,
-                 typename policy_in_t::traits::launch_bounds,
-                 WorkItemProperty::ImplWorkItemProperty<P>>;
+template