From a9f0b7d523fa86538d1b1b19e3da8a049ad3f439 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 9 Jan 2017 10:39:46 -0700 Subject: [PATCH] Updating Kokkos lib --- lib/kokkos/.gitignore | 8 - lib/kokkos/CHANGELOG.md | 284 ++ lib/kokkos/CMakeLists.txt | 42 +- lib/kokkos/Makefile.kokkos | 260 +- lib/kokkos/Makefile.targets | 18 +- lib/kokkos/README | 26 +- lib/kokkos/algorithms/src/Kokkos_Random.hpp | 48 +- lib/kokkos/algorithms/src/Kokkos_Sort.hpp | 149 +- .../algorithms/unit_tests/CMakeLists.txt | 2 +- lib/kokkos/algorithms/unit_tests/Makefile | 21 +- lib/kokkos/algorithms/unit_tests/TestSort.hpp | 6 +- .../benchmarks/bytes_and_flops/Makefile | 43 + .../benchmarks/bytes_and_flops/bench.hpp | 99 + .../bytes_and_flops/bench_stride.hpp | 124 + .../bytes_and_flops/bench_unroll_stride.hpp | 148 + .../benchmarks/bytes_and_flops/main.cpp | 96 + lib/kokkos/benchmarks/gather/Makefile | 44 + lib/kokkos/benchmarks/gather/gather.hpp | 92 + .../benchmarks/gather/gather_unroll.hpp | 169 + .../gather/main.cpp} | 111 +- lib/kokkos/bin/nvcc_wrapper | 284 ++ lib/kokkos/cmake/deps/CUSPARSE.cmake | 14 +- lib/kokkos/cmake/tribits.cmake | 22 + lib/kokkos/config/configure_compton_cpu.sh | 0 lib/kokkos/config/configure_compton_mic.sh | 0 lib/kokkos/config/configure_kokkos.sh | 0 lib/kokkos/config/configure_kokkos_nvidia.sh | 0 lib/kokkos/config/configure_shannon.sh | 0 .../kokkos-trilinos-integration-procedure.txt | 17 +- lib/kokkos/config/master_history.txt | 5 +- lib/kokkos/config/nvcc_wrapper | 4 + lib/kokkos/config/test_all_sandia | 344 +- .../prepare_trilinos_repos.sh | 50 + .../performance_tests/CMakeLists.txt | 2 +- .../containers/performance_tests/Makefile | 21 +- .../containers/performance_tests/TestCuda.cpp | 2 +- .../performance_tests/TestDynRankView.hpp | 4 +- lib/kokkos/containers/src/Kokkos_DualView.hpp | 378 +- .../containers/src/Kokkos_DynRankView.hpp | 214 +- .../containers/src/Kokkos_DynamicView.hpp | 38 +- .../containers/src/Kokkos_ErrorReporter.hpp | 196 + 
.../containers/src/Kokkos_SegmentedView.hpp | 531 --- .../containers/src/Kokkos_UnorderedMap.hpp | 10 +- .../containers/unit_tests/CMakeLists.txt | 2 +- lib/kokkos/containers/unit_tests/Makefile | 21 +- lib/kokkos/containers/unit_tests/TestCuda.cpp | 28 +- .../containers/unit_tests/TestDynViewAPI.hpp | 11 +- .../unit_tests/TestErrorReporter.hpp | 227 ++ .../containers/unit_tests/TestOpenMP.cpp | 28 +- .../unit_tests/TestSegmentedView.hpp | 708 ---- .../containers/unit_tests/TestSerial.cpp | 24 +- .../containers/unit_tests/TestThreads.cpp | 26 +- lib/kokkos/core/cmake/Dependencies.cmake | 2 + lib/kokkos/core/cmake/KokkosCore_config.h.in | 10 + lib/kokkos/core/perf_test/CMakeLists.txt | 4 +- lib/kokkos/core/perf_test/Makefile | 21 +- lib/kokkos/core/perf_test/PerfTestHost.cpp | 17 +- .../core/src/Cuda/KokkosExp_Cuda_View.hpp | 334 -- lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp | 103 +- lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp | 8 +- .../core/src/Cuda/Kokkos_Cuda_Parallel.hpp | 272 +- .../core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp | 23 +- lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp | 4 +- lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp | 46 +- .../core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp | 932 ----- .../core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp | 833 ----- lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp | 269 +- .../core/src/Cuda/Kokkos_Cuda_abort.hpp | 36 +- lib/kokkos/core/src/Kokkos_Atomic.hpp | 19 +- lib/kokkos/core/src/Kokkos_Concepts.hpp | 268 +- lib/kokkos/core/src/Kokkos_Core.hpp | 12 +- lib/kokkos/core/src/Kokkos_Core_fwd.hpp | 19 +- lib/kokkos/core/src/Kokkos_Cuda.hpp | 38 +- lib/kokkos/core/src/Kokkos_CudaSpace.hpp | 150 +- lib/kokkos/core/src/Kokkos_ExecPolicy.hpp | 165 +- lib/kokkos/core/src/Kokkos_HBWSpace.hpp | 35 +- lib/kokkos/core/src/Kokkos_HostSpace.hpp | 54 +- lib/kokkos/core/src/Kokkos_Layout.hpp | 14 +- lib/kokkos/core/src/Kokkos_Macros.hpp | 99 +- lib/kokkos/core/src/Kokkos_MemoryPool.hpp | 289 +- lib/kokkos/core/src/Kokkos_MemoryTraits.hpp | 
4 + lib/kokkos/core/src/Kokkos_OpenMP.hpp | 13 +- .../core/src/Kokkos_Parallel_Reduce.hpp | 126 +- lib/kokkos/core/src/Kokkos_Qthread.hpp | 11 + lib/kokkos/core/src/Kokkos_Serial.hpp | 35 +- lib/kokkos/core/src/Kokkos_TaskPolicy.hpp | 1066 +----- lib/kokkos/core/src/Kokkos_TaskScheduler.hpp | 700 ++++ lib/kokkos/core/src/Kokkos_Threads.hpp | 11 + lib/kokkos/core/src/Kokkos_Timer.hpp | 112 + lib/kokkos/core/src/Kokkos_View.hpp | 462 ++- lib/kokkos/core/src/Makefile | 64 +- .../core/src/OpenMP/Kokkos_OpenMP_Task.cpp | 4 +- .../core/src/OpenMP/Kokkos_OpenMP_Task.hpp | 29 +- .../core/src/OpenMP/Kokkos_OpenMPexec.cpp | 12 +- .../core/src/OpenMP/Kokkos_OpenMPexec.hpp | 57 +- .../src/Qthread/Kokkos_Qthread_Parallel.hpp | 45 +- .../src/Qthread/Kokkos_Qthread_TaskPolicy.cpp | 6 +- .../src/Qthread/Kokkos_Qthread_TaskPolicy.hpp | 30 +- .../core/src/Threads/Kokkos_ThreadsExec.cpp | 12 +- .../core/src/Threads/Kokkos_ThreadsTeam.hpp | 25 +- .../src/Threads/Kokkos_Threads_TaskPolicy.cpp | 930 ----- .../src/Threads/Kokkos_Threads_TaskPolicy.hpp | 745 ---- .../core/src/impl/KokkosExp_ViewMapping.hpp | 2890 +-------------- .../core/src/impl/Kokkos_AnalyzeShape.hpp | 260 -- .../Kokkos_Atomic_Compare_Exchange_Strong.hpp | 51 +- .../core/src/impl/Kokkos_Atomic_Decrement.hpp | 12 +- .../core/src/impl/Kokkos_Atomic_Exchange.hpp | 57 +- .../core/src/impl/Kokkos_Atomic_Fetch_Add.hpp | 64 +- .../core/src/impl/Kokkos_Atomic_Fetch_And.hpp | 20 +- .../core/src/impl/Kokkos_Atomic_Fetch_Or.hpp | 20 +- .../core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp | 46 +- .../core/src/impl/Kokkos_Atomic_Generic.hpp | 62 +- .../core/src/impl/Kokkos_CPUDiscovery.cpp | 14 +- lib/kokkos/core/src/impl/Kokkos_Core.cpp | 61 +- lib/kokkos/core/src/impl/Kokkos_Error.hpp | 16 +- .../core/src/impl/Kokkos_FunctorAdapter.hpp | 10 +- lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp | 28 +- lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp | 72 +- .../core/src/impl/Kokkos_Memory_Fence.hpp | 2 +- 
.../src/impl/Kokkos_Profiling_Interface.cpp | 55 +- .../src/impl/Kokkos_Profiling_Interface.hpp | 33 + .../core/src/impl/Kokkos_Serial_Task.cpp | 5 +- .../core/src/impl/Kokkos_Serial_Task.hpp | 107 +- .../src/impl/Kokkos_Serial_TaskPolicy.cpp | 348 -- .../src/impl/Kokkos_Serial_TaskPolicy.hpp | 677 ---- lib/kokkos/core/src/impl/Kokkos_Shape.cpp | 178 - lib/kokkos/core/src/impl/Kokkos_Shape.hpp | 917 ----- ...SharedAlloc.cpp => Kokkos_SharedAlloc.cpp} | 26 +- ...SharedAlloc.hpp => Kokkos_SharedAlloc.hpp} | 12 +- lib/kokkos/core/src/impl/Kokkos_Tags.hpp | 127 +- lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp | 40 +- .../core/src/impl/Kokkos_TaskQueue_impl.hpp | 7 +- lib/kokkos/core/src/impl/Kokkos_Timer.hpp | 67 +- lib/kokkos/core/src/impl/Kokkos_Utilities.hpp | 414 +++ ...Exp_ViewArray.hpp => Kokkos_ViewArray.hpp} | 24 +- ...osExp_ViewCtor.hpp => Kokkos_ViewCtor.hpp} | 2 - .../core/src/impl/Kokkos_ViewDefault.hpp | 886 ----- .../core/src/impl/Kokkos_ViewMapping.hpp | 3156 +++++++++++++++++ .../core/src/impl/Kokkos_ViewOffset.hpp | 1341 ------- .../core/src/impl/Kokkos_ViewSupport.hpp | 393 -- ...osExp_ViewTile.hpp => Kokkos_ViewTile.hpp} | 20 +- lib/kokkos/core/unit_test/CMakeLists.txt | 102 +- lib/kokkos/core/unit_test/Makefile | 74 +- lib/kokkos/core/unit_test/TestAggregate.hpp | 2 +- .../core/unit_test/TestAggregateReduction.hpp | 191 - .../core/unit_test/TestAtomicOperations.hpp | 144 + .../core/unit_test/TestCompilerMacros.hpp | 2 + lib/kokkos/core/unit_test/TestCuda.cpp | 290 -- lib/kokkos/core/unit_test/TestCuda_a.cpp | 182 - lib/kokkos/core/unit_test/TestCuda_b.cpp | 191 - .../core/unit_test/TestDefaultDeviceType.cpp | 177 +- .../unit_test/TestDefaultDeviceTypeInit.hpp | 28 +- .../unit_test/TestDefaultDeviceType_a.cpp | 4 +- .../TestDefaultDeviceType_b.cpp} | 45 +- ...acking.hpp => TestDefaultDeviceType_c.cpp} | 58 +- .../unit_test/TestDefaultDeviceType_d.cpp | 237 ++ lib/kokkos/core/unit_test/TestMemoryPool.hpp | 36 +- 
lib/kokkos/core/unit_test/TestOpenMP_c.cpp | 262 -- .../core/unit_test/TestPolicyConstruction.hpp | 4 + lib/kokkos/core/unit_test/TestQthread.cpp | 13 +- lib/kokkos/core/unit_test/TestRange.hpp | 4 +- lib/kokkos/core/unit_test/TestReduce.hpp | 69 +- lib/kokkos/core/unit_test/TestSerial.cpp | 571 --- lib/kokkos/core/unit_test/TestSharedAlloc.hpp | 12 +- lib/kokkos/core/unit_test/TestSynchronic.cpp | 2 +- lib/kokkos/core/unit_test/TestSynchronic.hpp | 1 + lib/kokkos/core/unit_test/TestTaskPolicy.hpp | 1145 ------ .../core/unit_test/TestTaskScheduler.hpp | 551 +++ lib/kokkos/core/unit_test/TestTeam.hpp | 141 +- lib/kokkos/core/unit_test/TestTeamVector.hpp | 81 +- lib/kokkos/core/unit_test/TestThreads.cpp | 614 ---- lib/kokkos/core/unit_test/TestTile.hpp | 3 +- lib/kokkos/core/unit_test/TestUtilities.hpp | 306 ++ lib/kokkos/core/unit_test/TestViewAPI.hpp | 55 - lib/kokkos/core/unit_test/TestViewImpl.hpp | 289 -- lib/kokkos/core/unit_test/TestViewMapping.hpp | 268 +- lib/kokkos/core/unit_test/TestViewOfClass.hpp | 32 - lib/kokkos/core/unit_test/TestViewSubview.hpp | 565 ++- lib/kokkos/core/unit_test/cuda/TestCuda.hpp | 107 + .../TestCuda_Atomics.cpp} | 233 +- .../core/unit_test/cuda/TestCuda_Other.cpp | 189 + .../unit_test/cuda/TestCuda_Reductions_a.cpp | 56 + .../unit_test/cuda/TestCuda_Reductions_b.cpp | 130 + .../core/unit_test/cuda/TestCuda_Spaces.cpp | 399 +++ .../unit_test/cuda/TestCuda_SubView_a.cpp | 92 + .../unit_test/cuda/TestCuda_SubView_b.cpp | 60 + .../unit_test/cuda/TestCuda_SubView_c01.cpp | 52 + .../unit_test/cuda/TestCuda_SubView_c02.cpp | 52 + .../unit_test/cuda/TestCuda_SubView_c03.cpp | 52 + .../cuda/TestCuda_SubView_c04.cpp} | 23 +- .../unit_test/cuda/TestCuda_SubView_c05.cpp | 52 + .../unit_test/cuda/TestCuda_SubView_c06.cpp | 52 + .../unit_test/cuda/TestCuda_SubView_c07.cpp | 52 + .../unit_test/cuda/TestCuda_SubView_c08.cpp | 52 + .../unit_test/cuda/TestCuda_SubView_c09.cpp | 52 + .../unit_test/cuda/TestCuda_SubView_c10.cpp | 52 + 
.../unit_test/cuda/TestCuda_SubView_c11.cpp | 52 + .../unit_test/cuda/TestCuda_SubView_c12.cpp | 52 + .../unit_test/cuda/TestCuda_SubView_c_all.cpp | 12 + .../core/unit_test/cuda/TestCuda_Team.cpp | 120 + .../unit_test/cuda/TestCuda_ViewAPI_a.cpp | 59 + .../unit_test/cuda/TestCuda_ViewAPI_b.cpp | 53 + .../unit_test/cuda/TestCuda_ViewAPI_c.cpp | 53 + .../unit_test/cuda/TestCuda_ViewAPI_d.cpp | 112 + .../unit_test/cuda/TestCuda_ViewAPI_e.cpp | 63 + .../unit_test/cuda/TestCuda_ViewAPI_f.cpp | 55 + .../unit_test/cuda/TestCuda_ViewAPI_g.cpp | 52 + .../unit_test/cuda/TestCuda_ViewAPI_h.cpp | 52 + .../core/unit_test/openmp/TestOpenMP.hpp | 116 + .../TestOpenMP_Atomics.cpp} | 124 +- .../unit_test/openmp/TestOpenMP_Other.cpp | 189 + .../openmp/TestOpenMP_Reductions.cpp | 138 + .../TestOpenMP_SubView_a.cpp} | 68 +- .../unit_test/openmp/TestOpenMP_SubView_b.cpp | 60 + .../openmp/TestOpenMP_SubView_c01.cpp | 52 + .../openmp/TestOpenMP_SubView_c02.cpp | 52 + .../openmp/TestOpenMP_SubView_c03.cpp | 52 + .../openmp/TestOpenMP_SubView_c04.cpp | 52 + .../openmp/TestOpenMP_SubView_c05.cpp | 52 + .../openmp/TestOpenMP_SubView_c06.cpp | 52 + .../openmp/TestOpenMP_SubView_c07.cpp | 52 + .../openmp/TestOpenMP_SubView_c08.cpp | 52 + .../openmp/TestOpenMP_SubView_c09.cpp | 52 + .../openmp/TestOpenMP_SubView_c10.cpp} | 22 +- .../openmp/TestOpenMP_SubView_c11.cpp | 52 + .../openmp/TestOpenMP_SubView_c12.cpp | 52 + .../openmp/TestOpenMP_SubView_c_all.cpp | 12 + .../TestOpenMP_Team.cpp} | 141 +- .../unit_test/openmp/TestOpenMP_ViewAPI_a.cpp | 53 + .../unit_test/openmp/TestOpenMP_ViewAPI_b.cpp | 121 + .../core/unit_test/serial/TestSerial.hpp | 102 + .../unit_test/serial/TestSerial_Atomics.cpp | 168 + .../unit_test/serial/TestSerial_Other.cpp | 165 + .../serial/TestSerial_Reductions.cpp | 122 + .../unit_test/serial/TestSerial_SubView_a.cpp | 92 + .../unit_test/serial/TestSerial_SubView_b.cpp | 60 + .../serial/TestSerial_SubView_c01.cpp | 52 + .../serial/TestSerial_SubView_c02.cpp | 52 + 
.../serial/TestSerial_SubView_c03.cpp | 52 + .../serial/TestSerial_SubView_c04.cpp | 52 + .../serial/TestSerial_SubView_c05.cpp | 52 + .../serial/TestSerial_SubView_c06.cpp | 52 + .../serial/TestSerial_SubView_c07.cpp | 52 + .../serial/TestSerial_SubView_c08.cpp | 52 + .../serial/TestSerial_SubView_c09.cpp | 52 + .../serial/TestSerial_SubView_c10.cpp | 52 + .../serial/TestSerial_SubView_c11.cpp | 52 + .../serial/TestSerial_SubView_c12.cpp | 52 + .../serial/TestSerial_SubView_c_all.cpp | 12 + .../core/unit_test/serial/TestSerial_Team.cpp | 117 + .../unit_test/serial/TestSerial_ViewAPI_a.cpp | 53 + .../unit_test/serial/TestSerial_ViewAPI_b.cpp | 121 + .../core/unit_test/threads/TestThreads.hpp | 114 + .../unit_test/threads/TestThreads_Atomics.cpp | 168 + .../unit_test/threads/TestThreads_Other.cpp | 189 + .../threads/TestThreads_Reductions.cpp | 138 + .../threads/TestThreads_SubView_a.cpp | 92 + .../threads/TestThreads_SubView_b.cpp | 60 + .../threads/TestThreads_SubView_c01.cpp | 52 + .../threads/TestThreads_SubView_c02.cpp | 52 + .../threads/TestThreads_SubView_c03.cpp | 52 + .../threads/TestThreads_SubView_c04.cpp | 52 + .../threads/TestThreads_SubView_c05.cpp | 52 + .../threads/TestThreads_SubView_c06.cpp | 52 + .../threads/TestThreads_SubView_c07.cpp | 52 + .../threads/TestThreads_SubView_c08.cpp | 52 + .../threads/TestThreads_SubView_c09.cpp | 52 + .../threads/TestThreads_SubView_c10.cpp | 52 + .../threads/TestThreads_SubView_c11.cpp | 52 + .../threads/TestThreads_SubView_c12.cpp | 52 + .../unit_test/threads/TestThreads_Team.cpp | 122 + .../threads/TestThreads_ViewAPI_a.cpp | 53 + .../threads/TestThreads_ViewAPI_b.cpp | 121 + lib/kokkos/doc/README | 32 - .../doc/design_notes_space_instances.md | 166 + lib/kokkos/example/common/VectorImport.hpp | 4 +- lib/kokkos/example/feint/ElemFunctor.hpp | 4 - lib/kokkos/example/feint/Makefile | 32 +- lib/kokkos/example/fenl/Makefile | 20 +- lib/kokkos/example/fenl/fenl_impl.hpp | 2 +- lib/kokkos/example/fixture/Makefile | 
30 +- .../example/global_2_local_ids/Makefile | 27 +- lib/kokkos/example/grow_array/Makefile | 27 +- lib/kokkos/example/ichol/Makefile | 63 - .../example_chol_performance_device.hpp | 240 -- .../example_chol_performance_device_cuda.cpp | 70 - ...xample_chol_performance_device_pthread.cpp | 67 - lib/kokkos/example/ichol/src/chol.hpp | 92 - lib/kokkos/example/ichol/src/chol_u.hpp | 23 - .../ichol/src/chol_u_right_look_by_blocks.hpp | 394 -- .../ichol/src/chol_u_unblocked_opt1.hpp | 90 - .../ichol/src/chol_u_unblocked_opt2.hpp | 154 - lib/kokkos/example/ichol/src/control.hpp | 110 - lib/kokkos/example/ichol/src/coo.hpp | 75 - .../example/ichol/src/crs_matrix_base.hpp | 598 ---- .../ichol/src/crs_matrix_base_import.hpp | 104 - .../example/ichol/src/crs_matrix_helper.hpp | 71 - .../ichol/src/crs_matrix_helper_impl.hpp | 364 -- .../example/ichol/src/crs_matrix_view.hpp | 226 -- lib/kokkos/example/ichol/src/crs_row_view.hpp | 185 - lib/kokkos/example/ichol/src/dot.hpp | 74 - lib/kokkos/example/ichol/src/gemm.hpp | 99 - lib/kokkos/example/ichol/src/gemm_ct_nt.hpp | 12 - .../src/gemm_ct_nt_for_factor_blocked.hpp | 108 - .../example/ichol/src/graph_helper_scotch.hpp | 427 --- lib/kokkos/example/ichol/src/herk.hpp | 91 - lib/kokkos/example/ichol/src/herk_u_ct.hpp | 11 - .../src/herk_u_ct_for_factor_blocked.hpp | 103 - lib/kokkos/example/ichol/src/norm.hpp | 82 - lib/kokkos/example/ichol/src/partition.hpp | 381 -- lib/kokkos/example/ichol/src/scale.hpp | 92 - .../ichol/src/symbolic_factor_helper.hpp | 379 -- .../example/ichol/src/symbolic_task.hpp | 118 - lib/kokkos/example/ichol/src/task_factory.hpp | 77 - lib/kokkos/example/ichol/src/task_view.hpp | 104 - lib/kokkos/example/ichol/src/trsm.hpp | 92 - lib/kokkos/example/ichol/src/trsm_l_u_ct.hpp | 14 - .../src/trsm_l_u_ct_for_factor_blocked.hpp | 185 - lib/kokkos/example/ichol/src/util.cpp | 4 - lib/kokkos/example/ichol/src/util.hpp | 237 -- lib/kokkos/example/md_skeleton/Makefile | 27 +- lib/kokkos/example/multi_fem/Makefile 
| 28 +- lib/kokkos/example/query_device/Makefile | 27 +- lib/kokkos/example/sort_array/Makefile | 27 +- lib/kokkos/example/sort_array/sort_array.hpp | 2 +- .../example/tutorial/01_hello_world/Makefile | 17 +- .../tutorial/01_hello_world_lambda/Makefile | 19 +- .../hello_world_lambda.cpp | 5 +- .../tutorial/02_simple_reduce/Makefile | 17 +- .../tutorial/02_simple_reduce_lambda/Makefile | 19 +- .../simple_reduce_lambda.cpp | 8 + .../example/tutorial/03_simple_view/Makefile | 17 +- .../tutorial/03_simple_view_lambda/Makefile | 19 +- .../simple_view_lambda.cpp | 4 + .../tutorial/04_simple_memoryspaces/Makefile | 17 +- .../tutorial/05_simple_atomics/Makefile | 17 +- .../Advanced_Views/01_data_layouts/Makefile | 17 +- .../Advanced_Views/02_memory_traits/Makefile | 17 +- .../Advanced_Views/03_subviews/Makefile | 17 +- .../Advanced_Views/04_dualviews/Makefile | 17 +- .../Advanced_Views/04_dualviews/dual_view.cpp | 2 +- .../Advanced_Views/05_NVIDIA_UVM/Makefile | 17 +- .../05_NVIDIA_UVM/uvm_example.cpp | 22 +- .../Advanced_Views/06_AtomicViews/Makefile | 17 +- .../07_Overlapping_DeepCopy/Makefile | 17 +- .../example/tutorial/Advanced_Views/Makefile | 153 +- .../Algorithms/01_random_numbers/Makefile | 17 +- .../01_random_numbers/random_numbers.cpp | 10 +- .../example/tutorial/Algorithms/Makefile | 47 +- .../01_thread_teams/Makefile | 17 +- .../01_thread_teams_lambda/Makefile | 19 +- .../thread_teams_lambda.cpp | 5 +- .../02_nested_parallel_for/Makefile | 17 +- .../03_vectorization/Makefile | 17 +- .../04_team_scan/Makefile | 17 +- .../04_team_scan/team_scan.cpp | 5 +- .../Hierarchical_Parallelism/Makefile | 115 +- lib/kokkos/example/tutorial/Makefile | 238 +- lib/kokkos/generate_makefile.bash | 211 +- 359 files changed, 20077 insertions(+), 27456 deletions(-) delete mode 100644 lib/kokkos/.gitignore create mode 100644 lib/kokkos/CHANGELOG.md create mode 100644 lib/kokkos/benchmarks/bytes_and_flops/Makefile create mode 100644 lib/kokkos/benchmarks/bytes_and_flops/bench.hpp 
create mode 100644 lib/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp create mode 100644 lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp create mode 100644 lib/kokkos/benchmarks/bytes_and_flops/main.cpp create mode 100644 lib/kokkos/benchmarks/gather/Makefile create mode 100644 lib/kokkos/benchmarks/gather/gather.hpp create mode 100644 lib/kokkos/benchmarks/gather/gather_unroll.hpp rename lib/kokkos/{core/src/impl/Kokkos_HBWAllocators.cpp => benchmarks/gather/main.cpp} (54%) create mode 100755 lib/kokkos/bin/nvcc_wrapper mode change 100755 => 100644 lib/kokkos/config/configure_compton_cpu.sh mode change 100755 => 100644 lib/kokkos/config/configure_compton_mic.sh mode change 100755 => 100644 lib/kokkos/config/configure_kokkos.sh mode change 100755 => 100644 lib/kokkos/config/configure_kokkos_nvidia.sh mode change 100755 => 100644 lib/kokkos/config/configure_shannon.sh create mode 100755 lib/kokkos/config/trilinos-integration/prepare_trilinos_repos.sh create mode 100644 lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp delete mode 100644 lib/kokkos/containers/src/Kokkos_SegmentedView.hpp create mode 100644 lib/kokkos/containers/unit_tests/TestErrorReporter.hpp delete mode 100644 lib/kokkos/containers/unit_tests/TestSegmentedView.hpp delete mode 100644 lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp delete mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp delete mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp create mode 100644 lib/kokkos/core/src/Kokkos_TaskScheduler.hpp create mode 100644 lib/kokkos/core/src/Kokkos_Timer.hpp delete mode 100644 lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp delete mode 100644 lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp delete mode 100644 lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp delete mode 100644 lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp delete mode 100644 lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp delete mode 100644 
lib/kokkos/core/src/impl/Kokkos_Shape.cpp delete mode 100644 lib/kokkos/core/src/impl/Kokkos_Shape.hpp rename lib/kokkos/core/src/impl/{KokkosExp_SharedAlloc.cpp => Kokkos_SharedAlloc.cpp} (85%) rename lib/kokkos/core/src/impl/{KokkosExp_SharedAlloc.hpp => Kokkos_SharedAlloc.hpp} (96%) create mode 100644 lib/kokkos/core/src/impl/Kokkos_Utilities.hpp rename lib/kokkos/core/src/impl/{KokkosExp_ViewArray.hpp => Kokkos_ViewArray.hpp} (96%) rename lib/kokkos/core/src/impl/{KokkosExp_ViewCtor.hpp => Kokkos_ViewCtor.hpp} (99%) delete mode 100644 lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp create mode 100644 lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp delete mode 100644 lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp delete mode 100644 lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp rename lib/kokkos/core/src/impl/{KokkosExp_ViewTile.hpp => Kokkos_ViewTile.hpp} (92%) delete mode 100644 lib/kokkos/core/unit_test/TestAggregateReduction.hpp delete mode 100644 lib/kokkos/core/unit_test/TestCuda.cpp delete mode 100644 lib/kokkos/core/unit_test/TestCuda_a.cpp delete mode 100644 lib/kokkos/core/unit_test/TestCuda_b.cpp rename lib/kokkos/core/{src/impl/Kokkos_HBWAllocators.hpp => unit_test/TestDefaultDeviceType_b.cpp} (78%) rename lib/kokkos/core/unit_test/{TestMemorySpaceTracking.hpp => TestDefaultDeviceType_c.cpp} (67%) create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceType_d.cpp delete mode 100644 lib/kokkos/core/unit_test/TestOpenMP_c.cpp delete mode 100644 lib/kokkos/core/unit_test/TestSerial.cpp delete mode 100644 lib/kokkos/core/unit_test/TestTaskPolicy.hpp create mode 100644 lib/kokkos/core/unit_test/TestTaskScheduler.hpp delete mode 100644 lib/kokkos/core/unit_test/TestThreads.cpp create mode 100644 lib/kokkos/core/unit_test/TestUtilities.hpp delete mode 100644 lib/kokkos/core/unit_test/TestViewImpl.hpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda.hpp rename lib/kokkos/core/unit_test/{TestCuda_c.cpp => cuda/TestCuda_Atomics.cpp} (63%) 
create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_a.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_b.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_b.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c01.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c02.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp rename lib/kokkos/core/{src/impl/Kokkos_Singleton.hpp => unit_test/cuda/TestCuda_SubView_c04.cpp} (89%) create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c05.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c06.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c07.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c08.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c09.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_Team.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_a.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_c.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_d.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_e.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_f.cpp create mode 100644 
lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_g.cpp create mode 100644 lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_h.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp rename lib/kokkos/core/unit_test/{TestOpenMP.cpp => openmp/TestOpenMP_Atomics.cpp} (80%) create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp rename lib/kokkos/core/unit_test/{TestOpenMP_a.cpp => openmp/TestOpenMP_SubView_a.cpp} (70%) create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp rename lib/kokkos/core/{src/impl/Kokkos_ViewTileLeft.hpp => unit_test/openmp/TestOpenMP_SubView_c10.cpp} (89%) create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp rename lib/kokkos/core/unit_test/{TestOpenMP_b.cpp => openmp/TestOpenMP_Team.cpp} (52%) create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_a.cpp create mode 100644 lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial.hpp create mode 100644 
lib/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_Team.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_a.cpp create mode 100644 lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads.hpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp 
create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_Team.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_a.cpp create mode 100644 lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp delete mode 100644 lib/kokkos/doc/README create mode 100644 lib/kokkos/doc/design_notes_space_instances.md delete mode 100644 lib/kokkos/example/ichol/Makefile delete mode 100644 lib/kokkos/example/ichol/example/example_chol_performance_device.hpp delete mode 100644 lib/kokkos/example/ichol/example/example_chol_performance_device_cuda.cpp delete mode 100644 lib/kokkos/example/ichol/example/example_chol_performance_device_pthread.cpp delete mode 100644 lib/kokkos/example/ichol/src/chol.hpp delete mode 100644 lib/kokkos/example/ichol/src/chol_u.hpp delete mode 100644 lib/kokkos/example/ichol/src/chol_u_right_look_by_blocks.hpp delete mode 100644 lib/kokkos/example/ichol/src/chol_u_unblocked_opt1.hpp delete 
mode 100644 lib/kokkos/example/ichol/src/chol_u_unblocked_opt2.hpp delete mode 100644 lib/kokkos/example/ichol/src/control.hpp delete mode 100644 lib/kokkos/example/ichol/src/coo.hpp delete mode 100644 lib/kokkos/example/ichol/src/crs_matrix_base.hpp delete mode 100644 lib/kokkos/example/ichol/src/crs_matrix_base_import.hpp delete mode 100644 lib/kokkos/example/ichol/src/crs_matrix_helper.hpp delete mode 100644 lib/kokkos/example/ichol/src/crs_matrix_helper_impl.hpp delete mode 100644 lib/kokkos/example/ichol/src/crs_matrix_view.hpp delete mode 100644 lib/kokkos/example/ichol/src/crs_row_view.hpp delete mode 100644 lib/kokkos/example/ichol/src/dot.hpp delete mode 100644 lib/kokkos/example/ichol/src/gemm.hpp delete mode 100644 lib/kokkos/example/ichol/src/gemm_ct_nt.hpp delete mode 100644 lib/kokkos/example/ichol/src/gemm_ct_nt_for_factor_blocked.hpp delete mode 100644 lib/kokkos/example/ichol/src/graph_helper_scotch.hpp delete mode 100644 lib/kokkos/example/ichol/src/herk.hpp delete mode 100644 lib/kokkos/example/ichol/src/herk_u_ct.hpp delete mode 100644 lib/kokkos/example/ichol/src/herk_u_ct_for_factor_blocked.hpp delete mode 100644 lib/kokkos/example/ichol/src/norm.hpp delete mode 100644 lib/kokkos/example/ichol/src/partition.hpp delete mode 100644 lib/kokkos/example/ichol/src/scale.hpp delete mode 100644 lib/kokkos/example/ichol/src/symbolic_factor_helper.hpp delete mode 100644 lib/kokkos/example/ichol/src/symbolic_task.hpp delete mode 100644 lib/kokkos/example/ichol/src/task_factory.hpp delete mode 100644 lib/kokkos/example/ichol/src/task_view.hpp delete mode 100644 lib/kokkos/example/ichol/src/trsm.hpp delete mode 100644 lib/kokkos/example/ichol/src/trsm_l_u_ct.hpp delete mode 100644 lib/kokkos/example/ichol/src/trsm_l_u_ct_for_factor_blocked.hpp delete mode 100644 lib/kokkos/example/ichol/src/util.cpp delete mode 100644 lib/kokkos/example/ichol/src/util.hpp diff --git a/lib/kokkos/.gitignore b/lib/kokkos/.gitignore deleted file mode 100644 index 
f9d16be155..0000000000 --- a/lib/kokkos/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Standard ignores -*~ -*.pyc -\#*# -.#* -.*.swp -.cproject -.project diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md new file mode 100644 index 0000000000..a444f08eed --- /dev/null +++ b/lib/kokkos/CHANGELOG.md @@ -0,0 +1,284 @@ +# Change Log + +## [2.02.07](https://github.com/kokkos/kokkos/tree/2.02.07) (2016-12-16) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.01...2.02.07) + +**Implemented enhancements:** + +- Add CMake option to enable Cuda Lambda support [\#589](https://github.com/kokkos/kokkos/issues/589) +- Add CMake option to enable Cuda RDC support [\#588](https://github.com/kokkos/kokkos/issues/588) +- Add Initial Intel Sky Lake Xeon-HPC Compiler Support to Kokkos Make System [\#584](https://github.com/kokkos/kokkos/issues/584) +- Building Tutorial Examples [\#582](https://github.com/kokkos/kokkos/issues/582) +- Internal way for using ThreadVectorRange without TeamHandle [\#574](https://github.com/kokkos/kokkos/issues/574) +- Testing: Add testing for uvm and rdc [\#571](https://github.com/kokkos/kokkos/issues/571) +- Profiling: Add Memory Tracing and Region Markers [\#557](https://github.com/kokkos/kokkos/issues/557) +- nvcc\_wrapper not installed with Kokkos built with CUDA through CMake [\#543](https://github.com/kokkos/kokkos/issues/543) +- Improve DynRankView debug check [\#541](https://github.com/kokkos/kokkos/issues/541) +- Benchmarks: Add Gather benchmark [\#536](https://github.com/kokkos/kokkos/issues/536) +- Testing: add spot\_check option to test\_all\_sandia [\#535](https://github.com/kokkos/kokkos/issues/535) +- Deprecate Kokkos::Impl::VerifyExecutionCanAccessMemorySpace [\#527](https://github.com/kokkos/kokkos/issues/527) +- Add AtomicAdd support for 64bit float for Pascal [\#522](https://github.com/kokkos/kokkos/issues/522) +- Add Restrict and Aligned memory trait [\#517](https://github.com/kokkos/kokkos/issues/517) +- Kokkos 
Tests are Not Run using Compiler Optimization [\#501](https://github.com/kokkos/kokkos/issues/501) +- Add support for clang 3.7 w/ openmp backend [\#393](https://github.com/kokkos/kokkos/issues/393) +- Provide an error throw class [\#79](https://github.com/kokkos/kokkos/issues/79) + +**Fixed bugs:** + +- Cuda UVM Allocation test broken with UVM as default space [\#586](https://github.com/kokkos/kokkos/issues/586) +- Bug \(develop branch only\): multiple tests are now failing when forcing uvm usage. [\#570](https://github.com/kokkos/kokkos/issues/570) +- Error in generate\_makefile.sh for Kokkos when Compiler is Empty String/Fails [\#568](https://github.com/kokkos/kokkos/issues/568) +- XL 13.1.4 incorrect C++11 flag [\#553](https://github.com/kokkos/kokkos/issues/553) +- Improve DynRankView debug check [\#541](https://github.com/kokkos/kokkos/issues/541) +- Installing Library on MAC broken due to cp -u [\#539](https://github.com/kokkos/kokkos/issues/539) +- Intel Nightly Testing with Debug enabled fails [\#534](https://github.com/kokkos/kokkos/issues/534) + +## [2.02.01](https://github.com/kokkos/kokkos/tree/2.02.01) (2016-11-01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.00...2.02.01) + +**Implemented enhancements:** + +- Add Changelog generation to our process. 
[\#506](https://github.com/kokkos/kokkos/issues/506) + +**Fixed bugs:** + +- Test scratch\_request fails in Serial with Debug enabled [\#520](https://github.com/kokkos/kokkos/issues/520) +- Bug In BoundsCheck for DynRankView [\#516](https://github.com/kokkos/kokkos/issues/516) + +## [2.02.00](https://github.com/kokkos/kokkos/tree/2.02.00) (2016-10-30) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.10...2.02.00) + +**Implemented enhancements:** + +- Add PowerPC assembly for grabbing clock register in memory pool [\#511](https://github.com/kokkos/kokkos/issues/511) +- Add GCC 6.x support [\#508](https://github.com/kokkos/kokkos/issues/508) +- Test install and build against installed library [\#498](https://github.com/kokkos/kokkos/issues/498) +- Makefile.kokkos adds expt-extended-lambda to cuda build with clang [\#490](https://github.com/kokkos/kokkos/issues/490) +- Add top-level makefile option to just test kokkos-core unit-test [\#485](https://github.com/kokkos/kokkos/issues/485) +- Split and harmonize Object Files of Core UnitTests to increase build parallelism [\#484](https://github.com/kokkos/kokkos/issues/484) +- LayoutLeft to LayoutLeft subview for 3D and 4D views [\#473](https://github.com/kokkos/kokkos/issues/473) +- Add official Cuda 8.0 support [\#468](https://github.com/kokkos/kokkos/issues/468) +- Allow C++1Z Flag for Class Lambda capture [\#465](https://github.com/kokkos/kokkos/issues/465) +- Add Clang 4.0+ compilation of Cuda code [\#455](https://github.com/kokkos/kokkos/issues/455) +- Possible Issue with Intel 17.0.098 and GCC 6.1.0 in Develop Branch [\#445](https://github.com/kokkos/kokkos/issues/445) +- Add name of view to "View bounds error" [\#432](https://github.com/kokkos/kokkos/issues/432) +- Move Sort Binning Operators into Kokkos namespace [\#421](https://github.com/kokkos/kokkos/issues/421) +- TaskPolicy - generate error when attempt to use uninitialized [\#396](https://github.com/kokkos/kokkos/issues/396) +- Import 
WithoutInitializing and AllowPadding into Kokkos namespace [\#325](https://github.com/kokkos/kokkos/issues/325) +- TeamThreadRange requires begin, end to be the same type [\#305](https://github.com/kokkos/kokkos/issues/305) +- CudaUVMSpace should track \# allocations, due to CUDA limit on \# UVM allocations [\#300](https://github.com/kokkos/kokkos/issues/300) +- Remove old View and its infrastructure [\#259](https://github.com/kokkos/kokkos/issues/259) + +**Fixed bugs:** + +- Bug in TestCuda\_Other.cpp: most likely assembly inserted into Device code [\#515](https://github.com/kokkos/kokkos/issues/515) +- Cuda Compute Capability check of GPU is outdated [\#509](https://github.com/kokkos/kokkos/issues/509) +- multi\_scratch test with hwloc and pthreads seg-faults. [\#504](https://github.com/kokkos/kokkos/issues/504) +- generate\_makefile.bash: "make install" is broken [\#503](https://github.com/kokkos/kokkos/issues/503) +- make clean in Out of Source Build/Tests Does Not Work Correctly [\#502](https://github.com/kokkos/kokkos/issues/502) +- Makefiles for test and examples have issues in Cuda when CXX is not explicitly specified [\#497](https://github.com/kokkos/kokkos/issues/497) +- Dispatch lambda test directly inside GTEST macro doesn't work with nvcc [\#491](https://github.com/kokkos/kokkos/issues/491) +- UnitTests with HWLOC enabled fail if run with mpirun bound to a single core [\#489](https://github.com/kokkos/kokkos/issues/489) +- Failing Reducer Test on Mac with Pthreads [\#479](https://github.com/kokkos/kokkos/issues/479) +- make test Dumps Error with Clang Not Found [\#471](https://github.com/kokkos/kokkos/issues/471) +- OpenMP TeamPolicy member broadcast not using correct volatile shared variable [\#424](https://github.com/kokkos/kokkos/issues/424) +- TaskPolicy - generate error when attempt to use uninitialized [\#396](https://github.com/kokkos/kokkos/issues/396) +- New task policy implementation is pulling in old experimental code. 
[\#372](https://github.com/kokkos/kokkos/issues/372) +- MemoryPool unit test hangs on Power8 with GCC 6.1.0 [\#298](https://github.com/kokkos/kokkos/issues/298) + +## [2.01.10](https://github.com/kokkos/kokkos/tree/2.01.10) (2016-09-27) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.06...2.01.10) + +**Implemented enhancements:** + +- Enable Profiling by default in Tribits build [\#438](https://github.com/kokkos/kokkos/issues/438) +- parallel\_reduce\(0\), parallel\_scan\(0\) unit tests [\#436](https://github.com/kokkos/kokkos/issues/436) +- data\(\)==NULL after realloc with LayoutStride [\#351](https://github.com/kokkos/kokkos/issues/351) +- Fix tutorials to track new Kokkos::View [\#323](https://github.com/kokkos/kokkos/issues/323) +- Rename team policy set\_scratch\_size. [\#195](https://github.com/kokkos/kokkos/issues/195) + +**Fixed bugs:** + +- Possible Issue with Intel 17.0.098 and GCC 6.1.0 in Develop Branch [\#445](https://github.com/kokkos/kokkos/issues/445) +- Makefile spits syntax error [\#435](https://github.com/kokkos/kokkos/issues/435) +- Kokkos::sort fails for view with all the same values [\#422](https://github.com/kokkos/kokkos/issues/422) +- Generic Reducers: can't accept inline constructed reducer [\#404](https://github.com/kokkos/kokkos/issues/404) +- data\(\)==NULL after realloc with LayoutStride [\#351](https://github.com/kokkos/kokkos/issues/351) +- const subview of const view with compile time dimensions on Cuda backend [\#310](https://github.com/kokkos/kokkos/issues/310) +- Kokkos \(in Trilinos\) Causes Internal Compiler Error on CUDA 8.0.21-EA on POWER8 [\#307](https://github.com/kokkos/kokkos/issues/307) +- Core Oversubscription Detection Broken?
[\#159](https://github.com/kokkos/kokkos/issues/159) + + +## [2.01.06](https://github.com/kokkos/kokkos/tree/2.01.06) (2016-09-02) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.00...2.01.06) + +**Implemented enhancements:** + +- Add "standard" reducers for lambda-supportable customized reduce [\#411](https://github.com/kokkos/kokkos/issues/411) +- TaskPolicy - single thread back-end execution [\#390](https://github.com/kokkos/kokkos/issues/390) +- Kokkos master clone tag [\#387](https://github.com/kokkos/kokkos/issues/387) +- Query memory requirements from task policy [\#378](https://github.com/kokkos/kokkos/issues/378) +- Output order of test\_atomic.cpp is confusing [\#373](https://github.com/kokkos/kokkos/issues/373) +- Missing testing for atomics [\#341](https://github.com/kokkos/kokkos/issues/341) +- Feature request for Kokkos to provide Kokkos::atomic\_fetch\_max and atomic\_fetch\_min [\#336](https://github.com/kokkos/kokkos/issues/336) +- TaskPolicy\<Cuda\> performance requires teams mapped to warps [\#218](https://github.com/kokkos/kokkos/issues/218) + +**Fixed bugs:** + +- Reduce with Teams broken for custom initialize [\#407](https://github.com/kokkos/kokkos/issues/407) +- Failing Kokkos build on Debian [\#402](https://github.com/kokkos/kokkos/issues/402) +- Failing Tests on NVIDIA Pascal GPUs [\#398](https://github.com/kokkos/kokkos/issues/398) +- Algorithms: fill\_random assumes dimensions fit in unsigned int [\#389](https://github.com/kokkos/kokkos/issues/389) +- Kokkos::subview with RandomAccess Memory Trait [\#385](https://github.com/kokkos/kokkos/issues/385) +- Build warning \(signed / unsigned comparison\) in Cuda implementation [\#365](https://github.com/kokkos/kokkos/issues/365) +- wrong results for a parallel\_reduce with CUDA8 / Maxwell50 [\#352](https://github.com/kokkos/kokkos/issues/352) +- Hierarchical parallelism - 3 level unit test [\#344](https://github.com/kokkos/kokkos/issues/344) +- Can I allocate a View w/ both
WithoutInitializing & AllowPadding? [\#324](https://github.com/kokkos/kokkos/issues/324) +- subview View layout determination [\#309](https://github.com/kokkos/kokkos/issues/309) +- Unit tests with Cuda - Maxwell [\#196](https://github.com/kokkos/kokkos/issues/196) + +## [2.01.00](https://github.com/kokkos/kokkos/tree/2.01.00) (2016-07-21) +[Full Changelog](https://github.com/kokkos/kokkos/compare/End_C++98...2.01.00) + +**Implemented enhancements:** + +- Edit ViewMapping so assigning Views with the same custom layout compiles when const casting [\#327](https://github.com/kokkos/kokkos/issues/327) +- DynRankView: Performance improvement for operator\(\) [\#321](https://github.com/kokkos/kokkos/issues/321) +- Interoperability between static and dynamic rank views [\#295](https://github.com/kokkos/kokkos/issues/295) +- subview member function ? [\#280](https://github.com/kokkos/kokkos/issues/280) +- Inter-operatibility between View and DynRankView. [\#245](https://github.com/kokkos/kokkos/issues/245) +- \(Trilinos\) build warning in atomic\_assign, with Kokkos::complex [\#177](https://github.com/kokkos/kokkos/issues/177) +- View\<\>::shmem\_size should runtime check for number of arguments equal to rank [\#176](https://github.com/kokkos/kokkos/issues/176) +- Custom reduction join via lambda argument [\#99](https://github.com/kokkos/kokkos/issues/99) +- DynRankView with 0 dimensions passed in at construction [\#293](https://github.com/kokkos/kokkos/issues/293) +- Inject view\_alloc and friends into Kokkos namespace [\#292](https://github.com/kokkos/kokkos/issues/292) +- Less restrictive TeamPolicy reduction on Cuda [\#286](https://github.com/kokkos/kokkos/issues/286) +- deep\_copy using remap with source execution space [\#267](https://github.com/kokkos/kokkos/issues/267) +- Suggestion: Enable opt-in L1 caching via nvcc-wrapper [\#261](https://github.com/kokkos/kokkos/issues/261) +- More flexible create\_mirror functions 
[\#260](https://github.com/kokkos/kokkos/issues/260) +- Rename View::memory\_span to View::required\_allocation\_size [\#256](https://github.com/kokkos/kokkos/issues/256) +- Use of subviews and views with compile-time dimensions [\#237](https://github.com/kokkos/kokkos/issues/237) +- Use of subviews and views with compile-time dimensions [\#237](https://github.com/kokkos/kokkos/issues/237) +- Kokkos::Timer [\#234](https://github.com/kokkos/kokkos/issues/234) +- Fence CudaUVMSpace allocations [\#230](https://github.com/kokkos/kokkos/issues/230) +- View::operator\(\) accept std::is\_integral and std::is\_enum [\#227](https://github.com/kokkos/kokkos/issues/227) +- Allocating zero size View [\#216](https://github.com/kokkos/kokkos/issues/216) +- Thread scalable memory pool [\#212](https://github.com/kokkos/kokkos/issues/212) +- Add a way to disable memory leak output [\#194](https://github.com/kokkos/kokkos/issues/194) +- Kokkos exec space init should init Kokkos profiling [\#192](https://github.com/kokkos/kokkos/issues/192) +- Runtime rank wrapper for View [\#189](https://github.com/kokkos/kokkos/issues/189) +- Profiling Interface [\#158](https://github.com/kokkos/kokkos/issues/158) +- Fix View assignment \(of managed to unmanaged\) [\#153](https://github.com/kokkos/kokkos/issues/153) +- Add unit test for assignment of managed View to unmanaged View [\#152](https://github.com/kokkos/kokkos/issues/152) +- Check for oversubscription of threads with MPI in Kokkos::initialize [\#149](https://github.com/kokkos/kokkos/issues/149) +- Dynamic resizeable 1dimensional view [\#143](https://github.com/kokkos/kokkos/issues/143) +- Develop TaskPolicy for CUDA [\#142](https://github.com/kokkos/kokkos/issues/142) +- New View : Test Compilation Downstream [\#138](https://github.com/kokkos/kokkos/issues/138) +- New View Implementation [\#135](https://github.com/kokkos/kokkos/issues/135) +- Add variant of subview that lets users add traits 
[\#134](https://github.com/kokkos/kokkos/issues/134) +- NVCC-WRAPPER: Add --host-only flag [\#121](https://github.com/kokkos/kokkos/issues/121) +- Address gtest issue with TriBITS Kokkos build outside of Trilinos [\#117](https://github.com/kokkos/kokkos/issues/117) +- Make tests pass with -expt-extended-lambda on CUDA [\#108](https://github.com/kokkos/kokkos/issues/108) +- Dynamic scheduling for parallel\_for and parallel\_reduce [\#106](https://github.com/kokkos/kokkos/issues/106) +- Runtime or compile time error when reduce functor's join is not properly specified as const member function or with volatile arguments [\#105](https://github.com/kokkos/kokkos/issues/105) +- Error out when the number of threads is modified after kokkos is initialized [\#104](https://github.com/kokkos/kokkos/issues/104) +- Porting to POWER and remove assumption of X86 default [\#103](https://github.com/kokkos/kokkos/issues/103) +- Dynamic scheduling option for RangePolicy [\#100](https://github.com/kokkos/kokkos/issues/100) +- SharedMemory Support for Lambdas [\#81](https://github.com/kokkos/kokkos/issues/81) +- Recommended TeamSize for Lambdas [\#80](https://github.com/kokkos/kokkos/issues/80) +- Add Aggressive Vectorization Compilation mode [\#72](https://github.com/kokkos/kokkos/issues/72) +- Dynamic scheduling team execution policy [\#53](https://github.com/kokkos/kokkos/issues/53) +- UVM allocations in multi-GPU systems [\#50](https://github.com/kokkos/kokkos/issues/50) +- Synchronic in Kokkos::Impl [\#44](https://github.com/kokkos/kokkos/issues/44) +- index and dimension types in for loops [\#28](https://github.com/kokkos/kokkos/issues/28) +- Subview assign of 1D Strided with stride 1 to LayoutLeft/Right [\#1](https://github.com/kokkos/kokkos/issues/1) + +**Fixed bugs:** + +- misspelled variable name in Kokkos\_Atomic\_Fetch + missing unit tests [\#340](https://github.com/kokkos/kokkos/issues/340) +- seg fault Kokkos::Impl::CudaInternal::print\_configuration 
[\#338](https://github.com/kokkos/kokkos/issues/338) +- Clang compiler error with named parallel\_reduce, tags, and TeamPolicy. [\#335](https://github.com/kokkos/kokkos/issues/335) +- Shared Memory Allocation Error at parallel\_reduce [\#311](https://github.com/kokkos/kokkos/issues/311) +- DynRankView: Fix resize and realloc [\#303](https://github.com/kokkos/kokkos/issues/303) +- Scratch memory and dynamic scheduling [\#279](https://github.com/kokkos/kokkos/issues/279) +- MemoryPool infinite loop when out of memory [\#312](https://github.com/kokkos/kokkos/issues/312) +- Kokkos DynRankView changes break Sacado and Panzer [\#299](https://github.com/kokkos/kokkos/issues/299) +- MemoryPool fails to compile on non-cuda non-x86 [\#297](https://github.com/kokkos/kokkos/issues/297) +- Random Number Generator Fix [\#296](https://github.com/kokkos/kokkos/issues/296) +- View template parameter ordering Bug [\#282](https://github.com/kokkos/kokkos/issues/282) +- Serial task policy broken. [\#281](https://github.com/kokkos/kokkos/issues/281) +- deep\_copy with LayoutStride should not memcpy [\#262](https://github.com/kokkos/kokkos/issues/262) +- DualView::need\_sync should be a const method [\#248](https://github.com/kokkos/kokkos/issues/248) +- Arbitrary-sized atomics on GPUs broken; loop forever [\#238](https://github.com/kokkos/kokkos/issues/238) +- boolean reduction value\_type changes answer [\#225](https://github.com/kokkos/kokkos/issues/225) +- Custom init\(\) function for parallel\_reduce with array value\_type [\#210](https://github.com/kokkos/kokkos/issues/210) +- unit\_test Makefile is Broken - Recursively Calls itself until Machine Apocalypse. 
[\#202](https://github.com/kokkos/kokkos/issues/202) +- nvcc\_wrapper Does Not Support -Xcompiler \<compiler option\> [\#198](https://github.com/kokkos/kokkos/issues/198) +- Kokkos exec space init should init Kokkos profiling [\#192](https://github.com/kokkos/kokkos/issues/192) +- Kokkos Threads Backend impl\_shared\_alloc Broken on Intel 16.1 \(Shepard Haswell\) [\#186](https://github.com/kokkos/kokkos/issues/186) +- pthread back end hangs if used uninitialized [\#182](https://github.com/kokkos/kokkos/issues/182) +- parallel\_reduce of size 0, not calling init/join [\#175](https://github.com/kokkos/kokkos/issues/175) +- Bug in Threads with OpenMP enabled [\#173](https://github.com/kokkos/kokkos/issues/173) +- KokkosExp\_SharedAlloc, m\_team\_work\_index inaccessible [\#166](https://github.com/kokkos/kokkos/issues/166) +- 128-bit CAS without Assembly Broken? [\#161](https://github.com/kokkos/kokkos/issues/161) +- fatal error: Cuda/Kokkos\_Cuda\_abort.hpp: No such file or directory [\#157](https://github.com/kokkos/kokkos/issues/157) +- Power8: Fix OpenMP backend [\#139](https://github.com/kokkos/kokkos/issues/139) +- Data race in Kokkos OpenMP initialization [\#131](https://github.com/kokkos/kokkos/issues/131) +- parallel\_launch\_local\_memory and cuda 7.5 [\#125](https://github.com/kokkos/kokkos/issues/125) +- Resize can fail with Cuda due to asynchronous dispatch [\#119](https://github.com/kokkos/kokkos/issues/119) +- Qthread taskpolicy initialization bug.
[\#92](https://github.com/kokkos/kokkos/issues/92) +- Windows: sys/mman.h [\#89](https://github.com/kokkos/kokkos/issues/89) +- Windows: atomic\_fetch\_sub\(\) [\#88](https://github.com/kokkos/kokkos/issues/88) +- Windows: snprintf [\#87](https://github.com/kokkos/kokkos/issues/87) +- Parallel\_Reduce with TeamPolicy and league size of 0 returns garbage [\#85](https://github.com/kokkos/kokkos/issues/85) +- Throw with Cuda when using \(2D\) team\_policy parallel\_reduce with less than a warp size [\#76](https://github.com/kokkos/kokkos/issues/76) +- Scalar views don't work with Kokkos::Atomic memory trait [\#69](https://github.com/kokkos/kokkos/issues/69) +- Reduce the number of threads per team for Cuda [\#63](https://github.com/kokkos/kokkos/issues/63) +- Named Kernels fail for reductions with CUDA [\#60](https://github.com/kokkos/kokkos/issues/60) +- Kokkos View dimension\_\(\) for long returning unsigned int [\#20](https://github.com/kokkos/kokkos/issues/20) +- atomic test hangs with LLVM [\#6](https://github.com/kokkos/kokkos/issues/6) +- OpenMP Test should set omp\_set\_num\_threads to 1 [\#4](https://github.com/kokkos/kokkos/issues/4) + +**Closed issues:** + +- develop branch broken with CUDA 8 and --expt-extended-lambda [\#354](https://github.com/kokkos/kokkos/issues/354) +- --arch=KNL with Intel 2016 build failure [\#349](https://github.com/kokkos/kokkos/issues/349) +- Error building with Cuda when passing -DKOKKOS\_CUDA\_USE\_LAMBDA to generate\_makefile.bash [\#343](https://github.com/kokkos/kokkos/issues/343) +- Can I safely use int indices in a 2-D View with capacity \> 2B? 
[\#318](https://github.com/kokkos/kokkos/issues/318) +- Kokkos::ViewAllocateWithoutInitializing is not working [\#317](https://github.com/kokkos/kokkos/issues/317) +- Intel build on Mac OS X [\#277](https://github.com/kokkos/kokkos/issues/277) +- deleted [\#271](https://github.com/kokkos/kokkos/issues/271) +- Broken Mira build [\#268](https://github.com/kokkos/kokkos/issues/268) +- 32-bit build [\#246](https://github.com/kokkos/kokkos/issues/246) +- parallel\_reduce with RDC crashes linker [\#232](https://github.com/kokkos/kokkos/issues/232) +- build of Kokkos\_Sparse\_MV\_impl\_spmv\_Serial.cpp.o fails if you use nvcc and have cuda disabled [\#209](https://github.com/kokkos/kokkos/issues/209) +- Kokkos Serial execution space is not tested with TeamPolicy. [\#207](https://github.com/kokkos/kokkos/issues/207) +- Unit test failure on Hansen KokkosCore\_UnitTest\_Cuda\_MPI\_1 [\#200](https://github.com/kokkos/kokkos/issues/200) +- nvcc compiler warning: calling a \_\_host\_\_ function from a \_\_host\_\_ \_\_device\_\_ function is not allowed [\#180](https://github.com/kokkos/kokkos/issues/180) +- Intel 15 build error with defaulted "move" operators [\#171](https://github.com/kokkos/kokkos/issues/171) +- missing libkokkos.a during Trilinos 12.4.2 build, yet other libkokkos\*.a libs are there [\#165](https://github.com/kokkos/kokkos/issues/165) +- Tie atomic updates to execution space or even to thread team? 
\(speculation\) [\#144](https://github.com/kokkos/kokkos/issues/144) +- New View: Compiletime/size Test [\#137](https://github.com/kokkos/kokkos/issues/137) +- New View : Performance Test [\#136](https://github.com/kokkos/kokkos/issues/136) +- Signed/unsigned comparison warning in CUDA parallel [\#130](https://github.com/kokkos/kokkos/issues/130) +- Kokkos::complex: Need op\* w/ std::complex & real [\#126](https://github.com/kokkos/kokkos/issues/126) +- Use uintptr\_t for casting pointers [\#110](https://github.com/kokkos/kokkos/issues/110) +- Default thread mapping behavior between P and Q threads. [\#91](https://github.com/kokkos/kokkos/issues/91) +- Windows: Atomic\_Fetch\_Exchange\(\) return type [\#90](https://github.com/kokkos/kokkos/issues/90) +- Synchronic unit test is way too long [\#84](https://github.com/kokkos/kokkos/issues/84) +- nvcc\_wrapper -\> $\(NVCC\_WRAPPER\) [\#42](https://github.com/kokkos/kokkos/issues/42) +- Check compiler version and print helpful message [\#39](https://github.com/kokkos/kokkos/issues/39) +- Kokkos shared memory on Cuda uses a lot of registers [\#31](https://github.com/kokkos/kokkos/issues/31) +- Can not pass unit test `cuda.space` without a GT 720 [\#25](https://github.com/kokkos/kokkos/issues/25) +- Makefile.kokkos lacks bounds checking option that CMake has [\#24](https://github.com/kokkos/kokkos/issues/24) +- Kokkos can not complete unit tests with CUDA UVM enabled [\#23](https://github.com/kokkos/kokkos/issues/23) +- Simplify teams + shared memory histogram example to remove vectorization [\#21](https://github.com/kokkos/kokkos/issues/21) +- Kokkos needs to rever to ${PROJECT\_NAME}\_ENABLE\_CXX11 not Trilinos\_ENABLE\_CXX11 [\#17](https://github.com/kokkos/kokkos/issues/17) +- Kokkos Base Makefile adds AVX to KNC Build [\#16](https://github.com/kokkos/kokkos/issues/16) +- MS Visual Studio 2013 Build Errors [\#9](https://github.com/kokkos/kokkos/issues/9) +- subview\(X, ALL\(\), j\) for 2-D LayoutRight View X: should 
it view a column? [\#5](https://github.com/kokkos/kokkos/issues/5) + +## [End_C++98](https://github.com/kokkos/kokkos/tree/End_C++98) (2015-04-15) + + +\* *This Change Log was automatically generated by [github_changelog_generator](https://github.com/skywinder/Github-Changelog-Generator)* diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 1219352f73..2b2b9be6aa 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -34,8 +34,8 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS) # for compatibility with Kokkos' Makefile build system. TRIBITS_ADD_OPTION_AND_DEFINE( - ${PACKAGE_NAME}_ENABLE_DEBUG - ${PACKAGE_NAME_UC}_HAVE_DEBUG + Kokkos_ENABLE_DEBUG + KOKKOS_HAVE_DEBUG "Enable run-time debug checks. These checks may be expensive, so they are disabled by default in a release build." ${${PROJECT_NAME}_ENABLE_DEBUG} ) @@ -57,7 +57,21 @@ TRIBITS_ADD_OPTION_AND_DEFINE( TRIBITS_ADD_OPTION_AND_DEFINE( Kokkos_ENABLE_Cuda_UVM KOKKOS_USE_CUDA_UVM - "Enable CUDA Unified Virtual Memory support in Kokkos." + "Enable CUDA Unified Virtual Memory as the default in Kokkos." + OFF + ) + +TRIBITS_ADD_OPTION_AND_DEFINE( + Kokkos_ENABLE_Cuda_RDC + KOKKOS_HAVE_CUDA_RDC + "Enable CUDA Relocatable Device Code support in Kokkos." + OFF + ) + +TRIBITS_ADD_OPTION_AND_DEFINE( + Kokkos_ENABLE_Cuda_Lambda + KOKKOS_HAVE_CUDA_LAMBDA + "Enable CUDA LAMBDA support in Kokkos." OFF ) @@ -72,6 +86,9 @@ ASSERT_DEFINED(TPL_ENABLE_Pthread) IF (Kokkos_ENABLE_Pthread AND NOT TPL_ENABLE_Pthread) MESSAGE(FATAL_ERROR "You set Kokkos_ENABLE_Pthread=ON, but Trilinos' support for Pthread(s) is not enabled (TPL_ENABLE_Pthread=OFF). This is not allowed. 
Please enable Pthreads in Trilinos before attempting to enable Kokkos' support for Pthreads.") ENDIF () +IF (NOT TPL_ENABLE_Pthread) + ADD_DEFINITIONS(-DGTEST_HAS_PTHREAD=0) +ENDIF() TRIBITS_ADD_OPTION_AND_DEFINE( Kokkos_ENABLE_OpenMP @@ -162,13 +179,28 @@ TRIBITS_ADD_OPTION_AND_DEFINE( #------------------------------------------------------------------------------ # -# C) Process the subpackages for Kokkos +# C) Install Kokkos' executable scripts +# + + +# nvcc_wrapper is Kokkos' wrapper for NVIDIA's NVCC CUDA compiler. +# Kokkos needs nvcc_wrapper in order to build. Other libraries and +# executables also need nvcc_wrapper. Thus, we need to install it. +# If the argument of DESTINATION is a relative path, CMake computes it +# as relative to ${CMAKE_INSTALL_PREFIX}. + +INSTALL(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper DESTINATION bin) + + +#------------------------------------------------------------------------------ +# +# D) Process the subpackages for Kokkos # TRIBITS_PROCESS_SUBPACKAGES() # -# D) If Kokkos itself is enabled, process the Kokkos package +# E) If Kokkos itself is enabled, process the Kokkos package # TRIBITS_PACKAGE_DEF() diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 73a332ee11..da51b37b14 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -7,13 +7,13 @@ CXXFLAGS=$(CCFLAGS) #Options: OpenMP,Serial,Pthreads,Cuda KOKKOS_DEVICES ?= "OpenMP" #KOKKOS_DEVICES ?= "Pthreads" -#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv8,BGQ,Power7,Power8,KNL,BDW +#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,KNL,BDW,SKX KOKKOS_ARCH ?= "" #Options: yes,no KOKKOS_DEBUG ?= "no" #Options: hwloc,librt,experimental_memkind KOKKOS_USE_TPLS ?= "" -#Options: c++11 +#Options: c++11,c++1z KOKKOS_CXX_STANDARD ?= "c++11" #Options:
aggressive_vectorization,disable_profiling KOKKOS_OPTIONS ?= "" @@ -26,6 +26,7 @@ KOKKOS_CUDA_OPTIONS ?= "" KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l)) KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l)) +KOKKOS_INTERNAL_ENABLE_CXX1Z := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++1z" | wc -l)) # Check for external libraries KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l)) @@ -53,23 +54,65 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0) endif endif +# Check for other Execution Spaces + +KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l)) + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) + CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=) + KOKKOS_INTERNAL_COMPILER_NVCC_VERSION := $(shell nvcc --version 2>&1 | grep release | cut -d' ' -f5 | cut -d',' -f1 | tr -d .) 
+endif + +# Check OS + +KOKKOS_OS := $(shell uname -s) +KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname -s | grep CYGWIN | wc -l) +KOKKOS_INTERNAL_OS_LINUX := $(shell uname -s | grep Linux | wc -l) +KOKKOS_INTERNAL_OS_DARWIN := $(shell uname -s | grep Darwin | wc -l) + +# Check compiler + KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version 2>&1 | grep "Intel Corporation" | wc -l) KOKKOS_INTERNAL_COMPILER_PGI := $(shell $(CXX) --version 2>&1 | grep PGI | wc -l) KOKKOS_INTERNAL_COMPILER_XL := $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l) KOKKOS_INTERNAL_COMPILER_CRAY := $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l) -KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname | grep CYGWIN | wc -l) +KOKKOS_INTERNAL_COMPILER_NVCC := $(shell $(CXX) --version 2>&1 | grep "nvcc" | wc -l) +KOKKOS_INTERNAL_COMPILER_CLANG := $(shell $(CXX) --version 2>&1 | grep "clang" | wc -l) + +ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2) + KOKKOS_INTERNAL_COMPILER_CLANG = 1 +endif +ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 2) + KOKKOS_INTERNAL_COMPILER_XL = 1 +endif + +ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell clang --version | grep version | cut -d ' ' -f3 | tr -d '.') + ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_CLANG_VERSION) -lt 400; echo $$?),0) + $(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher) + endif + KOKKOS_INTERNAL_CUDA_USE_LAMBDA := 1 + endif +endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) KOKKOS_INTERNAL_OPENMP_FLAG := -mp else - ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) - KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp else - ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) - # OpenMP is turned on by default in Cray compiler environment - KOKKOS_INTERNAL_OPENMP_FLAG := + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp else - 
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + # OpenMP is turned on by default in Cray compiler environment + KOKKOS_INTERNAL_OPENMP_FLAG := + else + KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp + endif endif endif endif @@ -84,13 +127,11 @@ else KOKKOS_INTERNAL_CXX11_FLAG := -hstd=c++11 else KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11 + KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z endif endif endif -# Check for other Execution Spaces -KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l)) - # Check for Kokkos Architecture settings #Intel based @@ -98,6 +139,7 @@ KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l)) KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l)) KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_SKX := $(strip $(shell echo $(KOKKOS_ARCH) | grep SKX | wc -l)) KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l)) #NVIDIA based @@ -110,11 +152,13 @@ KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l)) KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l)) KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal61 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_PASCAL60 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal60 | wc -l)) KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \ + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \ + + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \ + 
$(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc)) @@ -127,13 +171,16 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \ + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \ + + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc)) endif #ARM based -KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv80 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv81 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8-ThunderX | wc -l)) #IBM based KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | grep BGQ | wc -l)) @@ -145,17 +192,18 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l)) #Any AVX? 
-KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc )) -KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc )) -KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc )) +KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc )) +KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc )) +KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc )) +KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) # Decide what ISA level we are able to support -KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc )) +KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc )) KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc )) #Incompatible flags? 
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)>1" | bc )) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)>1" | bc )) KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc)) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) @@ -207,15 +255,21 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1) + tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp ) tmp := $(shell echo "\#define KOKKOS_USE_ISA_X86_64" >> KokkosCore_config.tmp ) + tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp ) endif ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1) + tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp ) tmp := $(shell echo "\#define KOKKOS_USE_ISA_KNC" >> KokkosCore_config.tmp ) + tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp ) endif ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1) + tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp ) tmp := $(shell echo "\#define KOKKOS_USE_ISA_POWERPCLE" >> KokkosCore_config.tmp ) + tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp ) endif ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1) @@ -230,9 +284,15 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1) tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp ) endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG) + tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp ) + tmp := 
$(shell echo "\#define KOKKOS_HAVE_CXX1Z 1" >> KokkosCore_config.tmp ) +endif + ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1) -ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) - KOKKOS_CXXFLAGS += -G +ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + KOKKOS_CXXFLAGS += -lineinfo endif KOKKOS_CXXFLAGS += -g KOKKOS_LDFLAGS += -g -ldl @@ -273,13 +333,14 @@ endif tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp) +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1) tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp ) endif ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) tmp := $(shell echo "\#define KOKKOS_CUDA_USE_UVM 1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp ) + tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp ) endif ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1) @@ -289,27 +350,101 @@ ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1) endif ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1) - tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp ) - KOKKOS_CXXFLAGS += -expt-extended-lambda + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -gt 70; echo $$?),0) + tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp ) + KOKKOS_CXXFLAGS += -expt-extended-lambda + else + $(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.) 
+ endif + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp ) + endif +endif endif #Add Architecture flags -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp ) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1) + tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp ) ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) KOKKOS_CXXFLAGS += KOKKOS_LDFLAGS += - else - KOKKOS_CXXFLAGS += -mavx - KOKKOS_LDFLAGS += -mavx + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + KOKKOS_CXXFLAGS += -march=armv8-a + KOKKOS_LDFLAGS += -march=armv8-a + endif endif endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1) + tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV81 1" >> KokkosCore_config.tmp ) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + KOKKOS_CXXFLAGS += -march=armv8.1-a + KOKKOS_LDFLAGS += -march=armv8.1-a + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1) + tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp ) + tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV8_THUNDERX 1" >> KokkosCore_config.tmp ) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + KOKKOS_CXXFLAGS += -march=armv8-a -mtune=thunderx + KOKKOS_LDFLAGS += -march=armv8-a -mtune=thunderx + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) + tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp ) + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -mavx + KOKKOS_LDFLAGS += -mavx + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + + else 
+ ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += -tp=sandybridge + KOKKOS_LDFLAGS += -tp=sandybridge + else + # Assume that this is a really a GNU compiler + KOKKOS_CXXFLAGS += -mavx + KOKKOS_LDFLAGS += -mavx + endif + endif + endif +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp ) - KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 - KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + + else + # Assume that this is a really a GNU compiler or it could be XL on P8 + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + endif endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1) @@ -322,7 +457,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1) else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) - + KOKKOS_CXXFLAGS += -tp=haswell + KOKKOS_LDFLAGS += -tp=haswell else # Assume that this is a really a GNU compiler KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2 @@ -352,52 +488,85 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1) + tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512XEON 1" >> KokkosCore_config.tmp ) + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -xCORE-AVX512 + KOKKOS_LDFLAGS += -xCORE-AVX512 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + + else + # Nothing here yet + KOKKOS_CXXFLAGS += -march=skylake-avx512 + KOKKOS_LDFLAGS += -march=skylake-avx512 + endif + endif + endif +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_KNC 1" >> KokkosCore_config.tmp ) KOKKOS_CXXFLAGS += -mmic KOKKOS_LDFLAGS += -mmic endif +#Figure out the architecture flag for Cuda ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) +ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-arch +endif +ifeq 
($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-x cuda --cuda-gpu-arch +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp ) tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp ) - KOKKOS_CXXFLAGS += -arch=sm_30 + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp ) tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp ) - KOKKOS_CXXFLAGS += -arch=sm_32 + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp ) tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp ) - KOKKOS_CXXFLAGS += -arch=sm_35 + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp ) tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp ) - KOKKOS_CXXFLAGS += -arch=sm_37 + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp ) tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp ) - KOKKOS_CXXFLAGS += -arch=sm_50 + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp ) tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp ) - KOKKOS_CXXFLAGS += -arch=sm_52 + KOKKOS_CXXFLAGS += 
$(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp ) tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp ) - KOKKOS_CXXFLAGS += -arch=sm_53 + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp ) tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp ) - KOKKOS_CXXFLAGS += -arch=sm_61 + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) + tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp ) + tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL60 1" >> KokkosCore_config.tmp ) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60 endif endif @@ -424,6 +593,7 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp) ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp) + KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 KOKKOS_LIBS += -lcudart -lcuda endif @@ -443,7 +613,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp) - ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG) else KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG) @@ -451,6 +621,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG) endif +#Explicitly set the GCC Toolchain for Clang +ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_GCC_PATH = $(shell 
which g++) + KOKKOS_INTERNAL_GCC_TOOLCHAIN = $(KOKKOS_INTERNAL_GCC_PATH:/bin/g++=) + KOKKOS_CXXFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN) -DKOKKOS_CUDA_CLANG_WORKAROUND -DKOKKOS_CUDA_USE_LDG_INTRINSIC + KOKKOS_LDFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN) +endif + #With Cygwin functions such as fdopen and fileno are not defined #when strict ansi is enabled. strict ansi gets enabled with --std=c++11 #though. So we hard undefine it here. Not sure if that has any bad side effects @@ -471,7 +649,7 @@ KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) include $(KOKKOS_PATH)/Makefile.targets kokkos-clean: - -rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a + rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS) ar cr libkokkos.a $(KOKKOS_OBJ_LINK) diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index 86929ea0fe..a48a5f6eb7 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -14,20 +14,16 @@ Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc. 
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp -Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp -Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp -Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp +Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp -KokkosExp_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/KokkosExp_SharedAlloc.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/KokkosExp_SharedAlloc.cpp 
+Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp @@ -38,8 +34,6 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp -Kokkos_Cuda_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) @@ -47,8 +41,6 @@ Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp -Kokkos_Threads_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1) @@ -67,6 +59,4 @@ endif Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp -Kokkos_HBWAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWAllocators.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWAllocators.cpp diff --git a/lib/kokkos/README b/lib/kokkos/README index b094578af6..ffc1fe53b5 100644 --- a/lib/kokkos/README +++ b/lib/kokkos/README @@ -45,31 +45,32 @@ Primary tested compilers on X86 are: Intel 14.0.4 Intel 15.0.2 Intel 16.0.1 + Intel 17.0.098 Clang 3.5.2 Clang 3.6.1 + Clang 3.9.0 Primary tested compilers on Power 8 are: - IBM XL 13.1.3 (OpenMP,Serial) - GCC 4.9.2 (OpenMP,Serial) - GCC 5.3.0 (OpenMP,Serial) + GCC 5.4.0 (OpenMP,Serial) + IBM XL 13.1.3 (OpenMP, Serial) (There is a workaround in place to avoid a compiler bug) + +Primary tested compilers on Intel KNL are: + Intel 16.2.181 (with gcc 4.7.2) + Intel 17.0.098 (with gcc 4.7.2) Secondary tested compilers are: - CUDA 6.5 (with gcc 4.7.2) CUDA 7.0 (with gcc 4.7.2) - CUDA 7.5 (with gcc 4.8.4) + CUDA 7.5 (with gcc 4.7.2) + CUDA 8.0 (with gcc 5.3.0 on X86 and gcc 5.4.0 on Power8) + CUDA/Clang 8.0 using Clang/Trunk compiler Other compilers working: X86: - Intel 17.0.042 (the FENL example causes internal compiler error) PGI 15.4 Cygwin 2.1.0 64bit with gcc 4.9.3 - KNL: - Intel 16.2.181 (the FENL example causes internal compiler error) - Intel 17.0.042 (the FENL example causes internal compiler error) Known non-working combinations: Power8: - GCC 6.1.0 Pthreads backend @@ -92,9 +93,10 @@ master branch, without -Werror and only for a select set of backends. In the 'example/tutorial' directory you will find step by step tutorial examples which explain many of the features of Kokkos. They work with -simple Makefiles. To build with g++ and OpenMP simply type 'make openmp' +simple Makefiles. To build with g++ and OpenMP simply type 'make' in the 'example/tutorial' directory. This will build all examples in the -subfolders. +subfolders. 
To change the build options refer to the Programming Guide +in the compilation section. ============================================================================ ====Running Unit Tests====================================================== diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index d7c06dc14b..78cddeeaec 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -476,54 +476,54 @@ namespace Kokkos { }; template - struct rand > { + struct rand > { KOKKOS_INLINE_FUNCTION - static ::Kokkos::complex max () { - return ::Kokkos::complex (1.0, 1.0); + static Kokkos::complex max () { + return Kokkos::complex (1.0, 1.0); } KOKKOS_INLINE_FUNCTION - static ::Kokkos::complex draw (Generator& gen) { + static Kokkos::complex draw (Generator& gen) { const float re = gen.frand (); const float im = gen.frand (); - return ::Kokkos::complex (re, im); + return Kokkos::complex (re, im); } KOKKOS_INLINE_FUNCTION - static ::Kokkos::complex draw (Generator& gen, const ::Kokkos::complex& range) { + static Kokkos::complex draw (Generator& gen, const Kokkos::complex& range) { const float re = gen.frand (real (range)); const float im = gen.frand (imag (range)); - return ::Kokkos::complex (re, im); + return Kokkos::complex (re, im); } KOKKOS_INLINE_FUNCTION - static ::Kokkos::complex draw (Generator& gen, const ::Kokkos::complex& start, const ::Kokkos::complex& end) { + static Kokkos::complex draw (Generator& gen, const Kokkos::complex& start, const Kokkos::complex& end) { const float re = gen.frand (real (start), real (end)); const float im = gen.frand (imag (start), imag (end)); - return ::Kokkos::complex (re, im); + return Kokkos::complex (re, im); } }; template - struct rand > { + struct rand > { KOKKOS_INLINE_FUNCTION - static ::Kokkos::complex max () { - return ::Kokkos::complex (1.0, 1.0); + static Kokkos::complex max () { + return Kokkos::complex (1.0, 1.0); } 
KOKKOS_INLINE_FUNCTION - static ::Kokkos::complex draw (Generator& gen) { + static Kokkos::complex draw (Generator& gen) { const double re = gen.drand (); const double im = gen.drand (); - return ::Kokkos::complex (re, im); + return Kokkos::complex (re, im); } KOKKOS_INLINE_FUNCTION - static ::Kokkos::complex draw (Generator& gen, const ::Kokkos::complex& range) { + static Kokkos::complex draw (Generator& gen, const Kokkos::complex& range) { const double re = gen.drand (real (range)); const double im = gen.drand (imag (range)); - return ::Kokkos::complex (re, im); + return Kokkos::complex (re, im); } KOKKOS_INLINE_FUNCTION - static ::Kokkos::complex draw (Generator& gen, const ::Kokkos::complex& start, const ::Kokkos::complex& end) { + static Kokkos::complex draw (Generator& gen, const Kokkos::complex& start, const Kokkos::complex& end) { const double re = gen.drand (real (start), real (end)); const double im = gen.drand (imag (start), imag (end)); - return ::Kokkos::complex (re, im); + return Kokkos::complex (re, im); } }; @@ -670,8 +670,8 @@ namespace Kokkos { double S = 2.0; double U; while(S>=1.0) { - U = drand(); - const double V = drand(); + U = 2.0*drand() - 1.0; + const double V = 2.0*drand() - 1.0; S = U*U+V*V; } return U*sqrt(-2.0*log(S)/S); @@ -910,8 +910,8 @@ namespace Kokkos { double S = 2.0; double U; while(S>=1.0) { - U = drand(); - const double V = drand(); + U = 2.0*drand() - 1.0; + const double V = 2.0*drand() - 1.0; S = U*U+V*V; } return U*sqrt(-2.0*log(S)/S); @@ -1163,8 +1163,8 @@ namespace Kokkos { double S = 2.0; double U; while(S>=1.0) { - U = drand(); - const double V = drand(); + U = 2.0*drand() - 1.0; + const double V = 2.0*drand() - 1.0; S = U*U+V*V; } return U*sqrt(-2.0*log(S)/S); diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp index 6123ce978c..5b8c65fee1 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp @@ -51,7 +51,7 @@ namespace 
Kokkos { - namespace SortImpl { + namespace Impl { template struct CopyOp; @@ -199,7 +199,7 @@ public: parallel_for(values.dimension_0(), bin_sort_sort_functor >(values,sorted_values,sort_order)); + Impl::CopyOp >(values,sorted_values,sort_order)); deep_copy(values,sorted_values); } @@ -262,17 +262,15 @@ public: } }; -namespace SortImpl { - template -struct DefaultBinOp1D { +struct BinOp1D { const int max_bins_; const double mul_; typename KeyViewType::const_value_type range_; typename KeyViewType::const_value_type min_; //Construct BinOp with number of bins, minimum value and maxuimum value - DefaultBinOp1D(int max_bins__, typename KeyViewType::const_value_type min, + BinOp1D(int max_bins__, typename KeyViewType::const_value_type min, typename KeyViewType::const_value_type max ) :max_bins_(max_bins__+1),mul_(1.0*max_bins__/(max-min)),range_(max-min),min_(min) {} @@ -298,13 +296,13 @@ struct DefaultBinOp1D { }; template -struct DefaultBinOp3D { +struct BinOp3D { int max_bins_[3]; double mul_[3]; typename KeyViewType::non_const_value_type range_[3]; typename KeyViewType::non_const_value_type min_[3]; - DefaultBinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[], + BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[], typename KeyViewType::const_value_type max[] ) { max_bins_[0] = max_bins__[0]+1; @@ -348,109 +346,11 @@ struct DefaultBinOp3D { } }; -template -struct min_max { - Scalar min; - Scalar max; - bool init; - - KOKKOS_INLINE_FUNCTION - min_max() { - min = 0; - max = 0; - init = 0; - } - - KOKKOS_INLINE_FUNCTION - min_max (const min_max& val) { - min = val.min; - max = val.max; - init = val.init; - } - - KOKKOS_INLINE_FUNCTION - min_max operator = (const min_max& val) { - min = val.min; - max = val.max; - init = val.init; - return *this; - } - - KOKKOS_INLINE_FUNCTION - void operator+= (const Scalar& val) { - if(init) { - min = minval?max:val; - } else { - min = val; - max = val; - init = 1; - } - } - - 
KOKKOS_INLINE_FUNCTION - void operator+= (const min_max& val) { - if(init && val.init) { - min = minval.max?max:val.max; - } else { - if(val.init) { - min = val.min; - max = val.max; - init = 1; - } - } - } - - KOKKOS_INLINE_FUNCTION - void operator+= (volatile const Scalar& val) volatile { - if(init) { - min = minval?max:val; - } else { - min = val; - max = val; - init = 1; - } - } - - KOKKOS_INLINE_FUNCTION - void operator+= (volatile const min_max& val) volatile { - if(init && val.init) { - min = minval.max?max:val.max; - } else { - if(val.init) { - min = val.min; - max = val.max; - init = 1; - } - } - } -}; - - -template -struct min_max_functor { - typedef typename ViewType::execution_space execution_space; - ViewType view; - typedef min_max value_type; - min_max_functor (const ViewType view_):view(view_) { - } - - KOKKOS_INLINE_FUNCTION - void operator()(const size_t& i, value_type& val) const { - val += view(i); - } -}; +namespace Impl { template bool try_std_sort(ViewType view) { bool possible = true; -#if ! 
KOKKOS_USING_EXP_VIEW - size_t stride[8]; - view.stride(stride); -#else size_t stride[8] = { view.stride_0() , view.stride_1() , view.stride_2() @@ -460,8 +360,7 @@ bool try_std_sort(ViewType view) { , view.stride_6() , view.stride_7() }; -#endif - possible = possible && Impl::is_same::value; + possible = possible && std::is_same::value; possible = possible && (ViewType::Rank == 1); possible = possible && (stride[0] == 1); if(possible) { @@ -470,27 +369,39 @@ bool try_std_sort(ViewType view) { return possible; } +template +struct min_max_functor { + typedef Kokkos::Experimental::MinMaxScalar minmax_scalar; + + ViewType view; + min_max_functor(const ViewType& view_):view(view_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (const size_t& i, minmax_scalar& minmax) const { + if(view(i) < minmax.min_val) minmax.min_val = view(i); + if(view(i) > minmax.max_val) minmax.max_val = view(i); + } +}; + } template void sort(ViewType view, bool always_use_kokkos_sort = false) { if(!always_use_kokkos_sort) { - if(SortImpl::try_std_sort(view)) return; + if(Impl::try_std_sort(view)) return; } + typedef BinOp1D CompType; - typedef SortImpl::DefaultBinOp1D CompType; - SortImpl::min_max val; - parallel_reduce(view.dimension_0(),SortImpl::min_max_functor(view),val); - BinSort bin_sort(view,CompType(view.dimension_0()/2,val.min,val.max),true); + Kokkos::Experimental::MinMaxScalar result; + Kokkos::Experimental::MinMax reducer(result); + parallel_reduce(Kokkos::RangePolicy(0,view.dimension_0()), + Impl::min_max_functor(view),reducer); + if(result.min_val == result.max_val) return; + BinSort bin_sort(view,CompType(view.dimension_0()/2,result.min_val,result.max_val),true); bin_sort.create_permute_vector(); bin_sort.sort(view); } -/*template -void sort(ViewType view, Comparator comp, bool always_use_kokkos_sort = false) { - -}*/ - } #endif diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt index 654104b44e..fde6b967e0 100644 --- 
a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -1,6 +1,6 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) SET(SOURCES diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile index 5d79364c52..3027c6a94b 100644 --- a/lib/kokkos/algorithms/unit_tests/Makefile +++ b/lib/kokkos/algorithms/unit_tests/Makefile @@ -7,21 +7,18 @@ vpath %.cpp ${KOKKOS_PATH}/algorithms/unit_tests default: build_all echo "End Build" +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) + CXX = $(KOKKOS_PATH)/config/nvcc_wrapper +else + CXX = g++ +endif + +CXXFLAGS = -O3 +LINK ?= $(CXX) +LDFLAGS ?= -lpthread include $(KOKKOS_PATH)/Makefile.kokkos -ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) - CXX = $(NVCC_WRAPPER) - CXXFLAGS ?= -O3 - LINK = $(CXX) - LDFLAGS ?= -lpthread -else - CXX ?= g++ - CXXFLAGS ?= -O3 - LINK ?= $(CXX) - LDFLAGS ?= -lpthread -endif - KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests TEST_TARGETS = diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp index ccbcbdd001..03e4fb691e 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp @@ -131,6 +131,10 @@ void test_1D_sort(unsigned int n,bool force_kokkos) { typedef Kokkos::View KeyViewType; KeyViewType keys("Keys",n); + // Test sorting array with all numbers equal + Kokkos::deep_copy(keys,KeyType(1)); + Kokkos::sort(keys,force_kokkos); + Kokkos::Random_XorShift64_Pool g(1931); Kokkos::fill_random(keys,g,Kokkos::Random_XorShift64_Pool::generator_type::MAX_URAND); @@ -174,7 +178,7 @@ void test_3D_sort(unsigned int n) { typename KeyViewType::value_type min[3] = {0,0,0}; typename KeyViewType::value_type max[3] = 
{100,100,100}; - typedef Kokkos::SortImpl::DefaultBinOp3D< KeyViewType > BinOp; + typedef Kokkos::BinOp3D< KeyViewType > BinOp; BinOp bin_op(bin_max,min,max); Kokkos::BinSort< KeyViewType , BinOp > Sorter(keys,bin_op,false); diff --git a/lib/kokkos/benchmarks/bytes_and_flops/Makefile b/lib/kokkos/benchmarks/bytes_and_flops/Makefile new file mode 100644 index 0000000000..6a1917a523 --- /dev/null +++ b/lib/kokkos/benchmarks/bytes_and_flops/Makefile @@ -0,0 +1,43 @@ +KOKKOS_PATH = ${HOME}/kokkos +SRC = $(wildcard *.cpp) +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/config/nvcc_wrapper +EXE = bytes_and_flops.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +EXE = bytes_and_flops.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +CXXFLAGS = -O3 -g + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = + +OBJ = $(SRC:.cpp=.o) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) bench.hpp bench_unroll_stride.hpp bench_stride.hpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp b/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp new file mode 100644 index 0000000000..e3fe42a652 --- /dev/null +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp @@ -0,0 +1,99 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +template +struct Run { +static void run(int N, int K, int R, int F, int T, int S); +}; + +template +struct RunStride { +static void run_1(int N, int K, int R, int F, int T, int S); +static void run_2(int N, int K, int R, int F, int T, int S); +static void run_3(int N, int K, int R, int F, int T, int S); +static void run_4(int N, int K, int R, int F, int T, int S); +static void run_5(int N, int K, int R, int F, int T, int S); +static void run_6(int N, int K, int R, int F, int T, int S); +static void run_7(int N, int K, int R, int F, int T, int S); +static void run_8(int N, int K, int R, int F, int T, int S); +static void run(int N, int K, int R, int U, int F, int T, int S); +}; + +#define STRIDE 1 +#include +#undef STRIDE +#define STRIDE 2 +#include +#undef STRIDE +#define STRIDE 4 +#include +#undef STRIDE +#define STRIDE 8 +#include +#undef STRIDE +#define STRIDE 16 +#include +#undef STRIDE +#define STRIDE 32 +#include +#undef STRIDE + +template +void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S) { + if(D == 1) + RunStride::run(N,K,R,U,F,T,S); + if(D == 2) + RunStride::run(N,K,R,U,F,T,S); + if(D == 4) + RunStride::run(N,K,R,U,F,T,S); + if(D == 8) + RunStride::run(N,K,R,U,F,T,S); + if(D == 16) + RunStride::run(N,K,R,U,F,T,S); + if(D == 32) + RunStride::run(N,K,R,U,F,T,S); +} + diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp b/lib/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp new file mode 100644 index 0000000000..b60ec84994 --- /dev/null +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp @@ -0,0 +1,124 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#define UNROLL 1 +#include +#undef UNROLL +#define UNROLL 2 +#include +#undef UNROLL +#define UNROLL 3 +#include +#undef UNROLL +#define UNROLL 4 +#include +#undef UNROLL +#define UNROLL 5 +#include +#undef UNROLL +#define UNROLL 6 +#include +#undef UNROLL +#define UNROLL 7 +#include +#undef UNROLL +#define UNROLL 8 +#include +#undef UNROLL + +template +struct RunStride { +static void run_1(int N, int K, int R, int F, int T, int S) { + Run::run(N,K,R,F,T,S); +} +static void run_2(int N, int K, int R, int F, int T, int S) { + Run::run(N,K,R,F,T,S); +} +static void run_3(int N, int K, int R, int F, int T, int S) { + Run::run(N,K,R,F,T,S); +} +static void run_4(int N, int K, int R, int F, int T, int S) { + Run::run(N,K,R,F,T,S); +} +static void run_5(int N, int K, int R, int F, int T, int S) { + Run::run(N,K,R,F,T,S); +} +static void run_6(int N, int K, int R, int F, int T, int S) { + Run::run(N,K,R,F,T,S); +} +static void run_7(int N, int K, int R, int F, int T, int S) { + Run::run(N,K,R,F,T,S); +} +static void run_8(int N, int K, int R, int F, int T, int S) { + Run::run(N,K,R,F,T,S); +} + +static void run(int N, int K, int R, int U, int F, int T, int S) { + if(U==1) { + run_1(N,K,R,F,T,S); + } + if(U==2) { + run_2(N,K,R,F,T,S); + } + if(U==3) { + run_3(N,K,R,F,T,S); + } + if(U==4) { + run_4(N,K,R,F,T,S); + } + if(U==5) { + run_5(N,K,R,F,T,S); + } + if(U==6) { + run_6(N,K,R,F,T,S); + } + if(U==7) { + run_7(N,K,R,F,T,S); + } + if(U==8) { + run_8(N,K,R,F,T,S); + } +} +}; + diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp new file mode 100644 index 0000000000..0992c5b54b --- /dev/null +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp @@ -0,0 +1,148 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +template +struct Run { +static void run(int N, int K, int R, int F, int T, int S) { + Kokkos::View A("A",N,K); + Kokkos::View B("B",N,K); + Kokkos::View C("C",N,K); + + Kokkos::deep_copy(A,Scalar(1.5)); + Kokkos::deep_copy(B,Scalar(2.5)); + Kokkos::deep_copy(C,Scalar(3.5)); + + Kokkos::Timer timer; + Kokkos::parallel_for("BenchmarkKernel",Kokkos::TeamPolicy<>(N,T).set_scratch_size(0,Kokkos::PerTeam(S)), + KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type& team) { + const int n = team.league_rank(); + for(int r=0; r1) + Scalar a2 = a1*1.3; +#endif +#if(UNROLL>2) + Scalar a3 = a2*1.1; +#endif +#if(UNROLL>3) + Scalar a4 = a3*1.1; +#endif +#if(UNROLL>4) + Scalar a5 = a4*1.3; +#endif +#if(UNROLL>5) + Scalar a6 = a5*1.1; +#endif +#if(UNROLL>6) + Scalar a7 = a6*1.1; +#endif +#if(UNROLL>7) + Scalar a8 = a7*1.1; +#endif + + + for(int f = 0; f1) + a2 += b*a2; +#endif +#if(UNROLL>2) + a3 += b*a3; +#endif +#if(UNROLL>3) + a4 += b*a4; +#endif +#if(UNROLL>4) + a5 += b*a5; +#endif +#if(UNROLL>5) + a6 += b*a6; +#endif +#if(UNROLL>6) + a7 += b*a7; +#endif +#if(UNROLL>7) + a8 += b*a8; +#endif + + + } +#if(UNROLL==1) + C(n,i,0) = a1; +#endif +#if(UNROLL==2) + C(n,i,0) = a1+a2; +#endif +#if(UNROLL==3) + C(n,i,0) = a1+a2+a3; +#endif +#if(UNROLL==4) + C(n,i,0) = a1+a2+a3+a4; +#endif +#if(UNROLL==5) + C(n,i,0) = a1+a2+a3+a4+a5; +#endif +#if(UNROLL==6) + C(n,i,0) = a1+a2+a3+a4+a5+a6; +#endif +#if(UNROLL==7) + C(n,i,0) = a1+a2+a3+a4+a5+a6+a7; +#endif +#if(UNROLL==8) + C(n,i,0) = a1+a2+a3+a4+a5+a6+a7+a8; +#endif + + }); + } + }); + Kokkos::fence(); + double seconds = timer.seconds(); + + double bytes = 1.0*N*K*R*3*sizeof(Scalar); + double flops = 1.0*N*K*R*(F*2*UNROLL + 2*(UNROLL-1)); + printf("NKRUFTS: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: %lf\n",N,K,R,UNROLL,F,T,S,seconds,1.0*bytes/seconds/1024/1024/1024,1.e-9*flops/seconds); 
+} +}; + diff --git a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp new file mode 100644 index 0000000000..f545247212 --- /dev/null +++ b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp @@ -0,0 +1,96 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include + +int main(int argc, char* argv[]) { + Kokkos::initialize(); + + + if(argc<10) { + printf("Arguments: N K R D U F T S\n"); + printf(" P: Precision (1==float, 2==double)\n"); + printf(" N,K: dimensions of the 2D array to allocate\n"); + printf(" R: how often to loop through the K dimension with each team\n"); + printf(" D: distance between loaded elements (stride)\n"); + printf(" U: how many independent flops to do per load\n"); + printf(" F: how many times to repeat the U unrolled operations before reading next element\n"); + printf(" T: team size\n"); + printf(" S: shared memory per team (used to control occupancy on GPUs)\n"); + printf("Example Input GPU:\n"); + printf(" Bandwidth Bound : 2 100000 1024 1 1 1 1 256 6000\n"); + printf(" Cache Bound : 2 100000 1024 64 1 1 1 512 20000\n"); + printf(" Compute Bound : 2 100000 1024 1 1 8 64 256 6000\n"); + printf(" Load Slots Used : 2 20000 256 32 16 1 1 256 6000\n"); + printf(" Inefficient Load: 2 20000 256 32 2 1 1 256 20000\n"); + Kokkos::finalize(); + return 0; + } + + + int P = atoi(argv[1]); + int N = atoi(argv[2]); + int K = atoi(argv[3]); + int R = atoi(argv[4]); + int D = atoi(argv[5]); + int U = atoi(argv[6]); + int F = atoi(argv[7]); + int T = atoi(argv[8]); + int S = atoi(argv[9]); 
+ + if(U>8) {printf("U must be 1-8\n"); return 0;} + if( (D!=1) && (D!=2) && (D!=4) && (D!=8) && (D!=16) && (D!=32)) {printf("D must be one of 1,2,4,8,16,32\n"); return 0;} + if( (P!=1) && (P!=2) ) {printf("P must be one of 1,2\n"); return 0;} + + if(P==1) { + run_stride_unroll(N,K,R,D,U,F,T,S); + } + if(P==2) { + run_stride_unroll(N,K,R,D,U,F,T,S); + } + + Kokkos::finalize(); +} + diff --git a/lib/kokkos/benchmarks/gather/Makefile b/lib/kokkos/benchmarks/gather/Makefile new file mode 100644 index 0000000000..fd1feab6fa --- /dev/null +++ b/lib/kokkos/benchmarks/gather/Makefile @@ -0,0 +1,44 @@ +KOKKOS_PATH = ${HOME}/kokkos +SRC = $(wildcard *.cpp) +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/config/nvcc_wrapper +EXE = gather.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +EXE = gather.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +CXXFLAGS = -O3 -g + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = + +OBJ = $(SRC:.cpp=.o) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +$(warning ${KOKKOS_CPPFLAGS}) +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) gather_unroll.hpp gather.hpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< diff --git a/lib/kokkos/benchmarks/gather/gather.hpp b/lib/kokkos/benchmarks/gather/gather.hpp new file mode 100644 index 0000000000..406bd28983 --- /dev/null +++ b/lib/kokkos/benchmarks/gather/gather.hpp @@ -0,0 +1,92 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +template +struct RunGather { + static void run(int N, int K, int D, int R, int F); +}; + +#define UNROLL 1 +#include +#undef UNROLL +#define UNROLL 2 +#include +#undef UNROLL +#define UNROLL 3 +#include +#undef UNROLL +#define UNROLL 4 +#include +#undef UNROLL +#define UNROLL 5 +#include +#undef UNROLL +#define UNROLL 6 +#include +#undef UNROLL +#define UNROLL 7 +#include +#undef UNROLL +#define UNROLL 8 +#include +#undef UNROLL + +template +void run_gather_test(int N, int K, int D, int R, int U, int F) { + if(U == 1) + RunGather::run(N,K,D,R,F); + if(U == 2) + RunGather::run(N,K,D,R,F); + if(U == 3) + RunGather::run(N,K,D,R,F); + if(U == 4) + RunGather::run(N,K,D,R,F); + if(U == 5) + RunGather::run(N,K,D,R,F); + if(U == 6) + RunGather::run(N,K,D,R,F); + if(U == 7) + RunGather::run(N,K,D,R,F); + if(U == 8) + RunGather::run(N,K,D,R,F); +} diff --git a/lib/kokkos/benchmarks/gather/gather_unroll.hpp b/lib/kokkos/benchmarks/gather/gather_unroll.hpp new file mode 100644 index 0000000000..1d01b26ca7 --- /dev/null +++ b/lib/kokkos/benchmarks/gather/gather_unroll.hpp @@ -0,0 +1,169 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +template +struct RunGather { +static void run(int N, int K, int D, int R, int F) { + Kokkos::View connectivity("Connectivity",N,K); + Kokkos::View A_in("Input",N); + Kokkos::View B_in("Input",N); + Kokkos::View C("Output",N); + + Kokkos::Random_XorShift64_Pool<> rand_pool(12313); + + Kokkos::deep_copy(A_in,1.5); + Kokkos::deep_copy(B_in,2.0); + + Kokkos::View > A(A_in); + Kokkos::View > B(B_in); + + Kokkos::parallel_for("InitKernel",N, + KOKKOS_LAMBDA (const int& i) { + auto rand_gen = rand_pool.get_state(); + for( int jj=0; jj1) + Scalar a2 = a1*Scalar(1.3); +#endif +#if(UNROLL>2) + Scalar a3 = a2*Scalar(1.1); +#endif +#if(UNROLL>3) + Scalar a4 = a3*Scalar(1.1); +#endif +#if(UNROLL>4) + Scalar a5 = a4*Scalar(1.3); +#endif +#if(UNROLL>5) + Scalar a6 = a5*Scalar(1.1); +#endif +#if(UNROLL>6) + Scalar a7 = a6*Scalar(1.1); +#endif +#if(UNROLL>7) + Scalar a8 = a7*Scalar(1.1); +#endif + + + for(int f = 0; f1) + a2 += b*a2; +#endif +#if(UNROLL>2) + a3 += b*a3; +#endif +#if(UNROLL>3) + a4 += b*a4; +#endif +#if(UNROLL>4) + a5 += b*a5; +#endif +#if(UNROLL>5) + a6 += b*a6; +#endif +#if(UNROLL>6) + a7 += b*a7; +#endif +#if(UNROLL>7) + a8 += b*a8; +#endif + + + } +#if(UNROLL==1) + c += a1; +#endif +#if(UNROLL==2) + c += a1+a2; +#endif +#if(UNROLL==3) + c += a1+a2+a3; +#endif +#if(UNROLL==4) + c += a1+a2+a3+a4; +#endif +#if(UNROLL==5) + c += a1+a2+a3+a4+a5; +#endif +#if(UNROLL==6) + c += a1+a2+a3+a4+a5+a6; +#endif +#if(UNROLL==7) + c += a1+a2+a3+a4+a5+a6+a7; +#endif +#if(UNROLL==8) + c += a1+a2+a3+a4+a5+a6+a7+a8; +#endif + + } + C(i) = c ; + }); + Kokkos::fence(); + } + double seconds = timer.seconds(); + + double bytes = 1.0*N*K*R*(2*sizeof(Scalar)+sizeof(int)) + 1.0*N*R*sizeof(Scalar); + double flops = 1.0*N*K*R*(F*2*UNROLL + 2*(UNROLL-1)); + double gather_ops = 1.0*N*K*R*2; + printf("SNKDRUF: %i %i %i %i %i %i %i Time: 
%lfs Bandwidth: %lfGiB/s GFlop/s: %lf GGather/s: %lf\n",sizeof(Scalar)/4,N,K,D,R,UNROLL,F,seconds,1.0*bytes/seconds/1024/1024/1024,1.e-9*flops/seconds,1.e-9*gather_ops/seconds); +} +}; diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWAllocators.cpp b/lib/kokkos/benchmarks/gather/main.cpp similarity index 54% rename from lib/kokkos/core/src/impl/Kokkos_HBWAllocators.cpp rename to lib/kokkos/benchmarks/gather/main.cpp index 4eb80d03f1..161c6f2091 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HBWAllocators.cpp +++ b/lib/kokkos/benchmarks/gather/main.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,73 +36,58 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ -#include +#include +#include +#include -#include -#include +int main(int argc, char* argv[]) { + Kokkos::initialize(argc,argv); -#include // uintptr_t -#include // for malloc, realloc, and free -#include // for memcpy - -#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE) -#include // for mmap, munmap, MAP_ANON, etc -#include // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES -#endif - -#include -#include - -#ifdef KOKKOS_HAVE_HBWSPACE -#include - -namespace Kokkos { -namespace Experimental { -namespace Impl { -#define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB) -/*--------------------------------------------------------------------------*/ - -void* HBWMallocAllocator::allocate( size_t size ) -{ - std::cout<< "Allocate HBW: " << 1.0e-6*size << "MB" << std::endl; - void * ptr = NULL; - if (size) { - ptr = memkind_malloc(MEMKIND_TYPE,size); - - if (!ptr) - { - std::ostringstream msg ; - msg << name() << ": allocate(" << size << ") FAILED"; - Kokkos::Impl::throw_runtime_exception( msg.str() ); - } + if(argc<8) { + printf("Arguments: S N K D\n"); + printf(" S: Scalar Type Size (1==float, 2==double, 4=complex)\n"); + printf(" N: Number of entities\n"); + printf(" K: Number of things to gather per entity\n"); + printf(" D: Max distance of gathered things of an entity\n"); + printf(" R: how often to loop through the K dimension with each team\n"); + printf(" U: how many independent flops to do per load\n"); + printf(" F: how many times to repeat the U unrolled operations before reading next element\n"); + printf("Example Input GPU:\n"); + printf(" Bandwidth Bound : 2 10000000 1 1 10 1 1\n"); + printf(" Cache Bound : 2 10000000 64 1 10 1 1\n"); + printf(" Cache Gather : 2 10000000 64 256 10 1 1\n"); + printf(" Global Gather : 2 100000000 16 100000000 1 1 1\n"); + printf(" Typical MD : 2 100000 32 512 1000 8 2\n"); + Kokkos::finalize(); + 
return 0; } - return ptr; + + + int S = atoi(argv[1]); + int N = atoi(argv[2]); + int K = atoi(argv[3]); + int D = atoi(argv[4]); + int R = atoi(argv[5]); + int U = atoi(argv[6]); + int F = atoi(argv[7]); + + if( (S!=1) && (S!=2) && (S!=4)) {printf("S must be one of 1,2,4\n"); return 0;} + if( N(N,K,D,R,U,F); + } + if(S==2) { + run_gather_test(N,K,D,R,U,F); + } + if(S==4) { + run_gather_test >(N,K,D,R,U,F); + } + Kokkos::finalize(); } -void HBWMallocAllocator::deallocate( void * ptr, size_t /*size*/ ) -{ - if (ptr) { - memkind_free(MEMKIND_TYPE,ptr); - } -} - -void * HBWMallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size) -{ - void * ptr = memkind_realloc(MEMKIND_TYPE, old_ptr, new_size); - - if (new_size > 0u && ptr == NULL) { - Kokkos::Impl::throw_runtime_exception("Error: Malloc Allocator could not reallocate memory"); - } - return ptr; -} - -} // namespace Impl -} // namespace Experimental -} // namespace Kokkos -#endif diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper new file mode 100755 index 0000000000..cb206cf88b --- /dev/null +++ b/lib/kokkos/bin/nvcc_wrapper @@ -0,0 +1,284 @@ +#!/bin/bash +# +# This shell script (nvcc_wrapper) wraps both the host compiler and +# NVCC, if you are building legacy C or C++ code with CUDA enabled. +# The script remedies some differences between the interface of NVCC +# and that of the host compiler, in particular for linking. +# It also means that a legacy code doesn't need separate .cu files; +# it can just use .cpp files. +# +# Default settings: change those according to your machine. For +# example, you may have two different wrappers with either icpc +# or g++ as their back-end compiler. The defaults can be overwritten +# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc). + +default_arch="sm_35" +#default_arch="sm_50" + +# +# The default C++ compiler.
+# +host_compiler=${NVCC_WRAPPER_DEFAULT_COMPILER:-"g++"} +#host_compiler="icpc" +#host_compiler="/usr/local/gcc/4.8.3/bin/g++" +#host_compiler="/usr/local/gcc/4.9.1/bin/g++" + +# +# Internal variables +# + +# C++ files +cpp_files="" + +# Host compiler arguments +xcompiler_args="" + +# Cuda (NVCC) only arguments +cuda_args="" + +# Arguments for both NVCC and Host compiler +shared_args="" + +# Linker arguments +xlinker_args="" + +# Object files passable to NVCC +object_files="" + +# Link objects for the host linker only +object_files_xlinker="" + +# Shared libraries with version numbers are not handled correctly by NVCC +shared_versioned_libraries_host="" +shared_versioned_libraries="" + +# Does the User set the architecture +arch_set=0 + +# Does the user overwrite the host compiler +ccbin_set=0 + +#Error code of compilation +error_code=0 + +# Do a dry run without actually compiling +dry_run=0 + +# Skip NVCC compilation and use host compiler directly +host_only=0 + +# Enable workaround for CUDA 6.5 for pragma ident +replace_pragma_ident=0 + +# Mark first host compiler argument +first_xcompiler_arg=1 + +temp_dir=${TMPDIR:-/tmp} + +# Check if we have an optimization argument already +optimization_applied=0 + +#echo "Arguments: $# $@" + +while [ $# -gt 0 ] +do + case $1 in + #show the executed command + --show|--nvcc-wrapper-show) + dry_run=1 + ;; + #run host compilation only + --host-only) + host_only=1 + ;; + #replace '#pragma ident' with '#ident' this is needed to compile OpenMPI due to a configure script bug and a non standardized behaviour of pragma with macros + --replace-pragma-ident) + replace_pragma_ident=1 + ;; + #handle source files to be compiled as cuda files + *.cpp|*.cxx|*.cc|*.C|*.c++|*.cu) + cpp_files="$cpp_files $1" + ;; + # Ensure we only have one optimization flag because NVCC doesn't allow multiple + -O*) + if [ $optimization_applied -eq 1 ]; then + echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the first is used
because nvcc can only accept a single optimization setting." + else + shared_args="$shared_args $1" + optimization_applied=1 + fi + ;; + #Handle shared args (valid for both nvcc and the host compiler) + -D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared) + shared_args="$shared_args $1" + ;; + #Handle shared args that have an argument + -o|-MT) + shared_args="$shared_args $1 $2" + shift + ;; + #Handle known nvcc args + -gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*) + cuda_args="$cuda_args $1" + ;; + #Handle more known nvcc args + --expt-extended-lambda|--expt-relaxed-constexpr) + cuda_args="$cuda_args $1" + ;; + #Handle known nvcc args that have an argument + -rdc|-maxrregcount|--default-stream) + cuda_args="$cuda_args $1 $2" + shift + ;; + #Handle c++11 setting + --std=c++11|-std=c++11) + shared_args="$shared_args $1" + ;; + #strip off -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98 + -std=c++98|--std=c++98) + ;; + #strip off pedantic because it produces endless warnings about #LINE added by the preprocessor + -pedantic|-Wpedantic|-ansi) + ;; + #strip -Xcompiler because we add it + -Xcompiler) + if [ $first_xcompiler_arg -eq 1 ]; then + xcompiler_args="$2" + first_xcompiler_arg=0 + else + xcompiler_args="$xcompiler_args,$2" + fi + shift + ;; + #strip off "-x cu" because we add that + -x) + if [[ $2 != "cu" ]]; then + if [ $first_xcompiler_arg -eq 1 ]; then + xcompiler_args="-x,$2" + first_xcompiler_arg=0 + else + xcompiler_args="$xcompiler_args,-x,$2" + fi + fi + shift + ;; + #Handle -ccbin (if it is not set we can set it to a default value) + -ccbin) + cuda_args="$cuda_args $1 $2" + ccbin_set=1 + host_compiler=$2 + shift + ;; + #Handle -arch argument (if it is not set use a default) + -arch*) + cuda_args="$cuda_args $1" + arch_set=1 + ;; + #Handle -Xcudafe argument + -Xcudafe) + cuda_args="$cuda_args -Xcudafe $2" + shift + ;; + #Handle args
that should be sent to the linker + -Wl*) + xlinker_args="$xlinker_args -Xlinker ${1:4:${#1}}" + host_linker_args="$host_linker_args ${1:4:${#1}}" + ;; + #Handle object files: -x cu applies to all input files, so give them to linker, except if only linking + *.a|*.so|*.o|*.obj) + object_files="$object_files $1" + object_files_xlinker="$object_files_xlinker -Xlinker $1" + ;; + #Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking + *.dylib) + object_files="$object_files -Xlinker $1" + object_files_xlinker="$object_files_xlinker -Xlinker $1" + ;; + #Handle shared libraries with *.so.* names which nvcc can't do. + *.so.*) + shared_versioned_libraries_host="$shared_versioned_libraries_host $1" + shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1" + ;; + #All other args are sent to the host compiler + *) + if [ $first_xcompiler_arg -eq 1 ]; then + xcompiler_args=$1 + first_xcompiler_arg=0 + else + xcompiler_args="$xcompiler_args,$1" + fi + ;; + esac + + shift +done + +#Add default host compiler if necessary +if [ $ccbin_set -ne 1 ]; then + cuda_args="$cuda_args -ccbin $host_compiler" +fi + +#Add architecture command +if [ $arch_set -ne 1 ]; then + cuda_args="$cuda_args -arch=$default_arch" +fi + +#Compose compilation command +nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries" +if [ $first_xcompiler_arg -eq 0 ]; then + nvcc_command="$nvcc_command -Xcompiler $xcompiler_args" +fi + +#Compose host only command +host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host" + +#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING' +if [ $replace_pragma_ident -eq 1 ]; then + cpp_files2="" + for file in $cpp_files + do + var=`grep pragma ${file} | grep ident | grep "#"` + if [ "${#var}" -gt 0 ] + then + sed 's/#[\ \t]*pragma[\ \t]*ident/#ident/g' 
$file > $temp_dir/nvcc_wrapper_tmp_$file + cpp_files2="$cpp_files2 $temp_dir/nvcc_wrapper_tmp_$file" + else + cpp_files2="$cpp_files2 $file" + fi + done + cpp_files=$cpp_files2 + #echo $cpp_files +fi + +if [ "$cpp_files" ]; then + nvcc_command="$nvcc_command $object_files_xlinker -x cu $cpp_files" +else + nvcc_command="$nvcc_command $object_files" +fi + +if [ "$cpp_files" ]; then + host_command="$host_command $object_files $cpp_files" +else + host_command="$host_command $object_files" +fi + +#Print command for dryrun +if [ $dry_run -eq 1 ]; then + if [ $host_only -eq 1 ]; then + echo $host_command + else + echo $nvcc_command + fi + exit 0 +fi + +#Run compilation command +if [ $host_only -eq 1 ]; then + $host_command +else + $nvcc_command +fi +error_code=$? + +#Report error code +exit $error_code diff --git a/lib/kokkos/cmake/deps/CUSPARSE.cmake b/lib/kokkos/cmake/deps/CUSPARSE.cmake index 205f5e2a98..6f26d857c0 100644 --- a/lib/kokkos/cmake/deps/CUSPARSE.cmake +++ b/lib/kokkos/cmake/deps/CUSPARSE.cmake @@ -53,12 +53,12 @@ # ************************************************************************ # @HEADER -include(${TRIBITS_DEPS_DIR}/CUDA.cmake) +#include(${TRIBITS_DEPS_DIR}/CUDA.cmake) -IF (TPL_ENABLE_CUDA) - GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) - GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) - TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) -ENDIF() +#IF (TPL_ENABLE_CUDA) +# GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) +# GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) +# GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) +# TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) +#ENDIF() diff --git a/lib/kokkos/cmake/tribits.cmake b/lib/kokkos/cmake/tribits.cmake index 34cd216f81..879d801720 100644 --- a/lib/kokkos/cmake/tribits.cmake +++ b/lib/kokkos/cmake/tribits.cmake @@ -1,6 +1,16 @@ INCLUDE(CMakeParseArguments) INCLUDE(CTest) +cmake_policy(SET CMP0054 NEW) + +IF(NOT DEFINED 
${PROJECT_NAME}) + project(Kokkos) +ENDIF() + +IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG) + SET(${PROJECT_NAME}_ENABLE_DEBUG OFF) +ENDIF() + FUNCTION(ASSERT_DEFINED VARS) FOREACH(VAR ${VARS}) IF(NOT DEFINED ${VAR}) @@ -75,6 +85,13 @@ MACRO(TRIBITS_ADD_EXAMPLE_DIRECTORIES) ENDMACRO() + +function(INCLUDE_DIRECTORIES) + cmake_parse_arguments(INCLUDE_DIRECTORIES "REQUIRED_DURING_INSTALLATION_TESTING" "" "" ${ARGN}) + _INCLUDE_DIRECTORIES(${INCLUDE_DIRECTORIES_UNPARSED_ARGUMENTS}) +endfunction() + + MACRO(TARGET_TRANSFER_PROPERTY TARGET_NAME PROP_IN PROP_OUT) SET(PROP_VALUES) FOREACH(TARGET_X ${ARGN}) @@ -271,6 +288,11 @@ ENDFUNCTION() ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR}) +FUNCTION(TRIBITS_ADD_TEST) +ENDFUNCTION() +FUNCTION(TRIBITS_TPL_TENTATIVELY_ENABLE) +ENDFUNCTION() + FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME) SET(options STANDARD_PASS_OUTPUT WILL_FAIL) diff --git a/lib/kokkos/config/configure_compton_cpu.sh b/lib/kokkos/config/configure_compton_cpu.sh old mode 100755 new mode 100644 diff --git a/lib/kokkos/config/configure_compton_mic.sh b/lib/kokkos/config/configure_compton_mic.sh old mode 100755 new mode 100644 diff --git a/lib/kokkos/config/configure_kokkos.sh b/lib/kokkos/config/configure_kokkos.sh old mode 100755 new mode 100644 diff --git a/lib/kokkos/config/configure_kokkos_nvidia.sh b/lib/kokkos/config/configure_kokkos_nvidia.sh old mode 100755 new mode 100644 diff --git a/lib/kokkos/config/configure_shannon.sh b/lib/kokkos/config/configure_shannon.sh old mode 100755 new mode 100644 diff --git a/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt b/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt index 9f56f2fd48..961e4186ec 100644 --- a/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt +++ b/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt @@ -91,9 +91,20 @@ Step 3: // -------------------------------------------------------------------------------- // -Step
4: - 4.1. Once all Trilinos tests pass promote Kokkos develop branch to master on Github +Step 4: Once all Trilinos tests pass promote Kokkos develop branch to master on Github + 4.1. Generate Changelog (You need a github API token) + + Close all Open issues with "InDevelop" tag on github + + (Not from kokkos directory) + github_changelog_generator kokkos/kokkos --token TOKEN --no-pull-requests --include-labels 'InDevelop' --enhancement-labels 'enhancement,Feature Request' --future-release 'NEWTAG' --between-tags 'NEWTAG,OLDTAG' + + (Copy the new section from the generated CHANGELOG.md to the kokkos/CHANGELOG.md) + (Make desired changes to CHANGELOG.md to enhance clarity) + (Commit and push the CHANGELOG to develop) + 4.2. Merge develop into Master + - DO NOT fast-forward the merge!!!! (From kokkos directory): @@ -103,7 +114,7 @@ Step 4: git reset --hard origin/master git merge --no-ff origin/develop - 4.2. Update the tag in kokkos/config/master_history.txt + 4.3. Update the tag in kokkos/config/master_history.txt Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate Tag format: #.#.## diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt index f2eb674578..78c512ccea 100644 --- a/lib/kokkos/config/master_history.txt +++ b/lib/kokkos/config/master_history.txt @@ -1,3 +1,6 @@ tag: 2.01.00 date: 07:21:2016 master: xxxxxxxx develop: fa6dfcc4 tag: 2.01.06 date: 09:02:2016 master: 9afaa87f develop: 555f1a3a - +tag: 2.01.10 date: 09:27:2016 master: e4119325 develop: e6cda11e +tag: 2.02.00 date: 10:30:2016 master: 6c90a581 develop: ca3dd56e +tag: 2.02.01 date: 11:01:2016 master: 9c698c86 develop: b0072304 +tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966 diff --git a/lib/kokkos/config/nvcc_wrapper b/lib/kokkos/config/nvcc_wrapper index 6093cb61bd..cb206cf88b 100755 --- a/lib/kokkos/config/nvcc_wrapper +++ b/lib/kokkos/config/nvcc_wrapper @@ -121,6 +121,10 @@ do
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*) cuda_args="$cuda_args $1" ;; + #Handle more known nvcc args + --expt-extended-lambda|--expt-relaxed-constexpr) + cuda_args="$cuda_args $1" + ;; #Handle known nvcc args that have an argument -rdc|-maxrregcount|--default-stream) cuda_args="$cuda_args $1 $2" diff --git a/lib/kokkos/config/test_all_sandia b/lib/kokkos/config/test_all_sandia index aac036a8f3..21b8bbff65 100755 --- a/lib/kokkos/config/test_all_sandia +++ b/lib/kokkos/config/test_all_sandia @@ -16,6 +16,8 @@ elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then MACHINE=bowman elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name MACHINE=shepard +elif [[ "$HOSTNAME" =~ apollo ]]; then + MACHINE=apollo elif [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then MACHINE=sems else @@ -28,6 +30,7 @@ IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" +CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" @@ -44,102 +47,12 @@ BUILD_ONLY=False declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3 TEST_SCRIPT=False SKIP_HWLOC=False +SPOT_CHECK=False -ARCH_FLAG="" +PRINT_HELP=False +OPT_FLAG="" +KOKKOS_OPTIONS="" -# -# Machine specific config -# - -if [ "$MACHINE" = "sems" ]; then - source /projects/modulefiles/utils/sems-modules-init.sh - source /projects/modulefiles/utils/kokkos-modules-init.sh - - BASE_MODULE_LIST="//base,hwloc/1.10.1///base" - CUDA_MODULE_LIST="/,gcc/4.7.2/base" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST 
g++ $GCC_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - -elif [ "$MACHINE" = "white" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/" - IBM_MODULE_LIST="/xl/" - CUDA_MODULE_LIST="/,gcc/4.9.2" - - # Don't do pthread on white - GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.9.2 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" - ) - - ARCH_FLAG="--arch=Power8" - NUM_JOBS_TO_RUN_IN_PARALLEL=8 - -elif [ "$MACHINE" = "bowman" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/compilers/" - - OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - 
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ) - - ARCH_FLAG="--arch=KNL" - NUM_JOBS_TO_RUN_IN_PARALLEL=8 - -elif [ "$MACHINE" = "shepard" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/compilers/" - - OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ) - - ARCH_FLAG="--arch=HSW" - NUM_JOBS_TO_RUN_IN_PARALLEL=8 - -else - echo "Unhandled machine $MACHINE" >&2 - exit 1 -fi - -export OMP_NUM_THREADS=4 - -declare -i NUM_RESULTS_TO_KEEP=7 - -RESULT_ROOT_PREFIX=TestAll - -SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd ) # # Handle arguments @@ -173,7 +86,211 @@ NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" --dry-run*) DRYRUN=True ;; ---help) +--spot-check*) +SPOT_CHECK=True +;; +--arch*) +ARCH_FLAG="--arch=${key#*=}" +;; +--opt-flag*) +OPT_FLAG="${key#*=}" +;; +--with-cuda-options*) +KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}" +;; +--help*) +PRINT_HELP=True +;; +*) +# args, just append +ARGS="$ARGS $1" +;; +esac +shift +done + +SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. 
&& pwd ) + +# set kokkos path +if [ -z "$KOKKOS_PATH" ]; then + KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT +else + # Ensure KOKKOS_PATH is abs path + KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) +fi + +# +# Machine specific config +# + +if [ "$MACHINE" = "sems" ]; then + source /projects/sems/modulefiles/utils/sems-modules-init.sh + + BASE_MODULE_LIST="sems-env,kokkos-env,sems-/,kokkos-hwloc/1.10.1/base" + CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" + CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="" + fi + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" + "gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" + "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + 
"clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + fi + +elif [ "$MACHINE" = "white" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="/" + IBM_MODULE_LIST="/xl/" + CUDA_MODULE_LIST="/,gcc/5.4.0" + + # Don't do pthread on white + GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + "cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/config/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=Power8,Kepler37" + fi + NUM_JOBS_TO_RUN_IN_PARALLEL=2 + +elif [ "$MACHINE" = "bowman" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="/compilers/" + + OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + ) + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=KNL" + fi + NUM_JOBS_TO_RUN_IN_PARALLEL=2 + +elif [ "$MACHINE" = "shepard" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="/compilers/" + + OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" + + # Format: (compiler module-list build-list exe-name warning-flag) + 
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + ) + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=HSW" + fi + NUM_JOBS_TO_RUN_IN_PARALLEL=2 + +elif [ "$MACHINE" = "apollo" ]; then + source /projects/sems/modulefiles/utils/sems-modules-init.sh + module use /home/projects/modulefiles/local/x86-64 + module load kokkos-env + + module load sems-git + module load sems-tex + module load sems-cmake/3.5.2 + module load sems-gdb + + SKIP_HWLOC=True + + BASE_MODULE_LIST="sems-env,kokkos-env,sems-/,kokkos-hwloc/1.10.1/base" + CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" + CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" + + CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,/,cuda/8.0.44" + NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,/,sems-gcc/5.3.0" + + BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" + BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" + BUILD_LIST_CLANG="Serial,Pthread,OpenMP" + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" + "gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" + "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "clang/head $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS" + "cuda/8.0.44 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" + "clang/head $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ 
$CUDA_WARNING_FLAGS" + "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" + "gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + fi + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=SNB,Kepler35" + fi + NUM_JOBS_TO_RUN_IN_PARALLEL=2 +else + echo "Unhandled machine $MACHINE" >&2 + exit 1 +fi + + + +export OMP_NUM_THREADS=4 + +declare -i NUM_RESULTS_TO_KEEP=7 + +RESULT_ROOT_PREFIX=TestAll + +if [ "$PRINT_HELP" = "True" ]; then echo "test_all_sandia :" echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory" echo " Defaults to root repo containing this script" @@ -183,6 +300,9 @@ echo "--skip-hwloc: Do not do hwloc tests" echo "--num=N: Number of jobs to run in parallel " echo "--dry-run: Just print what would be executed" echo "--build-only: Just do builds, don't run anything" +echo "--opt-flag=FLAG: Optimization flag (default: -O3)" +echo "--arch=ARCHITECTURE: overwrite architecture flags" +echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS" echo "--build-list=BUILD,BUILD,BUILD..." 
echo " Provide a comma-separated list of builds instead of running all builds" echo " Valid items:" @@ -220,21 +340,6 @@ echo " hit ctrl-z" echo " % kill -9 %1" echo exit 0 -;; -*) -# args, just append -ARGS="$ARGS $1" -;; -esac -shift -done - -# set kokkos path -if [ -z "$KOKKOS_PATH" ]; then - KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT -else - # Ensure KOKKOS_PATH is abs path - KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) fi # set build type @@ -381,11 +486,15 @@ single_build_and_test() { local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info))) fi + if [[ "$OPT_FLAG" = "" ]]; then + OPT_FLAG="-O3" + fi + if [[ "$build_type" = *debug* ]]; then local extra_args="$extra_args --debug" local cxxflags="-g $compiler_warning_flags" else - local cxxflags="-O3 $compiler_warning_flags" + local cxxflags="$OPT_FLAG $compiler_warning_flags" fi if [[ "$compiler" == cuda* ]]; then @@ -393,7 +502,9 @@ single_build_and_test() { export TMPDIR=$(pwd) fi - # cxxflags="-DKOKKOS_USING_EXP_VIEW=1 $cxxflags" + if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then + local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS" + fi echo " Starting job $desc" @@ -440,13 +551,14 @@ run_in_background() { local compiler=$1 local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL - if [[ "$BUILD_ONLY" == True ]]; then - num_jobs=8 - else + # don't override command line input + # if [[ "$BUILD_ONLY" == True ]]; then + # num_jobs=8 + # else if [[ "$compiler" == cuda* ]]; then num_jobs=1 fi - fi + # fi wait_for_jobs $num_jobs single_build_and_test $* & diff --git a/lib/kokkos/config/trilinos-integration/prepare_trilinos_repos.sh b/lib/kokkos/config/trilinos-integration/prepare_trilinos_repos.sh new file mode 100755 index 0000000000..d2a7a533d5 --- /dev/null +++ b/lib/kokkos/config/trilinos-integration/prepare_trilinos_repos.sh @@ -0,0 +1,50 @@ +#!/bin/bash -le + +export TRILINOS_UPDATED_PATH=${PWD}/trilinos-update +export TRILINOS_PRISTINE_PATH=${PWD}/trilinos-pristine + +#rm -rf ${KOKKOS_PATH} +#rm -rf 
${TRILINOS_UPDATED_PATH} +#rm -rf ${TRILINOS_PRISTINE_PATH} + +#Already done: +if [ ! -d "${TRILINOS_UPDATED_PATH}" ]; then + git clone https://github.com/trilinos/trilinos ${TRILINOS_UPDATED_PATH} +fi +if [ ! -d "${TRILINOS_PRISTINE_PATH}" ]; then + git clone https://github.com/trilinos/trilinos ${TRILINOS_PRISTINE_PATH} +fi + +cd ${TRILINOS_UPDATED_PATH} +git checkout develop +git reset --hard origin/develop +git pull +cd .. + +python kokkos/config/snapshot.py ${KOKKOS_PATH} ${TRILINOS_UPDATED_PATH}/packages + +cd ${TRILINOS_UPDATED_PATH} +echo "" +echo "" +echo "Trilinos State:" +git log --pretty=oneline --since=2.days +SHA=`git log --pretty=oneline --since=2.days | head -n 2 | tail -n 1 | awk '{print $1}'` +cd .. + +cd ${TRILINOS_PRISTINE_PATH} +git status +git log --pretty=oneline --since=2.days +echo "Checkout develop" +git checkout develop +echo "Pull" +git pull +echo "Checkout SHA" +git checkout ${SHA} +cd .. + +cd ${TRILINOS_PRISTINE_PATH} +echo "" +echo "" +echo "Trilinos Pristine State:" +git log --pretty=oneline --since=2.days +cd .. 
diff --git a/lib/kokkos/containers/performance_tests/CMakeLists.txt b/lib/kokkos/containers/performance_tests/CMakeLists.txt index 726d403452..403ac746f6 100644 --- a/lib/kokkos/containers/performance_tests/CMakeLists.txt +++ b/lib/kokkos/containers/performance_tests/CMakeLists.txt @@ -1,6 +1,6 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) SET(SOURCES diff --git a/lib/kokkos/containers/performance_tests/Makefile b/lib/kokkos/containers/performance_tests/Makefile index e7abaf44ce..fa3bc77701 100644 --- a/lib/kokkos/containers/performance_tests/Makefile +++ b/lib/kokkos/containers/performance_tests/Makefile @@ -7,21 +7,18 @@ vpath %.cpp ${KOKKOS_PATH}/containers/performance_tests default: build_all echo "End Build" +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) + CXX = $(KOKKOS_PATH)/config/nvcc_wrapper +else + CXX = g++ +endif + +CXXFLAGS = -O3 +LINK ?= $(CXX) +LDFLAGS ?= -lpthread include $(KOKKOS_PATH)/Makefile.kokkos -ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) - CXX = $(NVCC_WRAPPER) - CXXFLAGS ?= -O3 - LINK = $(CXX) - LDFLAGS ?= -lpthread -else - CXX ?= g++ - CXXFLAGS ?= -O3 - LINK ?= $(CXX) - LDFLAGS ?= -lpthread -endif - KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/performance_tests TEST_TARGETS = diff --git a/lib/kokkos/containers/performance_tests/TestCuda.cpp b/lib/kokkos/containers/performance_tests/TestCuda.cpp index 8183adaa60..e7afad905b 100644 --- a/lib/kokkos/containers/performance_tests/TestCuda.cpp +++ b/lib/kokkos/containers/performance_tests/TestCuda.cpp @@ -83,7 +83,7 @@ TEST_F( cuda, dynrankview_perf ) { std::cout << "Cuda" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; - test_dynrankview_op_perf( 4096 ); + test_dynrankview_op_perf( 40960 ); } TEST_F( cuda, global_2_local) diff --git 
a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp index aab6e6988f..d96a3f7432 100644 --- a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp +++ b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp @@ -180,8 +180,8 @@ void test_dynrankview_op_perf( const int par_size ) typedef DeviceType execution_space; typedef typename execution_space::size_type size_type; - const size_type dim2 = 900; - const size_type dim3 = 300; + const size_type dim2 = 90; + const size_type dim3 = 30; double elapsed_time_view = 0; double elapsed_time_compview = 0; diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index 1230df4d97..3a0196ee4c 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -261,9 +261,6 @@ public: modified_device (View ("DualView::modified_device")), modified_host (View ("DualView::modified_host")) { -#if ! KOKKOS_USING_EXP_VIEW - Impl::assert_shapes_are_equal (d_view.shape (), h_view.shape ()); -#else if ( int(d_view.rank) != int(h_view.rank) || d_view.dimension_0() != h_view.dimension_0() || d_view.dimension_1() != h_view.dimension_1() || @@ -284,7 +281,6 @@ public: d_view.span() != h_view.span() ) { Kokkos::Impl::throw_runtime_exception("DualView constructed with incompatible views"); } -#endif } //@} @@ -315,13 +311,13 @@ public: template< class Device > KOKKOS_INLINE_FUNCTION const typename Impl::if_c< - Impl::is_same::value, t_dev, t_host>::type& view () const { return Impl::if_c< - Impl::is_same< + std::is_same< typename t_dev::memory_space, typename Device::memory_space>::value, t_dev, @@ -347,13 +343,13 @@ public: /// appropriate template parameter. 
template void sync( const typename Impl::enable_if< - ( Impl::is_same< typename traits::data_type , typename traits::non_const_data_type>::value) || - ( Impl::is_same< Device , int>::value) + ( std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value) || + ( std::is_same< Device , int>::value) , int >::type& = 0) { const unsigned int dev = Impl::if_c< - Impl::is_same< + std::is_same< typename t_dev::memory_space, typename Device::memory_space>::value , unsigned int, @@ -370,7 +366,7 @@ public: modified_host() = modified_device() = 0; } } - if(Impl::is_same::value) { + if(std::is_same::value) { t_dev::execution_space::fence(); t_host::execution_space::fence(); } @@ -378,13 +374,13 @@ public: template void sync ( const typename Impl::enable_if< - ( ! Impl::is_same< typename traits::data_type , typename traits::non_const_data_type>::value ) || - ( Impl::is_same< Device , int>::value) + ( ! std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value ) || + ( std::is_same< Device , int>::value) , int >::type& = 0 ) { const unsigned int dev = Impl::if_c< - Impl::is_same< + std::is_same< typename t_dev::memory_space, typename Device::memory_space>::value, unsigned int, @@ -405,7 +401,7 @@ public: { const unsigned int dev = Impl::if_c< - Impl::is_same< + std::is_same< typename t_dev::memory_space, typename Device::memory_space>::value , unsigned int, @@ -431,7 +427,7 @@ public: void modify () { const unsigned int dev = Impl::if_c< - Impl::is_same< + std::is_same< typename t_dev::memory_space, typename Device::memory_space>::value, unsigned int, @@ -514,11 +510,7 @@ public: //! The allocation size (same as Kokkos::View::capacity). size_t capacity() const { -#if KOKKOS_USING_EXP_VIEW return d_view.span(); -#else - return d_view.capacity(); -#endif } //! Get stride(s) for each dimension. @@ -555,8 +547,6 @@ public: // Partial specializations of Kokkos::subview() for DualView objects. 
// -#if KOKKOS_USING_EXP_VIEW - namespace Kokkos { namespace Impl { @@ -590,352 +580,6 @@ subview( const DualView & src , Args ... args ) } /* namespace Kokkos */ -#else - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -// -// Partial specializations of Kokkos::subview() for DualView objects. -// - -namespace Kokkos { -namespace Impl { - -template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type - , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type - , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type - > -struct ViewSubview< DualView< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type > - , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type - , SubArg4_type , SubArg5_type , SubArg6_type , SubArg7_type > -{ -private: - - typedef DualView< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type > SrcViewType ; - - enum { V0 = Impl::is_same< SubArg0_type , void >::value ? 1 : 0 }; - enum { V1 = Impl::is_same< SubArg1_type , void >::value ? 1 : 0 }; - enum { V2 = Impl::is_same< SubArg2_type , void >::value ? 1 : 0 }; - enum { V3 = Impl::is_same< SubArg3_type , void >::value ? 1 : 0 }; - enum { V4 = Impl::is_same< SubArg4_type , void >::value ? 1 : 0 }; - enum { V5 = Impl::is_same< SubArg5_type , void >::value ? 1 : 0 }; - enum { V6 = Impl::is_same< SubArg6_type , void >::value ? 1 : 0 }; - enum { V7 = Impl::is_same< SubArg7_type , void >::value ? 1 : 0 }; - - // The source view rank must be equal to the input argument rank - // Once a void argument is encountered all subsequent arguments must be void. - enum { InputRank = - Impl::StaticAssert<( SrcViewType::rank == - ( V0 ? 0 : ( - V1 ? 1 : ( - V2 ? 2 : ( - V3 ? 3 : ( - V4 ? 4 : ( - V5 ? 5 : ( - V6 ? 6 : ( - V7 ? 
7 : 8 ))))))) )) - && - ( SrcViewType::rank == - ( 8 - ( V0 + V1 + V2 + V3 + V4 + V5 + V6 + V7 ) ) ) - >::value ? SrcViewType::rank : 0 }; - - enum { R0 = Impl::ViewOffsetRange< SubArg0_type >::is_range ? 1 : 0 }; - enum { R1 = Impl::ViewOffsetRange< SubArg1_type >::is_range ? 1 : 0 }; - enum { R2 = Impl::ViewOffsetRange< SubArg2_type >::is_range ? 1 : 0 }; - enum { R3 = Impl::ViewOffsetRange< SubArg3_type >::is_range ? 1 : 0 }; - enum { R4 = Impl::ViewOffsetRange< SubArg4_type >::is_range ? 1 : 0 }; - enum { R5 = Impl::ViewOffsetRange< SubArg5_type >::is_range ? 1 : 0 }; - enum { R6 = Impl::ViewOffsetRange< SubArg6_type >::is_range ? 1 : 0 }; - enum { R7 = Impl::ViewOffsetRange< SubArg7_type >::is_range ? 1 : 0 }; - - enum { OutputRank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) - + unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) }; - - // Reverse - enum { R0_rev = 0 == InputRank ? 0u : ( - 1 == InputRank ? unsigned(R0) : ( - 2 == InputRank ? unsigned(R1) : ( - 3 == InputRank ? unsigned(R2) : ( - 4 == InputRank ? unsigned(R3) : ( - 5 == InputRank ? unsigned(R4) : ( - 6 == InputRank ? unsigned(R5) : ( - 7 == InputRank ? unsigned(R6) : unsigned(R7) ))))))) }; - - typedef typename SrcViewType::array_layout SrcViewLayout ; - - // Choose array layout, attempting to preserve original layout if at all possible. - typedef typename Impl::if_c< - ( // Same Layout IF - // OutputRank 0 - ( OutputRank == 0 ) - || - // OutputRank 1 or 2, InputLayout Left, Interval 0 - // because single stride one or second index has a stride. - ( OutputRank <= 2 && R0 && Impl::is_same::value ) - || - // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1] - // because single stride one or second index has a stride. - ( OutputRank <= 2 && R0_rev && Impl::is_same::value ) - ), SrcViewLayout , Kokkos::LayoutStride >::type OutputViewLayout ; - - // Choose data type as a purely dynamic rank array to accomodate a runtime range. 
- typedef typename Impl::if_c< OutputRank == 0 , typename SrcViewType::value_type , - typename Impl::if_c< OutputRank == 1 , typename SrcViewType::value_type *, - typename Impl::if_c< OutputRank == 2 , typename SrcViewType::value_type **, - typename Impl::if_c< OutputRank == 3 , typename SrcViewType::value_type ***, - typename Impl::if_c< OutputRank == 4 , typename SrcViewType::value_type ****, - typename Impl::if_c< OutputRank == 5 , typename SrcViewType::value_type *****, - typename Impl::if_c< OutputRank == 6 , typename SrcViewType::value_type ******, - typename Impl::if_c< OutputRank == 7 , typename SrcViewType::value_type *******, - typename SrcViewType::value_type ******** - >::type >::type >::type >::type >::type >::type >::type >::type OutputData ; - - // Choose space. - // If the source view's template arg1 or arg2 is a space then use it, - // otherwise use the source view's execution space. - - typedef typename Impl::if_c< Impl::is_space< SrcArg1Type >::value , SrcArg1Type , - typename Impl::if_c< Impl::is_space< SrcArg2Type >::value , SrcArg2Type , typename SrcViewType::execution_space - >::type >::type OutputSpace ; - -public: - - // If keeping the layout then match non-data type arguments - // else keep execution space and memory traits. 
- typedef typename - Impl::if_c< Impl::is_same< SrcViewLayout , OutputViewLayout >::value - , Kokkos::DualView< OutputData , SrcArg1Type , SrcArg2Type , SrcArg3Type > - , Kokkos::DualView< OutputData , OutputViewLayout , OutputSpace - , typename SrcViewType::memory_traits > - >::type type ; -}; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -namespace Kokkos { - -template< class D , class A1 , class A2 , class A3 , - class ArgType0 > -typename Impl::ViewSubview< DualView - , ArgType0 , void , void , void - , void , void , void , void - >::type -subview( const DualView & src , - const ArgType0 & arg0 ) -{ - typedef typename - Impl::ViewSubview< DualView - , ArgType0 , void , void , void - , void , void , void , void - >::type - DstViewType ; - DstViewType sub_view; - sub_view.d_view = subview(src.d_view,arg0); - sub_view.h_view = subview(src.h_view,arg0); - sub_view.modified_device = src.modified_device; - sub_view.modified_host = src.modified_host; - return sub_view; -} - - -template< class D , class A1 , class A2 , class A3 , - class ArgType0 , class ArgType1 > -typename Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , void , void - , void , void , void , void - >::type -subview( const DualView & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 ) -{ - typedef typename - Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , void , void - , void , void , void , void - >::type - DstViewType ; - DstViewType sub_view; - sub_view.d_view = subview(src.d_view,arg0,arg1); - sub_view.h_view = subview(src.h_view,arg0,arg1); - sub_view.modified_device = src.modified_device; - sub_view.modified_host = src.modified_host; - return sub_view; -} - -template< class D , class A1 , class A2 , class A3 , - class ArgType0 , class ArgType1 , class ArgType2 > -typename Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , ArgType2 , void - , void , void , void , void - >::type -subview( const DualView & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const 
ArgType2 & arg2 ) -{ - typedef typename - Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , ArgType2 , void - , void , void , void , void - >::type - DstViewType ; - DstViewType sub_view; - sub_view.d_view = subview(src.d_view,arg0,arg1,arg2); - sub_view.h_view = subview(src.h_view,arg0,arg1,arg2); - sub_view.modified_device = src.modified_device; - sub_view.modified_host = src.modified_host; - return sub_view; -} - -template< class D , class A1 , class A2 , class A3 , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 > -typename Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , void , void , void , void - >::type -subview( const DualView & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 ) -{ - typedef typename - Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , void , void , void , void - >::type - DstViewType ; - DstViewType sub_view; - sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3); - sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3); - sub_view.modified_device = src.modified_device; - sub_view.modified_host = src.modified_host; - return sub_view; -} - -template< class D , class A1 , class A2 , class A3 , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , - class ArgType4 > -typename Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , void , void , void - >::type -subview( const DualView & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 , - const ArgType4 & arg4 ) -{ - typedef typename - Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , void , void ,void - >::type - DstViewType ; - DstViewType sub_view; - sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4); - sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4); - 
sub_view.modified_device = src.modified_device; - sub_view.modified_host = src.modified_host; - return sub_view; -} - -template< class D , class A1 , class A2 , class A3 , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , - class ArgType4 , class ArgType5 > -typename Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , ArgType5 , void , void - >::type -subview( const DualView & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 , - const ArgType4 & arg4 , - const ArgType5 & arg5 ) -{ - typedef typename - Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , ArgType5 , void , void - >::type - DstViewType ; - DstViewType sub_view; - sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5); - sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5); - sub_view.modified_device = src.modified_device; - sub_view.modified_host = src.modified_host; - return sub_view; -} - -template< class D , class A1 , class A2 , class A3 , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , - class ArgType4 , class ArgType5 , class ArgType6 > -typename Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , ArgType5 , ArgType6 , void - >::type -subview( const DualView & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 , - const ArgType4 & arg4 , - const ArgType5 & arg5 , - const ArgType6 & arg6 ) -{ - typedef typename - Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , ArgType5 , ArgType6 , void - >::type - DstViewType ; - DstViewType sub_view; - sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6); - sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6); - sub_view.modified_device = src.modified_device; - sub_view.modified_host = 
src.modified_host; - return sub_view; -} - -template< class D , class A1 , class A2 , class A3 , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , - class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 > -typename Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , ArgType5 , ArgType6 , ArgType7 - >::type -subview( const DualView & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 , - const ArgType4 & arg4 , - const ArgType5 & arg5 , - const ArgType6 & arg6 , - const ArgType7 & arg7 ) -{ - typedef typename - Impl::ViewSubview< DualView - , ArgType0 , ArgType1 , ArgType2 , ArgType3 - , ArgType4 , ArgType5 , ArgType6 , ArgType7 - >::type - DstViewType ; - DstViewType sub_view; - sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7); - sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7); - sub_view.modified_device = src.modified_device; - sub_view.modified_host = src.modified_host; - return sub_view; -} - -} // namespace Kokkos - -#endif /* KOKKOS_USING_EXP_VIEW */ - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index f72277700a..1ac92b9d17 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -223,14 +223,85 @@ struct DynRankDimTraits { ); } - template < typename DynRankViewType , typename iType > - void verify_dynrankview_rank ( iType N , const DynRankViewType &drv ) - { - if ( static_cast(drv.rank()) > N ) - { - Kokkos::abort( "Need at least rank arguments to the operator()" ); - } + +/** \brief Debug bounds-checking routines */ +// Enhanced debug checking - most infrastructure matches that of functions 
in +// Kokkos_ViewMapping; additional checks for extra arguments beyond rank are 0 +template< unsigned , typename iType0 , class MapType > +KOKKOS_INLINE_FUNCTION +bool dyn_rank_view_verify_operator_bounds( const iType0 & , const MapType & ) +{ return true ; } + +template< unsigned R , typename iType0 , class MapType , typename iType1 , class ... Args > +KOKKOS_INLINE_FUNCTION +bool dyn_rank_view_verify_operator_bounds + ( const iType0 & rank + , const MapType & map + , const iType1 & i + , Args ... args + ) +{ + if ( static_cast(R) < rank ) { + return ( size_t(i) < map.extent(R) ) + && dyn_rank_view_verify_operator_bounds( rank , map , args ... ); } + else if ( i != 0 ) { + printf("DynRankView Debug Bounds Checking Error: at rank %u\n Extra arguments beyond the rank must be zero \n",R); + return ( false ) + && dyn_rank_view_verify_operator_bounds( rank , map , args ... ); + } + else { + return ( true ) + && dyn_rank_view_verify_operator_bounds( rank , map , args ... ); + } +} + +template< unsigned , class MapType > +inline +void dyn_rank_view_error_operator_bounds( char * , int , const MapType & ) +{} + +template< unsigned R , class MapType , class iType , class ... Args > +inline +void dyn_rank_view_error_operator_bounds + ( char * buf + , int len + , const MapType & map + , const iType & i + , Args ... args + ) +{ + const int n = + snprintf(buf,len," %ld < %ld %c" + , static_cast(i) + , static_cast( map.extent(R) ) + , ( sizeof...(Args) ? ',' : ')' ) + ); + dyn_rank_view_error_operator_bounds(buf+n,len-n,map,args...); +} + +// op_rank = rank of the operator version that was called +template< typename iType0 , typename iType1 , class MapType , class ... Args > +KOKKOS_INLINE_FUNCTION +void dyn_rank_view_verify_operator_bounds + ( const iType0 & op_rank , const iType1 & rank , const char* label , const MapType & map , Args ... 
args ) +{ + if ( static_cast(rank) > op_rank ) { + Kokkos::abort( "DynRankView Bounds Checking Error: Need at least rank arguments to the operator()" ); + } + + if ( ! dyn_rank_view_verify_operator_bounds<0>( rank , map , args ... ) ) { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + enum { LEN = 1024 }; + char buffer[ LEN ]; + int n = snprintf(buffer,LEN,"DynRankView bounds error of view %s (", label); + dyn_rank_view_error_operator_bounds<0>( buffer + n , LEN - n , map , args ... ); + Kokkos::Impl::throw_runtime_exception(std::string(buffer)); +#else + Kokkos::abort("DynRankView bounds error"); +#endif + } +} /** \brief Assign compatible default mappings */ @@ -341,7 +412,6 @@ class DynRankView : public ViewTraits< DataType , Properties ... > private: template < class , class ... > friend class DynRankView ; -// template < class , class ... > friend class Kokkos::Experimental::View ; //unnecessary now... template < class , class ... > friend class Impl::ViewMapping ; public: @@ -504,20 +574,26 @@ private: ( is_layout_left || is_layout_right || is_layout_stride ) }; + template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space + { KOKKOS_FORCEINLINE_FUNCTION static void check() {} }; + + template< class Space > struct verify_space + { KOKKOS_FORCEINLINE_FUNCTION static void check() + { Kokkos::abort("Kokkos::DynRankView ERROR: attempt to access inaccessible memory space"); }; + }; + // Bounds checking macros #if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) -#define KOKKOS_VIEW_OPERATOR_VERIFY( N , ARG ) \ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \ - < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); \ - Kokkos::Experimental::Impl::verify_dynrankview_rank ( N , *this ) ; \ - Kokkos::Experimental::Impl::view_verify_operator_bounds ARG ; +// rank of the calling operator - included as first argument in ARG +#define 
KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \ + DynRankView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); \ + Kokkos::Experimental::Impl::dyn_rank_view_verify_operator_bounds ARG ; #else -#define KOKKOS_VIEW_OPERATOR_VERIFY( N , ARG ) \ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \ - < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); +#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \ + DynRankView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); #endif @@ -532,7 +608,11 @@ public: KOKKOS_INLINE_FUNCTION reference_type operator()() const { - KOKKOS_VIEW_OPERATOR_VERIFY( 0 , ( implementation_map() ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (0 , this->rank() , NULL , m_map) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (0 , this->rank() , m_track.template get_label().c_str(),m_map) ) + #endif return implementation_map().reference(); //return m_map.reference(0,0,0,0,0,0,0); } @@ -563,12 +643,17 @@ public: return rankone_view(i0); } + // Rank 1 parenthesis template< typename iType > KOKKOS_INLINE_FUNCTION typename std::enable_if< (std::is_same::value && std::is_integral::value), reference_type>::type operator()(const iType & i0 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( 1 , ( m_map , i0 ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (1 , this->rank() , NULL , m_map , i0) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (1 , this->rank() , m_track.template get_label().c_str(),m_map,i0) ) + #endif return m_map.reference(i0); } @@ -577,6 +662,11 @@ public: typename std::enable_if< !(std::is_same::value && std::is_integral::value), reference_type>::type operator()(const iType & i0 ) const { + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (1 , this->rank() , NULL , m_map , i0) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (1 , this->rank() , m_track.template get_label().c_str(),m_map,i0) ) 
+ #endif return m_map.reference(i0,0,0,0,0,0,0); } @@ -586,7 +676,11 @@ public: typename std::enable_if< (std::is_same::value && std::is_integral::value && std::is_integral::value), reference_type>::type operator()(const iType0 & i0 , const iType1 & i1 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( 2 , ( m_map , i0 , i1 ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (2 , this->rank() , NULL , m_map , i0 , i1) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (2 , this->rank() , m_track.template get_label().c_str(),m_map,i0,i1) ) + #endif return m_map.reference(i0,i1); } @@ -595,7 +689,11 @@ public: typename std::enable_if< !(std::is_same::value && std::is_integral::value), reference_type>::type operator()(const iType0 & i0 , const iType1 & i1 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( 2 , ( m_map , i0 , i1 ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (2 , this->rank() , NULL , m_map , i0 , i1) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (2 , this->rank() , m_track.template get_label().c_str(),m_map,i0,i1) ) + #endif return m_map.reference(i0,i1,0,0,0,0,0); } @@ -605,7 +703,11 @@ public: typename std::enable_if< (std::is_same::value && std::is_integral::value && std::is_integral::value && std::is_integral::value), reference_type>::type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( 3 , ( m_map , i0 , i1 , i2 ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (3 , this->rank() , NULL , m_map , i0 , i1 , i2) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (3 , this->rank() , m_track.template get_label().c_str(),m_map,i0,i1,i2) ) + #endif return m_map.reference(i0,i1,i2); } @@ -614,7 +716,11 @@ public: typename std::enable_if< !(std::is_same::value && std::is_integral::value), reference_type>::type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( 3 , ( m_map 
, i0 , i1 , i2 ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (3 , this->rank() , NULL , m_map , i0 , i1 , i2) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (3 , this->rank() , m_track.template get_label().c_str(),m_map,i0,i1,i2) ) + #endif return m_map.reference(i0,i1,i2,0,0,0,0); } @@ -624,7 +730,11 @@ public: typename std::enable_if< (std::is_same::value && std::is_integral::value && std::is_integral::value && std::is_integral::value && std::is_integral::value), reference_type>::type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( 4 , ( m_map , i0 , i1 , i2 , i3 ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (4 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (4 , this->rank() , m_track.template get_label().c_str(),m_map,i0,i1,i2,i3) ) + #endif return m_map.reference(i0,i1,i2,i3); } @@ -633,7 +743,11 @@ public: typename std::enable_if< !(std::is_same::value && std::is_integral::value), reference_type>::type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( 4 , ( m_map , i0 , i1 , i2 , i3 ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (4 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (4 , this->rank() , m_track.template get_label().c_str(),m_map,i0,i1,i2,i3) ) + #endif return m_map.reference(i0,i1,i2,i3,0,0,0); } @@ -643,7 +757,11 @@ public: typename std::enable_if< (std::is_same::value && std::is_integral::value && std::is_integral::value && std::is_integral::value && std::is_integral::value && std::is_integral::value), reference_type>::type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( 5 , ( m_map , 
i0 , i1 , i2 , i3 , i4 ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (5 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (5 , this->rank() , m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4) ) + #endif return m_map.reference(i0,i1,i2,i3,i4); } @@ -652,7 +770,11 @@ public: typename std::enable_if< !(std::is_same::value && std::is_integral::value), reference_type>::type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( 5 , ( m_map , i0 , i1 , i2 , i3 , i4 ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (5 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (5 , this->rank() , m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4) ) + #endif return m_map.reference(i0,i1,i2,i3,i4,0,0); } @@ -662,7 +784,11 @@ public: typename std::enable_if< (std::is_same::value && std::is_integral::value && std::is_integral::value && std::is_integral::value && std::is_integral::value && std::is_integral::value && std::is_integral::value), reference_type>::type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( 6 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (6 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4 , i5) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (6 , this->rank() , m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4,i5) ) + #endif return m_map.reference(i0,i1,i2,i3,i4,i5); } @@ -671,7 +797,11 @@ public: typename std::enable_if< !(std::is_same::value && std::is_integral::value), reference_type>::type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , 
const iType4 & i4 , const iType5 & i5 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( 6 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (6 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4 , i5) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (6 , this->rank() , m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4,i5) ) + #endif return m_map.reference(i0,i1,i2,i3,i4,i5,0); } @@ -681,7 +811,11 @@ public: typename std::enable_if< (std::is_integral::value && std::is_integral::value && std::is_integral::value && std::is_integral::value && std::is_integral::value && std::is_integral::value && std::is_integral::value), reference_type>::type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( 7 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 , i6 ) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (7 , this->rank() , NULL , m_map , i0 , i1 , i2 , i3, i4 , i5 , i6) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (7 , this->rank() , m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6) ) + #endif return m_map.reference(i0,i1,i2,i3,i4,i5,i6); } @@ -1136,13 +1270,13 @@ private: public: - typedef Kokkos::Experimental::ViewTraits + typedef Kokkos::ViewTraits < data_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > traits_type ; - typedef Kokkos::Experimental::View + typedef Kokkos::View < data_type , array_layout , typename SrcTraits::device_type @@ -1154,13 +1288,13 @@ public: static_assert( Kokkos::Impl::is_memory_traits< MemoryTraits >::value , "" ); - typedef Kokkos::Experimental::ViewTraits + typedef Kokkos::ViewTraits < data_type , array_layout , typename SrcTraits::device_type , MemoryTraits > traits_type ; - typedef Kokkos::Experimental::View + typedef Kokkos::View < data_type , 
array_layout , typename SrcTraits::device_type @@ -1264,7 +1398,7 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args. if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args { Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); } - typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::Experimental::ViewTraits< D*******, P... > , Args... > metafcn ; + typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ; return metafcn::subview( src.rank() , src , args... ); } @@ -1502,10 +1636,10 @@ void deep_copy typedef typename src_type::memory_space src_memory_space ; enum { DstExecCanAccessSrc = - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value }; + Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible }; enum { SrcExecCanAccessDst = - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename src_execution_space::memory_space , dst_memory_space >::value }; + Kokkos::Impl::SpaceAccessibility< src_execution_space , dst_memory_space >::accessible }; if ( (void *) dst.data() != (void*) src.data() ) { @@ -1666,7 +1800,7 @@ inline typename DynRankView::HostMirror create_mirror( const DynRankView & src , typename std::enable_if< - ! std::is_same< typename Kokkos::Experimental::ViewTraits::array_layout + ! 
std::is_same< typename Kokkos::ViewTraits::array_layout , Kokkos::LayoutStride >::value >::type * = 0 ) @@ -1684,7 +1818,7 @@ inline typename DynRankView::HostMirror create_mirror( const DynRankView & src , typename std::enable_if< - std::is_same< typename Kokkos::Experimental::ViewTraits::array_layout + std::is_same< typename Kokkos::ViewTraits::array_layout , Kokkos::LayoutStride >::value >::type * = 0 ) @@ -1779,7 +1913,7 @@ void resize( DynRankView & v , { typedef DynRankView drview_type ; - static_assert( Kokkos::Experimental::ViewTraits::is_managed , "Can only resize managed views" ); + static_assert( Kokkos::ViewTraits::is_managed , "Can only resize managed views" ); drview_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6 ); @@ -1803,7 +1937,7 @@ void realloc( DynRankView & v , { typedef DynRankView drview_type ; - static_assert( Kokkos::Experimental::ViewTraits::is_managed , "Can only realloc managed views" ); + static_assert( Kokkos::ViewTraits::is_managed , "Can only realloc managed views" ); const std::string label = v.label(); diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index fb364f0bf2..3277c007d0 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -56,7 +56,7 @@ namespace Experimental { * Subviews are not allowed. */ template< typename DataType , typename ... P > -class DynamicView : public Kokkos::Experimental::ViewTraits< DataType , P ... > +class DynamicView : public Kokkos::ViewTraits< DataType , P ... 
> { public: @@ -75,6 +75,15 @@ private: std::is_same< typename traits::specialize , void >::value , "DynamicView must have trivial data type" ); + + template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space + { KOKKOS_FORCEINLINE_FUNCTION static void check() {} }; + + template< class Space > struct verify_space + { KOKKOS_FORCEINLINE_FUNCTION static void check() + { Kokkos::abort("Kokkos::DynamicView ERROR: attempt to access inaccessible memory space"); }; + }; + public: typedef Kokkos::Experimental::MemoryPool< typename traits::device_type > memory_pool ; @@ -117,10 +126,10 @@ public: KOKKOS_INLINE_FUNCTION constexpr size_t size() const { return - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace + Kokkos::Impl::MemorySpaceAccess < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space - >::value + >::accessible ? // Runtime size is at the end of the chunk pointer array (*reinterpret_cast( m_chunks + m_chunk_max )) << m_chunk_shift @@ -179,10 +188,7 @@ public: static_assert( Kokkos::Impl::are_integral::value , "Indices must be integral type" ); - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace - < Kokkos::Impl::ActiveExecutionMemorySpace - , typename traits::memory_space - >::verify(); + DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); // Which chunk is being indexed. 
const uintptr_t ic = uintptr_t( i0 >> m_chunk_shift ); @@ -223,15 +229,13 @@ public: { typedef typename traits::value_type value_type ; - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace - < Kokkos::Impl::ActiveExecutionMemorySpace - , typename traits::memory_space >::verify(); + DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ; if ( m_chunk_max < NC ) { #if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) - printf("DynamicView::resize_parallel(%lu) m_chunk_max(%lu) NC(%lu)\n" + printf("DynamicView::resize_parallel(%lu) m_chunk_max(%u) NC(%lu)\n" , n , m_chunk_max , NC ); #endif Kokkos::abort("DynamicView::resize_parallel exceeded maximum size"); @@ -269,9 +273,7 @@ public: inline void resize_serial( size_t n ) { - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace - < Kokkos::Impl::ActiveExecutionMemorySpace - , typename traits::memory_space >::verify(); + DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ; @@ -398,9 +400,7 @@ public: , m_chunk_mask( ( 1 << m_chunk_shift ) - 1 ) , m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift ) { - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace - < Kokkos::Impl::ActiveExecutionMemorySpace - , typename traits::memory_space >::verify(); + DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); // A functor to deallocate all of the chunks upon final destruction @@ -452,7 +452,7 @@ void deep_copy( const View & dst typedef typename ViewTraits::memory_space src_memory_space ; enum { DstExecCanAccessSrc = - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value }; + Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible }; if ( DstExecCanAccessSrc ) { // Copying data between views in accessible 
memory spaces and either non-contiguous or incompatible shape. @@ -476,7 +476,7 @@ void deep_copy( const DynamicView & dst typedef typename ViewTraits::memory_space src_memory_space ; enum { DstExecCanAccessSrc = - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value }; + Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible }; if ( DstExecCanAccessSrc ) { // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape. diff --git a/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp b/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp new file mode 100644 index 0000000000..4c90e4c238 --- /dev/null +++ b/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp @@ -0,0 +1,196 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXPERIMENTAL_ERROR_REPORTER_HPP +#define KOKKOS_EXPERIMENTAL_ERROR_REPORTER_HPP + +#include +#include +#include +#include + +namespace Kokkos { +namespace Experimental { + +template +class ErrorReporter +{ +public: + + typedef ReportType report_type; + typedef DeviceType device_type; + typedef typename device_type::execution_space execution_space; + + ErrorReporter(int max_results) + : m_numReportsAttempted(""), + m_reports("", max_results), + m_reporters("", max_results) + { + clear(); + } + + int getCapacity() const { return m_reports.h_view.dimension_0(); } + + int getNumReports(); + + int getNumReportAttempts(); + + void getReports(std::vector &reporters_out, std::vector &reports_out); + void getReports( typename Kokkos::View::HostMirror &reporters_out, + typename Kokkos::View::HostMirror &reports_out); + + void clear(); + + void resize(const size_t new_size); + + bool full() {return (getNumReportAttempts() >= getCapacity()); } + + KOKKOS_INLINE_FUNCTION + bool add_report(int reporter_id, report_type report) const 
+ { + int idx = Kokkos::atomic_fetch_add(&m_numReportsAttempted(), 1); + + if (idx >= 0 && (idx < static_cast(m_reports.d_view.dimension_0()))) { + m_reporters.d_view(idx) = reporter_id; + m_reports.d_view(idx) = report; + return true; + } + else { + return false; + } + } + +private: + + typedef Kokkos::View reports_view_t; + typedef Kokkos::DualView reports_dualview_t; + + typedef typename reports_dualview_t::host_mirror_space host_mirror_space; + Kokkos::View m_numReportsAttempted; + reports_dualview_t m_reports; + Kokkos::DualView m_reporters; + +}; + + +template +inline int ErrorReporter::getNumReports() +{ + int num_reports = 0; + Kokkos::deep_copy(num_reports,m_numReportsAttempted); + if (num_reports > static_cast(m_reports.h_view.dimension_0())) { + num_reports = m_reports.h_view.dimension_0(); + } + return num_reports; +} + +template +inline int ErrorReporter::getNumReportAttempts() +{ + int num_reports = 0; + Kokkos::deep_copy(num_reports,m_numReportsAttempted); + return num_reports; +} + +template +void ErrorReporter::getReports(std::vector &reporters_out, std::vector &reports_out) +{ + int num_reports = getNumReports(); + reporters_out.clear(); + reporters_out.reserve(num_reports); + reports_out.clear(); + reports_out.reserve(num_reports); + + if (num_reports > 0) { + m_reports.template sync(); + m_reporters.template sync(); + + for (int i = 0; i < num_reports; ++i) { + reporters_out.push_back(m_reporters.h_view(i)); + reports_out.push_back(m_reports.h_view(i)); + } + } +} + +template +void ErrorReporter::getReports( + typename Kokkos::View::HostMirror &reporters_out, + typename Kokkos::View::HostMirror &reports_out) +{ + int num_reports = getNumReports(); + reporters_out = typename Kokkos::View::HostMirror("ErrorReport::reporters_out",num_reports); + reports_out = typename Kokkos::View::HostMirror("ErrorReport::reports_out",num_reports); + + if (num_reports > 0) { + m_reports.template sync(); + m_reporters.template sync(); + + for (int i = 0; i < 
num_reports; ++i) { + reporters_out(i) = m_reporters.h_view(i); + reports_out(i) = m_reports.h_view(i); + } + } +} + +template +void ErrorReporter::clear() +{ + int num_reports=0; + Kokkos::deep_copy(m_numReportsAttempted, num_reports); + m_reports.template modify(); + m_reporters.template modify(); +} + +template +void ErrorReporter::resize(const size_t new_size) +{ + m_reports.resize(new_size); + m_reporters.resize(new_size); + Kokkos::fence(); +} + + +} // namespace Experimental +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/containers/src/Kokkos_SegmentedView.hpp b/lib/kokkos/containers/src/Kokkos_SegmentedView.hpp deleted file mode 100644 index 5dd7a98b89..0000000000 --- a/lib/kokkos/containers/src/Kokkos_SegmentedView.hpp +++ /dev/null @@ -1,531 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission.
-// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_SEGMENTED_VIEW_HPP_ -#define KOKKOS_SEGMENTED_VIEW_HPP_ - -#include -#include -#include - -#if ! KOKKOS_USING_EXP_VIEW - -namespace Kokkos { -namespace Experimental { - -namespace Impl { - -template -struct delete_segmented_view; - -template -inline -void DeviceSetAllocatableMemorySize(size_t) {} - -#if defined( KOKKOS_HAVE_CUDA ) - -template<> -inline -void DeviceSetAllocatableMemorySize(size_t size) { -#ifdef __CUDACC__ - size_t size_limit; - cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize); - if(size_limit -inline -void DeviceSetAllocatableMemorySize(size_t size) { -#ifdef __CUDACC__ - size_t size_limit; - cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize); - if(size_limit -class SegmentedView : public Kokkos::ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > -{ -public: - //! \name Typedefs for device types and various Kokkos::View specializations. - //@{ - typedef Kokkos::ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ; - - //! The type of a Kokkos::View on the device. 
- typedef Kokkos::View< typename traits::data_type , - typename traits::array_layout , - typename traits::memory_space , - Kokkos::MemoryUnmanaged > t_dev ; - - -private: - Kokkos::View segments_; - - Kokkos::View realloc_lock; - Kokkos::View nsegments_; - - size_t segment_length_; - size_t segment_length_m1_; - int max_segments_; - - int segment_length_log2; - - // Dimensions, cardinality, capacity, and offset computation for - // multidimensional array view of contiguous memory. - // Inherits from Impl::Shape - typedef Kokkos::Impl::ViewOffset< typename traits::shape_type - , typename traits::array_layout - > offset_map_type ; - - offset_map_type m_offset_map ; - - typedef Kokkos::View< typename traits::array_intrinsic_type , - typename traits::array_layout , - typename traits::memory_space , - typename traits::memory_traits > array_type ; - - typedef Kokkos::View< typename traits::const_data_type , - typename traits::array_layout , - typename traits::memory_space , - typename traits::memory_traits > const_type ; - - typedef Kokkos::View< typename traits::non_const_data_type , - typename traits::array_layout , - typename traits::memory_space , - typename traits::memory_traits > non_const_type ; - - typedef Kokkos::View< typename traits::non_const_data_type , - typename traits::array_layout , - HostSpace , - void > HostMirror ; - - template< bool Accessible > - KOKKOS_INLINE_FUNCTION - typename Kokkos::Impl::enable_if< Accessible , typename traits::size_type >::type - dimension_0_intern() const { return nsegments_() * segment_length_ ; } - - template< bool Accessible > - KOKKOS_INLINE_FUNCTION - typename Kokkos::Impl::enable_if< ! Accessible , typename traits::size_type >::type - dimension_0_intern() const - { - // In Host space - int n = 0 ; -#if ! 
defined( __CUDA_ARCH__ ) - Kokkos::Impl::DeepCopy< HostSpace , typename traits::memory_space >( & n , nsegments_.ptr_on_device() , sizeof(int) ); -#endif - - return n * segment_length_ ; - } - -public: - - enum { Rank = traits::rank }; - - KOKKOS_INLINE_FUNCTION offset_map_type shape() const { return m_offset_map ; } - - /* \brief return (current) size of dimension 0 */ - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { - enum { Accessible = Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - Kokkos::Impl::ActiveExecutionMemorySpace, typename traits::memory_space >::value }; - int n = SegmentedView::dimension_0_intern< Accessible >(); - return n ; - } - - /* \brief return size of dimension 1 */ - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; } - /* \brief return size of dimension 2 */ - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; } - /* \brief return size of dimension 3 */ - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; } - /* \brief return size of dimension 4 */ - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; } - /* \brief return size of dimension 5 */ - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; } - /* \brief return size of dimension 6 */ - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; } - /* \brief return size of dimension 7 */ - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; } - - /* \brief return size of dimension 2 */ - KOKKOS_INLINE_FUNCTION typename traits::size_type size() const { - return dimension_0() * - m_offset_map.N1 * m_offset_map.N2 * m_offset_map.N3 * m_offset_map.N4 * - m_offset_map.N5 * m_offset_map.N6 * m_offset_map.N7 ; - } - - template< typename iType > - 
KOKKOS_INLINE_FUNCTION - typename traits::size_type dimension( const iType & i ) const { - if(i==0) - return dimension_0(); - else - return Kokkos::Impl::dimension( m_offset_map , i ); - } - - KOKKOS_INLINE_FUNCTION - typename traits::size_type capacity() { - return segments_.dimension_0() * - m_offset_map.N1 * m_offset_map.N2 * m_offset_map.N3 * m_offset_map.N4 * - m_offset_map.N5 * m_offset_map.N6 * m_offset_map.N7; - } - - KOKKOS_INLINE_FUNCTION - typename traits::size_type get_num_segments() { - enum { Accessible = Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - Kokkos::Impl::ActiveExecutionMemorySpace, typename traits::memory_space >::value }; - int n = SegmentedView::dimension_0_intern< Accessible >(); - return n/segment_length_ ; - } - - KOKKOS_INLINE_FUNCTION - typename traits::size_type get_max_segments() { - return max_segments_; - } - - /// \brief Constructor that allocates View objects with an initial length of 0. - /// - /// This constructor works mostly like the analogous constructor of View. - /// The first argument is a string label, which is entirely for your - /// benefit. (Different SegmentedView objects may have the same label if - /// you like.) The second argument 'view_length' is the size of the segments. - /// This number must be a power of two. The third argument n0 is the maximum - /// value for the first dimension of the segmented view. The maximal allocatable - /// number of Segments is thus: (n0+view_length-1)/view_length. - /// The arguments that follow are the other dimensions of the (1-7) of the - /// View objects. For example, for a View with 3 runtime dimensions, - /// the first 4 integer arguments will be nonzero: - /// SegmentedView("Name",32768,10000000,8,4). This allocates a SegmentedView - /// with a maximum of 306 segments of dimension (32768,8,4). The logical size of - /// the segmented view is (n,8,4) with n between 0 and 10000000. - /// You may omit the integer arguments that follow. 
- template< class LabelType > - SegmentedView(const LabelType & label , - const size_t view_length , - const size_t n0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 - ): segment_length_(view_length),segment_length_m1_(view_length-1) - { - segment_length_log2 = -1; - size_t l = segment_length_; - while(l>0) { - l>>=1; - segment_length_log2++; - } - l = 1<(segment_length_*max_segments_*sizeof(typename traits::value_type)); - - segments_ = Kokkos::View(label , max_segments_); - realloc_lock = Kokkos::View("Lock"); - nsegments_ = Kokkos::View("nviews"); - m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n0*n1*n2*n3*n4*n5*n6*n7 ); - - } - - KOKKOS_INLINE_FUNCTION - SegmentedView(const SegmentedView& src): - segments_(src.segments_), - realloc_lock (src.realloc_lock), - nsegments_ (src.nsegments_), - segment_length_(src.segment_length_), - segment_length_m1_(src.segment_length_m1_), - max_segments_ (src.max_segments_), - segment_length_log2(src.segment_length_log2), - m_offset_map (src.m_offset_map) - {} - - KOKKOS_INLINE_FUNCTION - SegmentedView& operator= (const SegmentedView& src) { - segments_ = src.segments_; - realloc_lock = src.realloc_lock; - nsegments_ = src.nsegments_; - segment_length_= src.segment_length_; - segment_length_m1_= src.segment_length_m1_; - max_segments_ = src.max_segments_; - segment_length_log2= src.segment_length_log2; - m_offset_map = src.m_offset_map; - return *this; - } - - ~SegmentedView() { - if ( !segments_.tracker().ref_counting()) { return; } - size_t ref_count = segments_.tracker().ref_count(); - if(ref_count == 1u) { - Kokkos::fence(); - typename Kokkos::View::HostMirror h_nviews("h_nviews"); - Kokkos::deep_copy(h_nviews,nsegments_); - Kokkos::parallel_for(h_nviews(),Impl::delete_segmented_view(*this)); - } - } - - KOKKOS_INLINE_FUNCTION - t_dev get_segment(const int& i) const { - return segments_[i]; - } - - 
template< class MemberType> - KOKKOS_INLINE_FUNCTION - void grow (MemberType& team_member, const size_t& growSize) const { - if (growSize>max_segments_*segment_length_) { - printf ("Exceeding maxSize: %lu %lu\n", growSize, max_segments_*segment_length_); - return; - } - - if(team_member.team_rank()==0) { - bool too_small = growSize > segment_length_ * nsegments_(); - if (too_small) { - while(Kokkos::atomic_compare_exchange(&realloc_lock(),0,1) ) - ; // get the lock - too_small = growSize > segment_length_ * nsegments_(); // Recheck once we have the lock - if(too_small) { - while(too_small) { - const size_t alloc_size = segment_length_*m_offset_map.N1*m_offset_map.N2*m_offset_map.N3* - m_offset_map.N4*m_offset_map.N5*m_offset_map.N6*m_offset_map.N7; - typename traits::non_const_value_type* const ptr = new typename traits::non_const_value_type[alloc_size]; - - segments_(nsegments_()) = - t_dev(ptr,segment_length_,m_offset_map.N1,m_offset_map.N2,m_offset_map.N3,m_offset_map.N4,m_offset_map.N5,m_offset_map.N6,m_offset_map.N7); - nsegments_()++; - too_small = growSize > segment_length_ * nsegments_(); - } - } - realloc_lock() = 0; //release the lock - } - } - team_member.team_barrier(); - } - - KOKKOS_INLINE_FUNCTION - void grow_non_thread_safe (const size_t& growSize) const { - if (growSize>max_segments_*segment_length_) { - printf ("Exceeding maxSize: %lu %lu\n", growSize, max_segments_*segment_length_); - return; - } - bool too_small = growSize > segment_length_ * nsegments_(); - if(too_small) { - while(too_small) { - const size_t alloc_size = segment_length_*m_offset_map.N1*m_offset_map.N2*m_offset_map.N3* - m_offset_map.N4*m_offset_map.N5*m_offset_map.N6*m_offset_map.N7; - typename traits::non_const_value_type* const ptr = - new typename traits::non_const_value_type[alloc_size]; - - segments_(nsegments_()) = - t_dev (ptr, segment_length_, m_offset_map.N1, m_offset_map.N2, - m_offset_map.N3, m_offset_map.N4, m_offset_map.N5, - m_offset_map.N6, m_offset_map.N7); - 
nsegments_()++; - too_small = growSize > segment_length_ * nsegments_(); - } - } - } - - template< typename iType0 > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if<( std::is_integral::value && traits::rank == 1 ) - , typename traits::value_type & - >::type - operator() ( const iType0 & i0 ) const - { - return segments_[i0>>segment_length_log2](i0&(segment_length_m1_)); - } - - template< typename iType0 , typename iType1 > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if<( std::is_integral::value && - std::is_integral::value && - traits::rank == 2 ) - , typename traits::value_type & - >::type - operator() ( const iType0 & i0 , const iType1 & i1 ) const - { - return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1); - } - - template< typename iType0 , typename iType1 , typename iType2 > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if<( std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - traits::rank == 3 ) - , typename traits::value_type & - >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const - { - return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2); - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if<( std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - traits::rank == 4 ) - , typename traits::value_type & - >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const - { - return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3); - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if<( std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - 
std::is_integral::value && - traits::rank == 5 ) - , typename traits::value_type & - >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 ) const - { - return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4); - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if<( std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - traits::rank == 6 ) - , typename traits::value_type & - >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 ) const - { - return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5); - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if<( std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - traits::rank == 7 ) - , typename traits::value_type & - >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const - { - return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5,i6); - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 , typename iType7 > - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if<( std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - 
std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - std::is_integral::value && - traits::rank == 8 ) - , typename traits::value_type & - >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const - { - return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5,i6,i7); - } -}; - -namespace Impl { -template -struct delete_segmented_view { - typedef SegmentedView view_type; - typedef typename view_type::execution_space execution_space; - - view_type view_; - delete_segmented_view(view_type view):view_(view) { - } - - KOKKOS_INLINE_FUNCTION - void operator() (int i) const { - delete [] view_.get_segment(i).ptr_on_device(); - } -}; - -} -} -} - -#endif - -#endif diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp index 7a916c6ef7..8646d27792 100644 --- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -241,9 +241,9 @@ public: typedef UnorderedMap modifiable_map_type; typedef UnorderedMap const_map_type; - static const bool is_set = Impl::is_same::value; - static const bool has_const_key = Impl::is_same::value; - static const bool has_const_value = is_set || Impl::is_same::value; + static const bool is_set = std::is_same::value; + static const bool has_const_key = std::is_same::value; + static const bool has_const_value = is_set || std::is_same::value; static const bool is_insertable_map = !has_const_key && (is_set || !has_const_value); static const bool is_modifiable_map = has_const_key && !has_const_value; @@ -735,8 +735,8 @@ public: } template - typename Impl::enable_if< Impl::is_same< typename Impl::remove_const::type, key_type>::value && - Impl::is_same< typename Impl::remove_const::type, value_type>::value + typename 
Impl::enable_if< std::is_same< typename Impl::remove_const::type, key_type>::value && + std::is_same< typename Impl::remove_const::type, value_type>::value >::type create_copy_view( UnorderedMap const& src) { diff --git a/lib/kokkos/containers/unit_tests/CMakeLists.txt b/lib/kokkos/containers/unit_tests/CMakeLists.txt index 7fff0f835b..b9d860f32f 100644 --- a/lib/kokkos/containers/unit_tests/CMakeLists.txt +++ b/lib/kokkos/containers/unit_tests/CMakeLists.txt @@ -1,6 +1,6 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) SET(SOURCES diff --git a/lib/kokkos/containers/unit_tests/Makefile b/lib/kokkos/containers/unit_tests/Makefile index 48e3ff61d0..c45e2be05e 100644 --- a/lib/kokkos/containers/unit_tests/Makefile +++ b/lib/kokkos/containers/unit_tests/Makefile @@ -7,21 +7,18 @@ vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests default: build_all echo "End Build" +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) + CXX = $(KOKKOS_PATH)/config/nvcc_wrapper +else + CXX = g++ +endif + +CXXFLAGS = -O3 +LINK ?= $(CXX) +LDFLAGS ?= -lpthread include $(KOKKOS_PATH)/Makefile.kokkos -ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) - CXX = $(NVCC_WRAPPER) - CXXFLAGS ?= -O3 - LINK = $(CXX) - LDFLAGS ?= -lpthread -else - CXX ?= g++ - CXXFLAGS ?= -O3 - LINK ?= $(CXX) - LDFLAGS ?= -lpthread -endif - KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests TEST_TARGETS = diff --git a/lib/kokkos/containers/unit_tests/TestCuda.cpp b/lib/kokkos/containers/unit_tests/TestCuda.cpp index e30160b24e..6be38cd7a7 100644 --- a/lib/kokkos/containers/unit_tests/TestCuda.cpp +++ b/lib/kokkos/containers/unit_tests/TestCuda.cpp @@ -59,11 +59,13 @@ #include #include #include -#include #include #include +#include +#include + //---------------------------------------------------------------------------- @@ 
-133,11 +135,6 @@ void cuda_test_dualview_combinations(unsigned int size) test_dualview_combinations(size); } -void cuda_test_segmented_view(unsigned int size) -{ - test_segmented_view(size); -} - void cuda_test_bitset() { test_bitset(); @@ -184,11 +181,6 @@ void cuda_test_bitset() cuda_test_dualview_combinations(size); \ } -#define CUDA_SEGMENTEDVIEW_TEST( size ) \ - TEST_F( cuda, segmentedview_##size##x) { \ - cuda_test_segmented_view(size); \ - } - CUDA_DUALVIEW_COMBINE_TEST( 10 ) CUDA_VECTOR_COMBINE_TEST( 10 ) CUDA_VECTOR_COMBINE_TEST( 3057 ) @@ -198,7 +190,6 @@ CUDA_INSERT_TEST(close, 100000, 90000, 100, 500) CUDA_INSERT_TEST(far, 100000, 90000, 100, 500) CUDA_DEEP_COPY( 10000, 1 ) CUDA_FAILED_INSERT_TEST( 10000, 1000 ) -CUDA_SEGMENTEDVIEW_TEST( 200 ) #undef CUDA_INSERT_TEST @@ -207,7 +198,6 @@ CUDA_SEGMENTEDVIEW_TEST( 200 ) #undef CUDA_DEEP_COPY #undef CUDA_VECTOR_COMBINE_TEST #undef CUDA_DUALVIEW_COMBINE_TEST -#undef CUDA_SEGMENTEDVIEW_TEST TEST_F( cuda , dynamic_view ) @@ -221,6 +211,18 @@ TEST_F( cuda , dynamic_view ) } +#if defined(KOKKOS_CLASS_LAMBDA) +TEST_F(cuda, ErrorReporterViaLambda) +{ + TestErrorReporter>(); +} +#endif + +TEST_F(cuda, ErrorReporter) +{ + TestErrorReporter>(); +} + } #endif /* #ifdef KOKKOS_HAVE_CUDA */ diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp index e71ccc0091..d062778644 100644 --- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -715,9 +715,9 @@ public: typedef Kokkos::Experimental::DynRankView< T, device, Kokkos::MemoryUnmanaged > dView0_unmanaged ; typedef typename dView0::host_mirror_space host_drv_space ; - typedef Kokkos::Experimental::View< T , device > View0 ; - typedef Kokkos::Experimental::View< T* , device > View1 ; - typedef Kokkos::Experimental::View< T******* , device > View7 ; + typedef Kokkos::View< T , device > View0 ; + typedef Kokkos::View< T* , device > View1 ; + 
typedef Kokkos::View< T******* , device > View7 ; typedef typename View0::host_mirror_space host_view_space ; @@ -1127,8 +1127,7 @@ public: // T v2 = hx(0,0) ; // Generates compile error as intended // hx(0,0) = v2 ; // Generates compile error as intended -/* -#if ! KOKKOS_USING_EXP_VIEW +#if 0 /* Asynchronous deep copies not implemented for dynamic rank view */ // Testing with asynchronous deep copy with respect to device { size_t count = 0 ; @@ -1193,7 +1192,7 @@ public: { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); } }}}} } -#endif */ // #if ! KOKKOS_USING_EXP_VIEW +#endif // Testing with synchronous deep copy { diff --git a/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp b/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp new file mode 100644 index 0000000000..c431b62a53 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp @@ -0,0 +1,227 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP +#define KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP + +#include +#include +#include + +namespace Test { + +// Just save the data in the report. Informative text goes in the operator<<(..). 
+template +struct ThreeValReport +{ + DataType1 m_data1; + DataType2 m_data2; + DataType3 m_data3; + +}; + +template +std::ostream &operator<<(std::ostream & os, const ThreeValReport &val) +{ + return os << "{" << val.m_data1 << " " << val.m_data2 << " " << val.m_data3 << "}"; +} + +template +void checkReportersAndReportsAgree(const std::vector &reporters, + const std::vector &reports) +{ + for (size_t i = 0; i < reports.size(); ++i) { + EXPECT_EQ(1, reporters[i] % 2); + EXPECT_EQ(reporters[i], reports[i].m_data1); + } +} + + +template +struct ErrorReporterDriverBase { + + typedef ThreeValReport report_type; + typedef Kokkos::Experimental::ErrorReporter error_reporter_type; + error_reporter_type m_errorReporter; + + ErrorReporterDriverBase(int reporter_capacity, int test_size) + : m_errorReporter(reporter_capacity) { } + + KOKKOS_INLINE_FUNCTION bool error_condition(const int work_idx) const { return (work_idx % 2 != 0); } + + void check_expectations(int reporter_capacity, int test_size) + { + int num_reported = m_errorReporter.getNumReports(); + int num_attempts = m_errorReporter.getNumReportAttempts(); + + int expected_num_reports = std::min(reporter_capacity, test_size / 2); + EXPECT_EQ(expected_num_reports, num_reported); + EXPECT_EQ(test_size / 2, num_attempts); + + bool expect_full = (reporter_capacity <= (test_size / 2)); + bool reported_full = m_errorReporter.full(); + EXPECT_EQ(expect_full, reported_full); + } +}; + +template +void TestErrorReporter() +{ + typedef ErrorReporterDriverType tester_type; + std::vector reporters; + std::vector reports; + + tester_type test1(100, 10); + test1.m_errorReporter.getReports(reporters, reports); + checkReportersAndReportsAgree(reporters, reports); + + tester_type test2(10, 100); + test2.m_errorReporter.getReports(reporters, reports); + checkReportersAndReportsAgree(reporters, reports); + + typename Kokkos::View::HostMirror view_reporters; + typename Kokkos::View::HostMirror + view_reports; + 
test2.m_errorReporter.getReports(view_reporters, view_reports); + + int num_reports = view_reporters.extent(0); + reporters.clear(); + reports.clear(); + reporters.reserve(num_reports); + reports.reserve(num_reports); + + for (int i = 0; i < num_reports; ++i) { + reporters.push_back(view_reporters(i)); + reports.push_back(view_reports(i)); + } + checkReportersAndReportsAgree(reporters, reports); + +} + + +template +struct ErrorReporterDriver : public ErrorReporterDriverBase +{ + typedef ErrorReporterDriverBase driver_base; + typedef typename driver_base::error_reporter_type::execution_space execution_space; + + ErrorReporterDriver(int reporter_capacity, int test_size) + : driver_base(reporter_capacity, test_size) + { + execute(reporter_capacity, test_size); + + // Test that clear() and resize() work across memory spaces. + if (reporter_capacity < test_size) { + driver_base::m_errorReporter.clear(); + driver_base::m_errorReporter.resize(test_size); + execute(test_size, test_size); + } + } + + void execute(int reporter_capacity, int test_size) + { + Kokkos::parallel_for(Kokkos::RangePolicy(0,test_size), *this); + driver_base::check_expectations(reporter_capacity, test_size); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int work_idx) const + { + if (driver_base::error_condition(work_idx)) { + double val = M_PI * static_cast(work_idx); + typename driver_base::report_type report = {work_idx, -2*work_idx, val}; + driver_base::m_errorReporter.add_report(work_idx, report); + } + } +}; + +#if defined(KOKKOS_CLASS_LAMBDA) +template +struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase +{ + + typedef ErrorReporterDriverBase driver_base; + typedef typename driver_base::error_reporter_type::execution_space execution_space; + + ErrorReporterDriverUseLambda(int reporter_capacity, int test_size) + : driver_base(reporter_capacity, test_size) + { + Kokkos::parallel_for(Kokkos::RangePolicy(0,test_size), KOKKOS_CLASS_LAMBDA (const int work_idx) { + if 
(driver_base::error_condition(work_idx)) { + double val = M_PI * static_cast(work_idx); + typename driver_base::report_type report = {work_idx, -2*work_idx, val}; + driver_base::m_errorReporter.add_report(work_idx, report); + } + }); + driver_base::check_expectations(reporter_capacity, test_size); + } + +}; +#endif + + +#ifdef KOKKOS_HAVE_OPENMP +struct ErrorReporterDriverNativeOpenMP : public ErrorReporterDriverBase +{ + typedef ErrorReporterDriverBase driver_base; + typedef typename driver_base::error_reporter_type::execution_space execution_space; + + ErrorReporterDriverNativeOpenMP(int reporter_capacity, int test_size) + : driver_base(reporter_capacity, test_size) + { +#pragma omp parallel for + for(int work_idx = 0; work_idx < test_size; ++work_idx) + { + if (driver_base::error_condition(work_idx)) { + double val = M_PI * static_cast(work_idx); + typename driver_base::report_type report = {work_idx, -2*work_idx, val}; + driver_base::m_errorReporter.add_report(work_idx, report); + } + }; + driver_base::check_expectations(reporter_capacity, test_size); + } +}; +#endif + +} // namespace Test +#endif // #ifndef KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP diff --git a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp index a4319f39ff..598a296c78 100644 --- a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp +++ b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp @@ -56,12 +56,14 @@ #include #include #include -#include #include #include #include +#include +#include + #include namespace Test { @@ -143,11 +145,6 @@ TEST_F( openmp , staticcrsgraph ) test_dualview_combinations(size); \ } -#define OPENMP_SEGMENTEDVIEW_TEST( size ) \ - TEST_F( openmp, segmentedview_##size##x) { \ - test_segmented_view(size); \ - } - OPENMP_INSERT_TEST(close, 100000, 90000, 100, 500, true) OPENMP_INSERT_TEST(far, 100000, 90000, 100, 500, false) OPENMP_FAILED_INSERT_TEST( 10000, 1000 ) @@ -156,7 +153,6 @@ OPENMP_DEEP_COPY( 10000, 1 ) 
OPENMP_VECTOR_COMBINE_TEST( 10 ) OPENMP_VECTOR_COMBINE_TEST( 3057 ) OPENMP_DUALVIEW_COMBINE_TEST( 10 ) -OPENMP_SEGMENTEDVIEW_TEST( 10000 ) #undef OPENMP_INSERT_TEST #undef OPENMP_FAILED_INSERT_TEST @@ -164,7 +160,6 @@ OPENMP_SEGMENTEDVIEW_TEST( 10000 ) #undef OPENMP_DEEP_COPY #undef OPENMP_VECTOR_COMBINE_TEST #undef OPENMP_DUALVIEW_COMBINE_TEST -#undef OPENMP_SEGMENTEDVIEW_TEST #endif @@ -178,5 +173,22 @@ TEST_F( openmp , dynamic_view ) } } +#if defined(KOKKOS_CLASS_LAMBDA) +TEST_F(openmp, ErrorReporterViaLambda) +{ + TestErrorReporter>(); +} +#endif + +TEST_F(openmp, ErrorReporter) +{ + TestErrorReporter>(); +} + +TEST_F(openmp, ErrorReporterNativeOpenMP) +{ + TestErrorReporter(); +} + } // namespace test diff --git a/lib/kokkos/containers/unit_tests/TestSegmentedView.hpp b/lib/kokkos/containers/unit_tests/TestSegmentedView.hpp deleted file mode 100644 index bfd66d12a7..0000000000 --- a/lib/kokkos/containers/unit_tests/TestSegmentedView.hpp +++ /dev/null @@ -1,708 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_SEGMENTEDVIEW_HPP -#define KOKKOS_TEST_SEGMENTEDVIEW_HPP - -#include -#include -#include -#include -#include - -#if ! 
KOKKOS_USING_EXP_VIEW - -#include -#include - -namespace Test { - -namespace Impl { - - template - struct GrowTest; - - template - struct GrowTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - GrowTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - a.grow(team_member , team_idx+team_member.team_size()); - value += team_idx + team_member.team_rank(); - - if((a.dimension_0()>team_idx+team_member.team_rank()) && - (a.dimension(0)>team_idx+team_member.team_rank())) - a(team_idx+team_member.team_rank()) = team_idx+team_member.team_rank(); - - } - }; - - template - struct GrowTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - GrowTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - a.grow(team_member , team_idx+ team_member.team_size()); - - for( typename ExecutionSpace::size_type k=0;k<7;k++) - value += team_idx + team_member.team_rank() + 13*k; - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct GrowTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - GrowTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - 
a.grow(team_member , team_idx+ team_member.team_size()); - - for( typename ExecutionSpace::size_type k=0;k<7;k++) - for( typename ExecutionSpace::size_type l=0;l<3;l++) - value += team_idx + team_member.team_rank() + 13*k + 3*l; - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct GrowTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - GrowTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - a.grow(team_member , team_idx+ team_member.team_size()); - - for( typename ExecutionSpace::size_type k=0;k<7;k++) - for( typename ExecutionSpace::size_type l=0;l<3;l++) - for( typename ExecutionSpace::size_type m=0;m<2;m++) - value += team_idx + team_member.team_rank() + 13*k + 3*l + 7*m; - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct GrowTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - GrowTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - a.grow(team_member , team_idx+ team_member.team_size()); - - for( typename ExecutionSpace::size_type k=0;k<7;k++) - for( typename ExecutionSpace::size_type l=0;l<3;l++) - for( typename ExecutionSpace::size_type m=0;m<2;m++) - for( typename ExecutionSpace::size_type n=0;n<3;n++) - value += - team_idx + team_member.team_rank() + 13*k + 3*l 
+ 7*m + 5*n; - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct GrowTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - GrowTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - a.grow(team_member , team_idx+ team_member.team_size()); - - for( typename ExecutionSpace::size_type k=0;k<7;k++) - for( typename ExecutionSpace::size_type l=0;l<3;l++) - for( typename ExecutionSpace::size_type m=0;m<2;m++) - for( typename ExecutionSpace::size_type n=0;n<3;n++) - for( typename ExecutionSpace::size_type o=0;o<2;o++) - value += - team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o ; - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct GrowTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - GrowTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - a.grow(team_member , team_idx+ team_member.team_size()); - - for( typename ExecutionSpace::size_type k=0;k<7;k++) - for( typename ExecutionSpace::size_type l=0;l<3;l++) - for( typename ExecutionSpace::size_type m=0;m<2;m++) - for( typename ExecutionSpace::size_type n=0;n<3;n++) - for( typename ExecutionSpace::size_type o=0;o<2;o++) - for( typename ExecutionSpace::size_type p=0;p<4;p++) - value += - team_idx + 
team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p ; - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct GrowTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - GrowTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - a.grow(team_member , team_idx + team_member.team_size()); - - for( typename ExecutionSpace::size_type k=0;k<7;k++) - for( typename ExecutionSpace::size_type l=0;l<3;l++) - for( typename ExecutionSpace::size_type m=0;m<2;m++) - for( typename ExecutionSpace::size_type n=0;n<3;n++) - for( typename ExecutionSpace::size_type o=0;o<2;o++) - for( typename ExecutionSpace::size_type p=0;p<4;p++) - for( typename ExecutionSpace::size_type q=0;q<3;q++) - value += - team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p + 17*q; - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct VerifyTest; - - template - struct VerifyTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - VerifyTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - value += a(team_idx+ team_member.team_rank()); - } - } - }; - - template - struct VerifyTest { - typedef 
ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - VerifyTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct VerifyTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - VerifyTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct VerifyTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - VerifyTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct VerifyTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - VerifyTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) 
const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct VerifyTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - VerifyTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct VerifyTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - VerifyTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct VerifyTest { - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - typedef typename Policy::member_type team_type; - typedef double value_type; - - ViewType a; - - VerifyTest(ViewType in):a(in) {} - - KOKKOS_INLINE_FUNCTION - void operator() (team_type team_member, double& value) const { - unsigned int team_idx = team_member.league_rank() * team_member.team_size(); - - if((a.dimension_0()>team_idx+ team_member.team_rank()) && - (a.dimension(0)>team_idx+ team_member.team_rank())) { - for( typename ExecutionSpace::size_type k=0;k - struct test_segmented_view - { - 
typedef test_segmented_view self_type; - - typedef Scalar scalar_type; - typedef ExecutionSpace execution_space; - typedef Kokkos::TeamPolicy Policy; - - double result; - double reference; - - template - void run_me(ViewType a, int max_length){ - const int team_size = Policy::team_size_max( GrowTest(a) ); - const int nteams = max_length/team_size; - - reference = 0; - result = 0; - - Kokkos::parallel_reduce(Policy(nteams,team_size),GrowTest(a),reference); - Kokkos::fence(); - Kokkos::parallel_reduce(Policy(nteams,team_size),VerifyTest(a),result); - Kokkos::fence(); - } - - - test_segmented_view(unsigned int size,int rank) - { - reference = 0; - result = 0; - - const int dim_1 = 7; - const int dim_2 = 3; - const int dim_3 = 2; - const int dim_4 = 3; - const int dim_5 = 2; - const int dim_6 = 4; - //const int dim_7 = 3; - - if(rank==1) { - typedef Kokkos::Experimental::SegmentedView rank1_view; - run_me< rank1_view >(rank1_view("Rank1",128,size), size); - } - if(rank==2) { - typedef Kokkos::Experimental::SegmentedView rank2_view; - run_me< rank2_view >(rank2_view("Rank2",128,size,dim_1), size); - } - if(rank==3) { - typedef Kokkos::Experimental::SegmentedView rank3_view; - run_me< rank3_view >(rank3_view("Rank3",128,size), size); - } - if(rank==4) { - typedef Kokkos::Experimental::SegmentedView rank4_view; - run_me< rank4_view >(rank4_view("Rank4",128,size,dim_1,dim_2,dim_3), size); - } - if(rank==5) { - typedef Kokkos::Experimental::SegmentedView rank5_view; - run_me< rank5_view >(rank5_view("Rank5",128,size), size); - } - if(rank==6) { - typedef Kokkos::Experimental::SegmentedView rank6_view; - run_me< rank6_view >(rank6_view("Rank6",128,size,dim_1,dim_2,dim_3,dim_4), size); - } - if(rank==7) { - typedef Kokkos::Experimental::SegmentedView rank7_view; - run_me< rank7_view >(rank7_view("Rank7",128,size,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6), size); - } - if(rank==8) { - typedef Kokkos::Experimental::SegmentedView rank8_view; - run_me< rank8_view 
>(rank8_view("Rank8",128,size,dim_1,dim_2,dim_3,dim_4), size); - } - } - - }; - -} // namespace Impl - - - - -template -void test_segmented_view(unsigned int size) -{ - { - typedef Kokkos::Experimental::SegmentedView view_type; - view_type a("A",128,size,7,3,2,3); - double reference; - - Impl::GrowTest f(a); - - const int team_size = Kokkos::TeamPolicy::team_size_max( f ); - const int nteams = (size+team_size-1)/team_size; - - Kokkos::parallel_reduce(Kokkos::TeamPolicy(nteams,team_size),f,reference); - - size_t real_size = ((size+127)/128)*128; - - ASSERT_EQ(real_size,a.dimension_0()); - ASSERT_EQ(7,a.dimension_1()); - ASSERT_EQ(3,a.dimension_2()); - ASSERT_EQ(2,a.dimension_3()); - ASSERT_EQ(3,a.dimension_4()); - ASSERT_EQ(2,a.dimension_5()); - ASSERT_EQ(4,a.dimension_6()); - ASSERT_EQ(3,a.dimension_7()); - ASSERT_EQ(real_size,a.dimension(0)); - ASSERT_EQ(7,a.dimension(1)); - ASSERT_EQ(3,a.dimension(2)); - ASSERT_EQ(2,a.dimension(3)); - ASSERT_EQ(3,a.dimension(4)); - ASSERT_EQ(2,a.dimension(5)); - ASSERT_EQ(4,a.dimension(6)); - ASSERT_EQ(3,a.dimension(7)); - ASSERT_EQ(8,a.Rank); - } - { - Impl::test_segmented_view test(size,1); - ASSERT_EQ(test.reference,test.result); - } - { - Impl::test_segmented_view test(size,2); - ASSERT_EQ(test.reference,test.result); - } - { - Impl::test_segmented_view test(size,3); - ASSERT_EQ(test.reference,test.result); - } - { - Impl::test_segmented_view test(size,4); - ASSERT_EQ(test.reference,test.result); - } - { - Impl::test_segmented_view test(size,5); - ASSERT_EQ(test.reference,test.result); - } - { - Impl::test_segmented_view test(size,6); - ASSERT_EQ(test.reference,test.result); - } - { - Impl::test_segmented_view test(size,7); - ASSERT_EQ(test.reference,test.result); - } - { - Impl::test_segmented_view test(size,8); - ASSERT_EQ(test.reference,test.result); - } - -} - - -} // namespace Test - -#else - -template -void test_segmented_view(unsigned int ) {} - -#endif - -#endif /* #ifndef KOKKOS_TEST_SEGMENTEDVIEW_HPP */ - diff --git 
a/lib/kokkos/containers/unit_tests/TestSerial.cpp b/lib/kokkos/containers/unit_tests/TestSerial.cpp index a7c42d2798..2be27ea613 100644 --- a/lib/kokkos/containers/unit_tests/TestSerial.cpp +++ b/lib/kokkos/containers/unit_tests/TestSerial.cpp @@ -58,7 +58,6 @@ #include #include #include -#include #include #include @@ -67,6 +66,9 @@ #include #include +#include +#include + namespace Test { class serial : public ::testing::Test { @@ -135,11 +137,6 @@ TEST_F( serial, bitset ) test_dualview_combinations(size); \ } -#define SERIAL_SEGMENTEDVIEW_TEST( size ) \ - TEST_F( serial, segmentedview_##size##x) { \ - test_segmented_view(size); \ - } - SERIAL_INSERT_TEST(close, 100000, 90000, 100, 500, true) SERIAL_INSERT_TEST(far, 100000, 90000, 100, 500, false) SERIAL_FAILED_INSERT_TEST( 10000, 1000 ) @@ -148,7 +145,6 @@ SERIAL_DEEP_COPY( 10000, 1 ) SERIAL_VECTOR_COMBINE_TEST( 10 ) SERIAL_VECTOR_COMBINE_TEST( 3057 ) SERIAL_DUALVIEW_COMBINE_TEST( 10 ) -SERIAL_SEGMENTEDVIEW_TEST( 10000 ) #undef SERIAL_INSERT_TEST #undef SERIAL_FAILED_INSERT_TEST @@ -156,7 +152,6 @@ SERIAL_SEGMENTEDVIEW_TEST( 10000 ) #undef SERIAL_DEEP_COPY #undef SERIAL_VECTOR_COMBINE_TEST #undef SERIAL_DUALVIEW_COMBINE_TEST -#undef SERIAL_SEGMENTEDVIEW_TEST TEST_F( serial , dynamic_view ) { @@ -168,6 +163,19 @@ TEST_F( serial , dynamic_view ) } } +#if defined(KOKKOS_CLASS_LAMBDA) +TEST_F(serial, ErrorReporterViaLambda) +{ + TestErrorReporter>(); +} +#endif + +TEST_F(serial, ErrorReporter) +{ + TestErrorReporter>(); +} + + } // namespace Test #endif // KOKKOS_HAVE_SERIAL diff --git a/lib/kokkos/containers/unit_tests/TestThreads.cpp b/lib/kokkos/containers/unit_tests/TestThreads.cpp index 58277528d3..3b34006a01 100644 --- a/lib/kokkos/containers/unit_tests/TestThreads.cpp +++ b/lib/kokkos/containers/unit_tests/TestThreads.cpp @@ -62,11 +62,13 @@ #include #include #include -#include #include #include +#include +#include + namespace Test { class threads : public ::testing::Test { @@ -145,12 +147,6 @@ TEST_F( threads 
, staticcrsgraph ) test_dualview_combinations(size); \ } -#define THREADS_SEGMENTEDVIEW_TEST( size ) \ - TEST_F( threads, segmentedview_##size##x) { \ - test_segmented_view(size); \ - } - - THREADS_INSERT_TEST(far, 100000, 90000, 100, 500, false) THREADS_FAILED_INSERT_TEST( 10000, 1000 ) THREADS_DEEP_COPY( 10000, 1 ) @@ -158,7 +154,6 @@ THREADS_DEEP_COPY( 10000, 1 ) THREADS_VECTOR_COMBINE_TEST( 10 ) THREADS_VECTOR_COMBINE_TEST( 3057 ) THREADS_DUALVIEW_COMBINE_TEST( 10 ) -THREADS_SEGMENTEDVIEW_TEST( 10000 ) #undef THREADS_INSERT_TEST @@ -167,8 +162,6 @@ THREADS_SEGMENTEDVIEW_TEST( 10000 ) #undef THREADS_DEEP_COPY #undef THREADS_VECTOR_COMBINE_TEST #undef THREADS_DUALVIEW_COMBINE_TEST -#undef THREADS_SEGMENTEDVIEW_TEST - TEST_F( threads , dynamic_view ) @@ -181,6 +174,19 @@ TEST_F( threads , dynamic_view ) } } + +#if defined(KOKKOS_CLASS_LAMBDA) +TEST_F(threads, ErrorReporterViaLambda) +{ + TestErrorReporter>(); +} +#endif + +TEST_F(threads, ErrorReporter) +{ + TestErrorReporter>(); +} + } // namespace Test diff --git a/lib/kokkos/core/cmake/Dependencies.cmake b/lib/kokkos/core/cmake/Dependencies.cmake index 34ff0be5d3..ae9a20c50e 100644 --- a/lib/kokkos/core/cmake/Dependencies.cmake +++ b/lib/kokkos/core/cmake/Dependencies.cmake @@ -2,3 +2,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREAD DLlib TEST_OPTIONAL_TPLS CUSPARSE ) + +TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) \ No newline at end of file diff --git a/lib/kokkos/core/cmake/KokkosCore_config.h.in b/lib/kokkos/core/cmake/KokkosCore_config.h.in index 27e3ba1c31..9359b5a32b 100644 --- a/lib/kokkos/core/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/core/cmake/KokkosCore_config.h.in @@ -45,6 +45,16 @@ #define KOKKOS_ENABLE_PROFILING 0 #endif +#cmakedefine KOKKOS_HAVE_CUDA_RDC +#ifdef KOKKOS_HAVE_CUDA_RDC +#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1 +#endif + +#cmakedefine KOKKOS_HAVE_CUDA_LAMBDA +#ifdef KOKKOS_HAVE_CUDA_LAMBDA +#define KOKKOS_CUDA_USE_LAMBDA 1 +#endif + // 
Don't forbid users from defining this macro on the command line, // but still make sure that CMake logic can control its definition. #if ! defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index d93ca14d96..cae52f1409 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -1,6 +1,6 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINRARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) SET(SOURCES PerfTestMain.cpp @@ -19,7 +19,7 @@ TRIBITS_ADD_EXECUTABLE( TESTONLYLIBS kokkos_gtest ) -TRIBITS_ADD_EXECUTABLE_AND_TEST( +TRIBITS_ADD_TEST( PerfTest NAME PerfTestExec COMM serial mpi diff --git a/lib/kokkos/core/perf_test/Makefile b/lib/kokkos/core/perf_test/Makefile index 8fa1fbfc3c..85f869971a 100644 --- a/lib/kokkos/core/perf_test/Makefile +++ b/lib/kokkos/core/perf_test/Makefile @@ -7,21 +7,18 @@ vpath %.cpp ${KOKKOS_PATH}/core/perf_test default: build_all echo "End Build" +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) + CXX = $(KOKKOS_PATH)/config/nvcc_wrapper +else + CXX = g++ +endif + +CXXFLAGS = -O3 +LINK ?= $(CXX) +LDFLAGS ?= -lpthread include $(KOKKOS_PATH)/Makefile.kokkos -ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) - CXX = $(NVCC_WRAPPER) - CXXFLAGS ?= -O3 - LINK = $(CXX) - LDFLAGS ?= -lpthread -else - CXX ?= g++ - CXXFLAGS ?= -O3 - LINK ?= $(CXX) - LDFLAGS ?= -lpthread -endif - KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/perf_test TEST_TARGETS = diff --git a/lib/kokkos/core/perf_test/PerfTestHost.cpp b/lib/kokkos/core/perf_test/PerfTestHost.cpp index 6a0f2efada..4a05eecfe0 100644 --- a/lib/kokkos/core/perf_test/PerfTestHost.cpp +++ b/lib/kokkos/core/perf_test/PerfTestHost.cpp @@ -79,10 +79,21 @@ class host : public ::testing::Test { protected: static void SetUpTestCase() { - const unsigned team_count = 
Kokkos::hwloc::get_available_numa_count(); - const unsigned threads_per_team = 4 ; + if(Kokkos::hwloc::available()) { + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); - TestHostDevice::initialize( team_count * threads_per_team ); + unsigned threads_count = 0 ; + + threads_count = std::max( 1u , numa_count ) + * std::max( 2u , cores_per_numa * threads_per_core ); + + TestHostDevice::initialize( threads_count ); + } else { + const unsigned thread_count = 4 ; + TestHostDevice::initialize( thread_count ); + } } static void TearDownTestCase() diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp deleted file mode 100644 index 4ed7d8e2a8..0000000000 --- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp +++ /dev/null @@ -1,334 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP -#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP - -/* only compile this file if CUDA is enabled for Kokkos */ -#if defined( KOKKOS_HAVE_CUDA ) - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -template<> -struct ViewOperatorBoundsErrorAbort< Kokkos::CudaSpace > { - KOKKOS_INLINE_FUNCTION - static void apply( const size_t rank - , const size_t n0 , const size_t n1 - , const size_t n2 , const size_t n3 - , const size_t n4 , const size_t n5 - , const size_t n6 , const size_t n7 - , const size_t i0 , const size_t i1 - , const size_t i2 , const size_t i3 - , const size_t i4 , const size_t i5 - , const size_t i6 , const size_t i7 ) - { - const int r = - ( n0 <= i0 ? 0 : - ( n1 <= i1 ? 1 : - ( n2 <= i2 ? 2 : - ( n3 <= i3 ? 3 : - ( n4 <= i4 ? 4 : - ( n5 <= i5 ? 5 : - ( n6 <= i6 ? 6 : 7 ))))))); - const size_t n = - ( n0 <= i0 ? n0 : - ( n1 <= i1 ? n1 : - ( n2 <= i2 ? n2 : - ( n3 <= i3 ? n3 : - ( n4 <= i4 ? n4 : - ( n5 <= i5 ? n5 : - ( n6 <= i6 ? n6 : n7 ))))))); - const size_t i = - ( n0 <= i0 ? i0 : - ( n1 <= i1 ? i1 : - ( n2 <= i2 ? i2 : - ( n3 <= i3 ? i3 : - ( n4 <= i4 ? i4 : - ( n5 <= i5 ? i5 : - ( n6 <= i6 ? 
i6 : i7 ))))))); - printf("Cuda view array bounds error index %d : FAILED %lu < %lu\n" , r , i , n ); - Kokkos::Impl::cuda_abort("Cuda view array bounds error"); - } -}; - -} // namespace Impl -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4) -// Via reinterpret_case this can be used to support all scalar types of those sizes. -// Any other scalar type falls back to either normal reads out of global memory, -// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0) - -template< typename ValueType , typename AliasType > -struct CudaTextureFetch { - - ::cudaTextureObject_t m_obj ; - const ValueType * m_ptr ; - int m_offset ; - - // Deference operator pulls through texture object and returns by value - template< typename iType > - KOKKOS_INLINE_FUNCTION - ValueType operator[]( const iType & i ) const - { -#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) - AliasType v = tex1Dfetch( m_obj , i + m_offset ); - return *(reinterpret_cast (&v)); -#else - return m_ptr[ i ]; -#endif - } - - // Pointer to referenced memory - KOKKOS_INLINE_FUNCTION - operator const ValueType * () const { return m_ptr ; } - - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch() : m_obj() , m_ptr() , m_offset() {} - - KOKKOS_INLINE_FUNCTION - ~CudaTextureFetch() {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch( const CudaTextureFetch & rhs ) - : m_obj( rhs.m_obj ) - , m_ptr( rhs.m_ptr ) - , m_offset( rhs.m_offset ) - {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch( CudaTextureFetch && rhs ) - : m_obj( rhs.m_obj ) - , m_ptr( rhs.m_ptr ) - , m_offset( rhs.m_offset ) - {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch & operator = ( const CudaTextureFetch & rhs 
) - { - m_obj = rhs.m_obj ; - m_ptr = rhs.m_ptr ; - m_offset = rhs.m_offset ; - return *this ; - } - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch & operator = ( CudaTextureFetch && rhs ) - { - m_obj = rhs.m_obj ; - m_ptr = rhs.m_ptr ; - m_offset = rhs.m_offset ; - return *this ; - } - - // Texture object spans the entire allocation. - // This handle may view a subset of the allocation, so an offset is required. - template< class CudaMemorySpace > - inline explicit - CudaTextureFetch( const ValueType * const arg_ptr - , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record - ) - : m_obj( record.template attach_texture_object< AliasType >() ) - , m_ptr( arg_ptr ) - , m_offset( record.attach_texture_object_offset( reinterpret_cast( arg_ptr ) ) ) - {} -}; - -#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC ) - -template< typename ValueType , typename AliasType > -struct CudaLDGFetch { - - const ValueType * m_ptr ; - - template< typename iType > - KOKKOS_INLINE_FUNCTION - ValueType operator[]( const iType & i ) const - { - AliasType v = __ldg(reinterpret_cast(&m_ptr[i])); - return *(reinterpret_cast (&v)); - } - - KOKKOS_INLINE_FUNCTION - operator const ValueType * () const { return m_ptr ; } - - KOKKOS_INLINE_FUNCTION - CudaLDGFetch() : m_ptr() {} - - KOKKOS_INLINE_FUNCTION - ~CudaLDGFetch() {} - - KOKKOS_INLINE_FUNCTION - CudaLDGFetch( const CudaLDGFetch & rhs ) - : m_ptr( rhs.m_ptr ) - {} - - KOKKOS_INLINE_FUNCTION - CudaLDGFetch( CudaLDGFetch && rhs ) - : m_ptr( rhs.m_ptr ) - {} - - KOKKOS_INLINE_FUNCTION - CudaLDGFetch & operator = ( const CudaLDGFetch & rhs ) - { - m_ptr = rhs.m_ptr ; - return *this ; - } - - KOKKOS_INLINE_FUNCTION - CudaLDGFetch & operator = ( CudaLDGFetch && rhs ) - { - m_ptr = rhs.m_ptr ; - return *this ; - } - - template< class CudaMemorySpace > - inline explicit - CudaTextureFetch( const ValueType * const arg_ptr - , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const & - ) - : m_ptr( 
arg_data_ptr ) - {} -}; - -#endif - -} // namespace Impl -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization - * if 'const' value type, CudaSpace and random access. - */ -template< class Traits > -class ViewDataHandle< Traits , - typename std::enable_if<( - // Is Cuda memory space - ( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value || - std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value ) - && - // Is a trivial const value of 4, 8, or 16 bytes - std::is_trivial::value - && - std::is_same::value - && - ( sizeof(typename Traits::const_value_type) == 4 || - sizeof(typename Traits::const_value_type) == 8 || - sizeof(typename Traits::const_value_type) == 16 ) - && - // Random access trait - ( Traits::memory_traits::RandomAccess != 0 ) - )>::type > -{ -public: - - using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ; - - using value_type = typename Traits::const_value_type ; - using return_type = typename Traits::const_value_type ; // NOT a reference - - using alias_type = typename std::conditional< ( sizeof(value_type) == 4 ) , int , - typename std::conditional< ( sizeof(value_type) == 8 ) , ::int2 , - typename std::conditional< ( sizeof(value_type) == 16 ) , ::int4 , void - >::type - >::type - >::type ; - -#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC ) - using handle_type = Kokkos::Experimental::Impl::CudaLDGFetch< value_type , alias_type > ; -#else - using handle_type = Kokkos::Experimental::Impl::CudaTextureFetch< value_type , alias_type > ; -#endif - - KOKKOS_INLINE_FUNCTION - static handle_type const & assign( handle_type const & arg_handle , track_type const & /* arg_tracker */ ) - { - return 
arg_handle ; - } - - KOKKOS_INLINE_FUNCTION - static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker ) - { -#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) - // Assignment of texture = non-texture requires creation of a texture object - // which can only occur on the host. In addition, 'get_record' is only valid - // if called in a host execution space - return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() ); -#else - Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel"); - return handle_type(); -#endif - } -}; - -} -} -} - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_HAVE_CUDA ) */ -#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */ - diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index a4f372d65d..8abf2292d9 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -46,6 +46,7 @@ #include #include #include +#include #include /* only compile this file if CUDA is enabled for Kokkos */ @@ -58,6 +59,11 @@ #include #include +#if (KOKKOS_ENABLE_PROFILING) +#include +#endif + + /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ @@ -65,6 +71,9 @@ namespace Kokkos { namespace Impl { namespace { + + static std::atomic num_uvm_allocations(0) ; + cudaStream_t get_deep_copy_stream() { static cudaStream_t s = 0; if( s == 0) { @@ -119,6 +128,7 @@ void CudaSpace::access_error( const void * const ) Kokkos::Impl::throw_runtime_exception( msg ); } + /*--------------------------------------------------------------------------*/ bool CudaUVMSpace::available() @@ -133,6 +143,11 @@ bool 
CudaUVMSpace::available() /*--------------------------------------------------------------------------*/ +int CudaUVMSpace::number_of_allocations() +{ + return Kokkos::Impl::num_uvm_allocations.load(); +} + } // namespace Kokkos /*--------------------------------------------------------------------------*/ @@ -167,7 +182,18 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const { void * ptr = NULL; - CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) ); + enum { max_uvm_allocations = 65536 }; + + if ( arg_alloc_size > 0 ) + { + Kokkos::Impl::num_uvm_allocations++; + + if ( Kokkos::Impl::num_uvm_allocations.load() > max_uvm_allocations ) { + Kokkos::Impl::throw_runtime_exception( "CudaUVM error: The maximum limit of UVM allocations exceeded (currently 65536)." ) ; + } + + CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) ); + } return ptr ; } @@ -191,7 +217,10 @@ void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_all void CudaUVMSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const { try { - CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) ); + if ( arg_alloc_ptr != nullptr ) { + Kokkos::Impl::num_uvm_allocations--; + CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) ); + } } catch(...) {} } @@ -202,13 +231,24 @@ void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t } catch(...) 
{} } +constexpr const char* CudaSpace::name() { + return m_name; +} + +constexpr const char* CudaUVMSpace::name() { + return m_name; +} + +constexpr const char* CudaHostPinnedSpace::name() { + return m_name; +} + } // namespace Kokkos //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { SharedAllocationRecord< void , void > @@ -335,6 +375,18 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec ) SharedAllocationRecord< Kokkos::CudaSpace , void >:: ~SharedAllocationRecord() { + #if (KOKKOS_ENABLE_PROFILING) + if(Kokkos::Profiling::profileLibraryLoaded()) { + + SharedAllocationHeader header ; + Kokkos::Impl::DeepCopy::DeepCopy( & header , RecordBase::m_alloc_ptr , sizeof(SharedAllocationHeader) ); + + Kokkos::Profiling::deallocateData( + Kokkos::Profiling::SpaceHandle(Kokkos::CudaSpace::name()),header.m_label, + data(),size()); + } + #endif + m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr , SharedAllocationRecord< void , void >::m_alloc_size ); @@ -343,6 +395,15 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >:: SharedAllocationRecord< Kokkos::CudaUVMSpace , void >:: ~SharedAllocationRecord() { + #if (KOKKOS_ENABLE_PROFILING) + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::fence(); //Make sure I can access the label ... 
+ Kokkos::Profiling::deallocateData( + Kokkos::Profiling::SpaceHandle(Kokkos::CudaUVMSpace::name()),RecordBase::m_alloc_ptr->m_label, + data(),size()); + } + #endif + m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr , SharedAllocationRecord< void , void >::m_alloc_size ); @@ -351,6 +412,14 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >:: SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >:: ~SharedAllocationRecord() { + #if (KOKKOS_ENABLE_PROFILING) + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::deallocateData( + Kokkos::Profiling::SpaceHandle(Kokkos::CudaHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label, + data(),size()); + } + #endif + m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr , SharedAllocationRecord< void , void >::m_alloc_size ); @@ -373,6 +442,12 @@ SharedAllocationRecord( const Kokkos::CudaSpace & arg_space , m_tex_obj( 0 ) , m_space( arg_space ) { + #if (KOKKOS_ENABLE_PROFILING) + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size); + } + #endif + SharedAllocationHeader header ; // Fill in the Header information @@ -404,7 +479,12 @@ SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space , m_tex_obj( 0 ) , m_space( arg_space ) { - // Fill in the Header information, directly accessible via UVM + #if (KOKKOS_ENABLE_PROFILING) + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size); + } + #endif + // Fill in the Header information, directly accessible via UVM RecordBase::m_alloc_ptr->m_record = this ; @@ -430,6 +510,11 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space ) , m_space( arg_space ) { + #if (KOKKOS_ENABLE_PROFILING) + if(Kokkos::Profiling::profileLibraryLoaded()) { + 
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size); + } + #endif // Fill in the Header information, directly accessible via UVM RecordBase::m_alloc_ptr->m_record = this ; @@ -502,6 +587,7 @@ void SharedAllocationRecord< Kokkos::CudaUVMSpace , void >:: deallocate_tracked( void * const arg_alloc_ptr ) { if ( arg_alloc_ptr != 0 ) { + SharedAllocationRecord * const r = get_record( arg_alloc_ptr ); RecordBase::decrement( r ); @@ -587,7 +673,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr RecordCuda * const record = alloc_ptr ? static_cast< RecordCuda * >( head.m_record ) : (RecordCuda *) 0 ; if ( ! alloc_ptr || record->m_alloc_ptr != head_cuda ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) ); + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) ); } #else @@ -598,7 +684,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr RecordCuda * const record = static_cast< RecordCuda * >( RecordBase::find( & s_root_record , alloc_ptr ) ); if ( record == 0 ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) ); + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) ); } #endif @@ -615,7 +701,7 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record( void * alloc_ Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ; if ( ! 
alloc_ptr || h->m_record->m_alloc_ptr != h ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) ); + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) ); } return static_cast< RecordCuda * >( h->m_record ); @@ -630,7 +716,7 @@ SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void * Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ; if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) ); + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) ); } return static_cast< RecordCuda * >( h->m_record ); @@ -728,7 +814,6 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo } } // namespace Impl -} // namespace Experimental } // namespace Kokkos /*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp index 2d8d07d077..59e79bba25 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp @@ -384,10 +384,10 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count ) const bool ok_id = 0 <= cuda_device_id && cuda_device_id < dev_info.m_cudaDevCount ; - // Need device capability 2.0 or better + // Need device capability 3.0 or better const bool ok_dev = ok_id && - ( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major && + ( 3 <= dev_info.m_cudaProp[ cuda_device_id ].major && 0 <= dev_info.m_cudaProp[ cuda_device_id ].minor ); if ( ok_init && 
ok_dev ) { @@ -444,7 +444,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count ) //---------------------------------- // Maximum number of blocks: - m_maxBlock = m_cudaArch < 300 ? 65535 : cudaProp.maxGridSize[0] ; + m_maxBlock = cudaProp.maxGridSize[0] ; //---------------------------------- @@ -495,7 +495,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count ) msg << dev_info.m_cudaProp[ cuda_device_id ].major ; msg << "." ; msg << dev_info.m_cudaProp[ cuda_device_id ].minor ; - msg << " has insufficient capability, required 2.0 or better" ; + msg << " has insufficient capability, required 3.0 or better" ; } Kokkos::Impl::throw_runtime_exception( msg.str() ); } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp index 7afa06fdf5..12a639fd44 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -95,27 +95,42 @@ private: public: -#if defined( __CUDA_ARCH__ ) - - __device__ inline + KOKKOS_INLINE_FUNCTION const execution_space::scratch_memory_space & team_shmem() const { return m_team_shared.set_team_thread_mode(0,1,0) ; } - __device__ inline + KOKKOS_INLINE_FUNCTION const execution_space::scratch_memory_space & team_scratch(const int& level) const { return m_team_shared.set_team_thread_mode(level,1,0) ; } - __device__ inline + KOKKOS_INLINE_FUNCTION const execution_space::scratch_memory_space & thread_scratch(const int& level) const { return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; } - __device__ inline int league_rank() const { return m_league_rank ; } - __device__ inline int league_size() const { return m_league_size ; } - __device__ inline int team_rank() const { return threadIdx.y ; } - __device__ inline int team_size() const { return blockDim.y ; } + KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; } + KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; } + KOKKOS_INLINE_FUNCTION int team_rank() const { + #ifdef __CUDA_ARCH__ + return threadIdx.y ; + #else + return 1; + #endif + } + KOKKOS_INLINE_FUNCTION int team_size() const { + #ifdef __CUDA_ARCH__ + return blockDim.y ; + #else + return 1; + #endif + } - __device__ inline void team_barrier() const { __syncthreads(); } + KOKKOS_INLINE_FUNCTION void team_barrier() const { + #ifdef __CUDA_ARCH__ + __syncthreads(); + #endif + } template - __device__ inline void team_broadcast(ValueType& value, const int& thread_id) const { + KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value, const int& thread_id) const { + #ifdef __CUDA_ARCH__ __shared__ ValueType sh_val; if(threadIdx.x == 0 && threadIdx.y == thread_id) { sh_val = value; @@ -123,26 +138,17 @@ public: team_barrier(); value = sh_val; 
team_barrier(); + #endif } -#ifdef KOKKOS_HAVE_CXX11 template< class ValueType, class JoinOp > - __device__ inline + KOKKOS_INLINE_FUNCTION typename JoinOp::value_type team_reduce( const ValueType & value - , const JoinOp & op_in ) const - { + , const JoinOp & op_in ) const { + #ifdef __CUDA_ARCH__ typedef JoinLambdaAdapter JoinOpFunctor ; const JoinOpFunctor op(op_in); ValueType * const base_data = (ValueType *) m_team_reduce ; -#else - template< class JoinOp > - __device__ inline - typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value - , const JoinOp & op ) const - { - typedef JoinOp JoinOpFunctor ; - typename JoinOp::value_type * const base_data = (typename JoinOp::value_type *) m_team_reduce ; -#endif __syncthreads(); // Don't write in to shared data until all threads have entered this function @@ -153,6 +159,9 @@ public: Impl::cuda_intra_block_reduce_scan( op , base_data ); return base_data[ blockDim.y - 1 ]; + #else + return typename JoinOp::value_type(); + #endif } /** \brief Intra-team exclusive prefix sum with team_rank() ordering @@ -165,8 +174,8 @@ public: * non-deterministic. */ template< typename Type > - __device__ inline Type team_scan( const Type & value , Type * const global_accum ) const - { + KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const { + #ifdef __CUDA_ARCH__ Type * const base_data = (Type *) m_team_reduce ; __syncthreads(); // Don't write in to shared data until all threads have entered this function @@ -186,6 +195,9 @@ public: } return base_data[ threadIdx.y ]; + #else + return Type(); + #endif } /** \brief Intra-team exclusive prefix sum with team_rank() ordering. 
@@ -194,13 +206,14 @@ public: * reduction_total = dev.team_scan( value ) + value ; */ template< typename Type > - __device__ inline Type team_scan( const Type & value ) const - { return this->template team_scan( value , 0 ); } + KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const { + return this->template team_scan( value , 0 ); + } //---------------------------------------- // Private for the driver - __device__ inline + KOKKOS_INLINE_FUNCTION CudaTeamMember( void * shared , const int shared_begin , const int shared_size @@ -210,51 +223,10 @@ public: , const int arg_league_size ) : m_team_reduce( shared ) , m_team_shared( ((char *)shared) + shared_begin , shared_size, scratch_level_1_ptr, scratch_level_1_size) - , m_league_rank( arg_league_rank ) - , m_league_size( arg_league_size ) + , m_league_rank( arg_league_rank ) + , m_league_size( arg_league_size ) {} -#else - - const execution_space::scratch_memory_space & team_shmem() const - { return m_team_shared.set_team_thread_mode(0, 1,0) ; } - const execution_space::scratch_memory_space & team_scratch(const int& level) const - { return m_team_shared.set_team_thread_mode(level,1,0) ; } - const execution_space::scratch_memory_space & thread_scratch(const int& level) const - { return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; } - - int league_rank() const {return 0;} - int league_size() const {return 1;} - int team_rank() const {return 0;} - int team_size() const {return 1;} - - void team_barrier() const {} - template - void team_broadcast(ValueType& value, const int& thread_id) const {} - - template< class JoinOp > - typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value - , const JoinOp & op ) const {return typename JoinOp::value_type();} - - template< typename Type > - Type team_scan( const Type & value , Type * const global_accum ) const {return Type();} - - template< typename Type > - Type team_scan( const Type & value ) const {return Type();} - - 
//---------------------------------------- - // Private for the driver - - CudaTeamMember( void * shared - , const int shared_begin - , const int shared_end - , void* scratch_level_1_ptr - , const int scratch_level_1_size - , const int arg_league_rank - , const int arg_league_size ); - -#endif /* #if ! defined( __CUDA_ARCH__ ) */ - }; } // namespace Impl @@ -356,7 +328,7 @@ public: , m_vector_length( 0 ) , m_team_scratch_size {0,0} , m_thread_scratch_size {0,0} - , m_chunk_size ( 32 ) + , m_chunk_size ( 32 ) {} /** \brief Specify league size, request team size */ @@ -508,7 +480,7 @@ private: typedef typename Policy::work_tag WorkTag ; const FunctorType m_functor ; - const Policy m_policy ; + const Policy m_policy ; ParallelFor() = delete ; ParallelFor & operator = ( const ParallelFor & ) = delete ; @@ -638,8 +610,8 @@ public: } - ParallelFor( const FunctorType & arg_functor - , const Policy & arg_policy + ParallelFor( const FunctorType & arg_functor + , const Policy & arg_policy ) : m_functor( arg_functor ) , m_league_size( arg_policy.league_size() ) @@ -680,7 +652,7 @@ template< class FunctorType , class ReducerType, class ... Traits > class ParallelReduce< FunctorType , Kokkos::RangePolicy< Traits ... 
> , ReducerType - , Kokkos::Cuda + , Kokkos::Cuda > { private: @@ -835,23 +807,22 @@ public: const int nwork = m_policy.end() - m_policy.begin(); if ( nwork ) { const int block_size = local_block_size( m_functor ); - + m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ ); m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) ); m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) ); - + // REQUIRED ( 1 , N , 1 ) const dim3 block( 1 , block_size , 1 ); // Required grid.x <= block.y const dim3 grid( std::min( int(block.y) , int( ( nwork + block.y - 1 ) / block.y ) ) , 1 , 1 ); - + const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem( m_functor , block.y ); - CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute - + Cuda::fence(); - + if ( m_result_ptr ) { if ( m_unified_space ) { const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); @@ -871,8 +842,8 @@ public: } template< class HostViewType > - ParallelReduce( const FunctorType & arg_functor - , const Policy & arg_policy + ParallelReduce( const FunctorType & arg_functor + , const Policy & arg_policy , const HostViewType & arg_result , typename std::enable_if< Kokkos::is_view< HostViewType >::value @@ -925,7 +896,6 @@ private: typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::value_type value_type ; - public: typedef FunctorType functor_type ; @@ -937,7 +907,6 @@ private: typedef double DummyShflReductionType; typedef int DummySHMEMReductionType; - // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == blockDim.z == 1 // shared memory utilization: // @@ -1058,36 +1027,44 @@ public: inline void execute() { - const int block_count = 
UseShflReduction? std::min( m_league_size , size_type(1024) ) - :std::min( m_league_size , m_team_size ); + const int nwork = m_league_size * m_team_size ; + if ( nwork ) { + const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024) ) + :std::min( m_league_size , m_team_size ); - m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count ); - m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) ); - m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) ); + m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count ); + m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) ); + m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) ); - const dim3 block( m_vector_size , m_team_size , 1 ); - const dim3 grid( block_count , 1 , 1 ); - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ; + const dim3 block( m_vector_size , m_team_size , 1 ); + const dim3 grid( block_count , 1 , 1 ); + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ; - CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute + CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute - Cuda::fence(); + Cuda::fence(); - if ( m_result_ptr ) { - if ( m_unified_space ) { - const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); - for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; } + if ( m_result_ptr ) { + if ( m_unified_space ) { + const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); + for ( 
int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; } + } + else { + const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ); + DeepCopy( m_result_ptr, m_scratch_space, size ); + } } - else { - const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ); - DeepCopy( m_result_ptr, m_scratch_space, size ); + } + else { + if (m_result_ptr) { + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr ); } } } template< class HostViewType > - ParallelReduce( const FunctorType & arg_functor - , const Policy & arg_policy + ParallelReduce( const FunctorType & arg_functor + , const Policy & arg_policy , const HostViewType & arg_result , typename std::enable_if< Kokkos::is_view< HostViewType >::value @@ -1106,9 +1083,18 @@ public: , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() : Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / - arg_policy.vector_length() ) + arg_policy.vector_length() ) , m_vector_size( arg_policy.vector_length() ) - , m_scratch_size{arg_policy.scratch_size(0,m_team_size),arg_policy.scratch_size(1,m_team_size)} + , m_scratch_size{ + arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() : + Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(), + arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / + arg_policy.vector_length() ) + ), arg_policy.scratch_size(1,( 0 <= arg_policy.team_size() ? 
arg_policy.team_size() : + Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(), + arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / + arg_policy.vector_length() ) + )} { // Return Init value if the number of worksets is zero if( arg_policy.league_size() == 0) { @@ -1342,7 +1328,7 @@ private: } // Scan block values into locations shared_data[1..blockDim.y] - cuda_intra_block_reduce_scan( m_functor , ValueTraits::pointer_type(shared_data+word_count.value) ); + cuda_intra_block_reduce_scan( m_functor , typename ValueTraits::pointer_type(shared_data+word_count.value) ); { size_type * const block_total = shared_data + word_count.value * blockDim.y ; @@ -1391,32 +1377,32 @@ public: const int nwork = m_policy.end() - m_policy.begin(); if ( nwork ) { enum { GridMaxComputeCapability_2x = 0x0ffff }; - + const int block_size = local_block_size( m_functor ); - + const int grid_max = ( block_size * block_size ) < GridMaxComputeCapability_2x ? 
( block_size * block_size ) : GridMaxComputeCapability_2x ; - + // At most 'max_grid' blocks: const int max_grid = std::min( int(grid_max) , int(( nwork + block_size - 1 ) / block_size )); - + // How much work per block: const int work_per_block = ( nwork + max_grid - 1 ) / max_grid ; - + // How many block are really needed for this much work: const int grid_x = ( nwork + work_per_block - 1 ) / work_per_block ; - + m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( m_functor ) * grid_x ); m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 ); - + const dim3 grid( grid_x , 1 , 1 ); const dim3 block( 1 , block_size , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 ) const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 ); - + m_final = false ; CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute - + m_final = true ; CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute } @@ -1490,18 +1476,30 @@ namespace Impl { #ifdef __CUDA_ARCH__ __device__ inline - ThreadVectorRangeBoundariesStruct (const CudaTeamMember& thread, const iType& count): + ThreadVectorRangeBoundariesStruct (const CudaTeamMember, const iType& count): start( threadIdx.x ), end( count ), increment( blockDim.x ) {} + __device__ inline + ThreadVectorRangeBoundariesStruct (const iType& count): + start( threadIdx.x ), + end( count ), + increment( blockDim.x ) + {} #else KOKKOS_INLINE_FUNCTION - ThreadVectorRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count): + ThreadVectorRangeBoundariesStruct (const CudaTeamMember, const iType& count): start( 0 ), end( count ), increment( 1 ) {} + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct (const iType& count): + start( 0 ), + end( count ), + increment( 1 ) + {} #endif }; @@ -1509,22 +1507,24 @@ namespace Impl { template KOKKOS_INLINE_FUNCTION -Impl::TeamThreadRangeBoundariesStruct - TeamThreadRange(const 
Impl::CudaTeamMember& thread, const iType& count) { - return Impl::TeamThreadRangeBoundariesStruct(thread,count); +Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember > +TeamThreadRange( const Impl::CudaTeamMember & thread, const iType & count ) { + return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, count ); } -template +template< typename iType1, typename iType2 > KOKKOS_INLINE_FUNCTION -Impl::TeamThreadRangeBoundariesStruct - TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& begin, const iType& end) { - return Impl::TeamThreadRangeBoundariesStruct(thread,begin,end); +Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type, + Impl::CudaTeamMember > +TeamThreadRange( const Impl::CudaTeamMember & thread, const iType1 & begin, const iType2 & end ) { + typedef typename std::common_type< iType1, iType2 >::type iType; + return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, iType(begin), iType(end) ); } template KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct - ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) { +ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) { return Impl::ThreadVectorRangeBoundariesStruct(thread,count); } @@ -1571,9 +1571,10 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct::pointer_type const result, Cuda::size_type * const m_scratch_flags, const int max_active_thread = blockDim.y) { +#ifdef __CUDA_ARCH__ typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type; typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type; @@ -213,6 +214,9 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT //The last block has in its thread=0 the global reduction value through "value" return last_block; +#else + return true; +#endif } 
//---------------------------------------------------------------------------- @@ -290,10 +294,10 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor , if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ; - BLOCK_SCAN_STEP(tdata_inter,n,8) - BLOCK_SCAN_STEP(tdata_inter,n,7) - BLOCK_SCAN_STEP(tdata_inter,n,6) - BLOCK_SCAN_STEP(tdata_inter,n,5) + __threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,8) + __threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,7) + __threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,6) + __threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,5) } } } @@ -308,12 +312,19 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor , ( rtid_intra & 16 ) ? 16 : 0 )))); if ( ! ( rtid_intra + n < blockDim.y ) ) n = 0 ; - + #ifdef KOKKOS_CUDA_CLANG_WORKAROUND + BLOCK_SCAN_STEP(tdata_intra,n,4) __syncthreads();//__threadfence_block(); + BLOCK_SCAN_STEP(tdata_intra,n,3) __syncthreads();//__threadfence_block(); + BLOCK_SCAN_STEP(tdata_intra,n,2) __syncthreads();//__threadfence_block(); + BLOCK_SCAN_STEP(tdata_intra,n,1) __syncthreads();//__threadfence_block(); + BLOCK_SCAN_STEP(tdata_intra,n,0) __syncthreads(); + #else BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block(); BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block(); BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block(); BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block(); - BLOCK_SCAN_STEP(tdata_intra,n,0) + BLOCK_SCAN_STEP(tdata_intra,n,0) __threadfence_block(); + #endif } #undef BLOCK_SCAN_STEP diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp index 701d267e1b..d56de5db60 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp @@ -43,7 +43,7 @@ #include -#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) +#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG ) #include @@ -174,6 +174,6 @@ printf("cuda_task_queue_execute 
after\n"); //---------------------------------------------------------------------------- -#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */ +#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG ) */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index 9d9347cc8d..479294f307 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -44,7 +44,7 @@ #ifndef KOKKOS_IMPL_CUDA_TASK_HPP #define KOKKOS_IMPL_CUDA_TASK_HPP -#if defined( KOKKOS_ENABLE_TASKPOLICY ) +#if defined( KOKKOS_ENABLE_TASKDAG ) //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -99,7 +99,7 @@ public: extern template class TaskQueue< Kokkos::Cuda > ; //---------------------------------------------------------------------------- -/**\brief Impl::TaskExec is the TaskPolicy::member_type +/**\brief Impl::TaskExec is the TaskScheduler::member_type * passed to tasks running in a Cuda space. 
* * Cuda thread blocks for tasking are dimensioned: @@ -234,19 +234,23 @@ namespace Kokkos { template KOKKOS_INLINE_FUNCTION -Impl::TeamThreadRangeBoundariesStruct > -TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread - , const iType & count ) +Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > > +TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & count ) { - return Impl::TeamThreadRangeBoundariesStruct >(thread,count); + return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >( thread, count ); } -template +template KOKKOS_INLINE_FUNCTION -Impl::TeamThreadRangeBoundariesStruct > -TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & start , const iType & end ) +Impl::TeamThreadRangeBoundariesStruct + < typename std::common_type::type + , Impl::TaskExec< Kokkos::Cuda > > +TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread + , const iType1 & begin, const iType2 & end ) { - return Impl::TeamThreadRangeBoundariesStruct >(thread,start,end); + typedef typename std::common_type< iType1, iType2 >::type iType; + return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >( + thread, iType(begin), iType(end) ); } template @@ -315,7 +319,7 @@ ValueType shfl_warp_broadcast } // all-reduce across corresponding vector lanes between team members within warp -// assume vec_length*team_size == warp_size +// assume vec_length*team_size == warp_size // blockDim.x == vec_length == stride // blockDim.y == team_size // threadIdx.x == position in vec @@ -344,7 +348,7 @@ void parallel_reduce // all-reduce across corresponding vector lanes between team members within warp // if no join() provided, use sum -// assume vec_length*team_size == warp_size +// assume vec_length*team_size == warp_size // blockDim.x == vec_length == stride // blockDim.y == team_size // threadIdx.x == position in vec @@ -372,7 +376,7 @@ void parallel_reduce } // 
all-reduce within team members within warp -// assume vec_length*team_size == warp_size +// assume vec_length*team_size == warp_size // blockDim.x == vec_length == stride // blockDim.y == team_size // threadIdx.x == position in vec @@ -397,7 +401,7 @@ void parallel_reduce // all-reduce within team members within warp // if no join() provided, use sum -// assume vec_length*team_size == warp_size +// assume vec_length*team_size == warp_size // blockDim.x == vec_length == stride // blockDim.y == team_size // threadIdx.x == position in vec @@ -426,7 +430,7 @@ void parallel_reduce } // scan across corresponding vector lanes between team members within warp -// assume vec_length*team_size == warp_size +// assume vec_length*team_size == warp_size // blockDim.x == vec_length == stride // blockDim.y == team_size // threadIdx.x == position in vec @@ -469,7 +473,7 @@ void parallel_scan } // scan within team member (vector) within warp -// assume vec_length*team_size == warp_size +// assume vec_length*team_size == warp_size // blockDim.x == vec_length == stride // blockDim.y == team_size // threadIdx.x == position in vec @@ -514,6 +518,6 @@ void parallel_scan //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ #endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp deleted file mode 100644 index bb3cd2640d..0000000000 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp +++ /dev/null @@ -1,932 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. 
Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -// Experimental unified task-data parallel manycore LDRD - -#include -#include -#include -#include -#include - -#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) - -// #define DETAILED_PRINT - -//---------------------------------------------------------------------------- - -#define QLOCK reinterpret_cast( ~((uintptr_t)0) ) -#define QDENIED reinterpret_cast( ~((uintptr_t)0) - 1 ) - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -void CudaTaskPolicyQueue::Destroy::destroy_shared_allocation() -{ - // Verify the queue is empty - - if ( m_policy->m_count_ready || - m_policy->m_team[0] || - m_policy->m_team[1] || - m_policy->m_team[2] || - m_policy->m_serial[0] || - m_policy->m_serial[1] || - m_policy->m_serial[2] ) { - Kokkos::abort("CudaTaskPolicyQueue ERROR : Attempt to destroy non-empty queue" ); - } - - m_policy->~CudaTaskPolicyQueue(); - - Kokkos::Cuda::fence(); -} - -CudaTaskPolicyQueue:: -~CudaTaskPolicyQueue() -{ -} - -CudaTaskPolicyQueue:: -CudaTaskPolicyQueue - ( const unsigned arg_task_max_count - , const unsigned arg_task_max_size - , const unsigned arg_task_default_dependence_capacity - , const unsigned arg_team_size - ) - : m_space( Kokkos::CudaUVMSpace() - , arg_task_max_size * arg_task_max_count * 1.2 - , 16 /* log2(superblock size) */ - ) - , m_team { 0 , 0 , 0 } - , m_serial { 0 , 0 , 0 } - , m_team_size( 32 /* 1 warps */ ) - , m_default_dependence_capacity( arg_task_default_dependence_capacity ) - , m_count_ready(0) -{ - constexpr int max_team_size = 32 * 16 /* 16 warps */ ; - - const int target_team_size = - std::min( int(arg_team_size) , max_team_size ); - - while ( m_team_size < target_team_size ) { m_team_size *= 2 ; } -} - -//----------------------------------------------------------------------- -// Called by each block & thread - -__device__ -void 
Kokkos::Experimental::Impl::CudaTaskPolicyQueue::driver() -{ - task_root_type * const q_denied = reinterpret_cast(QDENIED); - -#define IS_TEAM_LEAD ( threadIdx.x == 0 && threadIdx.y == 0 ) - -#ifdef DETAILED_PRINT -if ( IS_TEAM_LEAD ) { - printf( "CudaTaskPolicyQueue::driver() begin on %d with count %d\n" - , blockIdx.x , m_count_ready ); -} -#endif - - // Each thread block must iterate this loop synchronously - // to insure team-execution of team-task - - __shared__ task_root_type * team_task ; - - __syncthreads(); - - do { - - if ( IS_TEAM_LEAD ) { - if ( 0 == m_count_ready ) { - team_task = q_denied ; // All queues are empty and no running tasks - } - else { - team_task = 0 ; - for ( int i = 0 ; i < int(NPRIORITY) && 0 == team_task ; ++i ) { - if ( ( i < 2 /* regular queue */ ) - || ( ! m_space.is_empty() /* waiting for memory */ ) ) { - team_task = pop_ready_task( & m_team[i] ); - } - } - } - } - - __syncthreads(); - -#ifdef DETAILED_PRINT -if ( IS_TEAM_LEAD && 0 != team_task ) { - printf( "CudaTaskPolicyQueue::driver() (%d) team_task(0x%lx)\n" - , blockIdx.x - , (unsigned long) team_task ); -} -#endif - - // team_task == q_denied if all queues are empty - // team_task == 0 if no team tasks available - - if ( q_denied != team_task ) { - if ( 0 != team_task ) { - - Kokkos::Impl::CudaTeamMember - member( kokkos_impl_cuda_shared_memory() - , 16 /* shared_begin */ - , team_task->m_shmem_size /* shared size */ - , 0 /* scratch level 1 pointer */ - , 0 /* scratch level 1 size */ - , 0 /* league rank */ - , 1 /* league size */ - ); - - (*team_task->m_team)( team_task , member ); - - // A __synthreads was called and if completed the - // functor was destroyed. 
- - if ( IS_TEAM_LEAD ) { - complete_executed_task( team_task ); - } - } - else { - // One thread of one warp performs this serial task - if ( threadIdx.x == 0 && - 0 == ( threadIdx.y % 32 ) ) { - task_root_type * task = 0 ; - for ( int i = 0 ; i < int(NPRIORITY) && 0 == task ; ++i ) { - if ( ( i < 2 /* regular queue */ ) - || ( ! m_space.is_empty() /* waiting for memory */ ) ) { - task = pop_ready_task( & m_serial[i] ); - } - } - -#ifdef DETAILED_PRINT -if ( 0 != task ) { - printf( "CudaTaskPolicyQueue::driver() (%2d)(%d) single task(0x%lx)\n" - , blockIdx.x - , threadIdx.y - , (unsigned long) task ); -} -#endif - - if ( task ) { - (*task->m_serial)( task ); - complete_executed_task( task ); - } - } - - __syncthreads(); - } - } - } while ( q_denied != team_task ); - -#ifdef DETAILED_PRINT -if ( IS_TEAM_LEAD ) { - printf( "CudaTaskPolicyQueue::driver() end on %d with count %d\n" - , blockIdx.x , m_count_ready ); -} -#endif - -#undef IS_TEAM_LEAD -} - -//----------------------------------------------------------------------- - -__device__ -CudaTaskPolicyQueue::task_root_type * -CudaTaskPolicyQueue::pop_ready_task( - CudaTaskPolicyQueue::task_root_type * volatile * const queue ) -{ - task_root_type * const q_lock = reinterpret_cast(QLOCK); - task_root_type * task = 0 ; - task_root_type * const task_claim = *queue ; - - if ( ( q_lock != task_claim ) && ( 0 != task_claim ) ) { - - // Queue is not locked and not null, try to claim head of queue. - // Is a race among threads to claim the queue. - - if ( task_claim == atomic_compare_exchange(queue,task_claim,q_lock) ) { - - // Aquired the task which must be in the waiting state. 
- - const int claim_state = - atomic_compare_exchange( & task_claim->m_state - , int(TASK_STATE_WAITING) - , int(TASK_STATE_EXECUTING) ); - - task_root_type * lock_verify = 0 ; - - if ( claim_state == int(TASK_STATE_WAITING) ) { - - // Transitioned this task from waiting to executing - // Update the queue to the next entry and release the lock - - task_root_type * const next = - *((task_root_type * volatile *) & task_claim->m_next ); - - *((task_root_type * volatile *) & task_claim->m_next ) = 0 ; - - lock_verify = atomic_compare_exchange( queue , q_lock , next ); - } - - if ( ( claim_state != int(TASK_STATE_WAITING) ) | - ( q_lock != lock_verify ) ) { - - printf( "CudaTaskPolicyQueue::pop_ready_task(0x%lx) task(0x%lx) state(%d) ERROR %s\n" - , (unsigned long) queue - , (unsigned long) task - , claim_state - , ( claim_state != int(TASK_STATE_WAITING) - ? "NOT WAITING" - : "UNLOCK" ) ); - Kokkos::abort("CudaTaskPolicyQueue::pop_ready_task"); - } - - task = task_claim ; - } - } - return task ; -} - -//----------------------------------------------------------------------- - -__device__ -void CudaTaskPolicyQueue::complete_executed_task( - CudaTaskPolicyQueue::task_root_type * task ) -{ - task_root_type * const q_denied = reinterpret_cast(QDENIED); - - -#ifdef DETAILED_PRINT -printf( "CudaTaskPolicyQueue::complete_executed_task(0x%lx) state(%d) (%d)(%d,%d)\n" - , (unsigned long) task - , task->m_state - , blockIdx.x - , threadIdx.x - , threadIdx.y - ); -#endif - - // State is either executing or if respawned then waiting, - // try to transition from executing to complete. - // Reads the current value. 
- - const int state_old = - atomic_compare_exchange( & task->m_state - , int(Kokkos::Experimental::TASK_STATE_EXECUTING) - , int(Kokkos::Experimental::TASK_STATE_COMPLETE) ); - - if ( int(Kokkos::Experimental::TASK_STATE_WAITING) == state_old ) { - /* Task requested a respawn so reschedule it */ - schedule_task( task , false /* not initial spawn */ ); - } - else if ( int(Kokkos::Experimental::TASK_STATE_EXECUTING) == state_old ) { - /* Task is complete */ - - // Clear dependences of this task before locking wait queue - - task->clear_dependence(); - - // Stop other tasks from adding themselves to this task's wait queue. - // The wait queue is updated concurrently so guard with an atomic. - - task_root_type * wait_queue = *((task_root_type * volatile *) & task->m_wait ); - task_root_type * wait_queue_old = 0 ; - - do { - wait_queue_old = wait_queue ; - wait_queue = atomic_compare_exchange( & task->m_wait , wait_queue_old , q_denied ); - } while ( wait_queue_old != wait_queue ); - - // The task has been removed from ready queue and - // execution is complete so decrement the reference count. - // The reference count was incremented by the initial spawning. - // The task may be deleted if this was the last reference. - - task_root_type::assign( & task , 0 ); - - // Pop waiting tasks and schedule them - while ( wait_queue ) { - task_root_type * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ; - schedule_task( x , false /* not initial spawn */ ); - } - } - else { - printf( "CudaTaskPolicyQueue::complete_executed_task(0x%lx) ERROR state_old(%d) dep_size(%d)\n" - , (unsigned long)( task ) - , int(state_old) - , task->m_dep_size - ); - Kokkos::abort("CudaTaskPolicyQueue::complete_executed_task" ); - } - - // If the task was respawned it may have already been - // put in a ready queue and the count incremented. - // By decrementing the count last it will never go to zero - // with a ready or executing task. 
- - atomic_fetch_add( & m_count_ready , -1 ); -} - -__device__ -void TaskMember< Kokkos::Cuda , void , void >::latch_add( const int k ) -{ - typedef TaskMember< Kokkos::Cuda , void , void > task_root_type ; - - task_root_type * const q_denied = reinterpret_cast(QDENIED); - - const bool ok_input = 0 < k ; - - const int count = ok_input ? atomic_fetch_add( & m_dep_size , -k ) - k - : k ; - - const bool ok_count = 0 <= count ; - - const int state = 0 != count ? TASK_STATE_WAITING : - atomic_compare_exchange( & m_state - , TASK_STATE_WAITING - , TASK_STATE_COMPLETE ); - - const bool ok_state = state == TASK_STATE_WAITING ; - - if ( ! ok_count || ! ok_state ) { - printf( "CudaTaskPolicyQueue::latch_add[0x%lx](%d) ERROR %s %d\n" - , (unsigned long) this - , k - , ( ! ok_input ? "Non-positive input" : - ( ! ok_count ? "Negative count" : "Bad State" ) ) - , ( ! ok_input ? k : - ( ! ok_count ? count : state ) ) - ); - Kokkos::abort( "CudaTaskPolicyQueue::latch_add ERROR" ); - } - else if ( 0 == count ) { - // Stop other tasks from adding themselves to this latch's wait queue. - // The wait queue is updated concurrently so guard with an atomic. - - CudaTaskPolicyQueue & policy = *m_policy ; - task_root_type * wait_queue = *((task_root_type * volatile *) &m_wait); - task_root_type * wait_queue_old = 0 ; - - do { - wait_queue_old = wait_queue ; - wait_queue = atomic_compare_exchange( & m_wait , wait_queue_old , q_denied ); - } while ( wait_queue_old != wait_queue ); - - // Pop waiting tasks and schedule them - while ( wait_queue ) { - task_root_type * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ; - policy.schedule_task( x , false /* not initial spawn */ ); - } - } -} - -//---------------------------------------------------------------------------- - -void CudaTaskPolicyQueue::reschedule_task( - CudaTaskPolicyQueue::task_root_type * const task ) -{ - // Reschedule transitions from executing back to waiting. 
- const int old_state = - atomic_compare_exchange( & task->m_state - , int(TASK_STATE_EXECUTING) - , int(TASK_STATE_WAITING) ); - - if ( old_state != int(TASK_STATE_EXECUTING) ) { - - printf( "CudaTaskPolicyQueue::reschedule_task(0x%lx) ERROR state(%d)\n" - , (unsigned long) task - , old_state - ); - Kokkos::abort("CudaTaskPolicyQueue::reschedule" ); - } -} - -KOKKOS_FUNCTION -void CudaTaskPolicyQueue::schedule_task( - CudaTaskPolicyQueue::task_root_type * const task , - const bool initial_spawn ) -{ - task_root_type * const q_lock = reinterpret_cast(QLOCK); - task_root_type * const q_denied = reinterpret_cast(QDENIED); - - //---------------------------------------- - // State is either constructing or already waiting. - // If constructing then transition to waiting. - - { - const int old_state = atomic_compare_exchange( & task->m_state - , int(TASK_STATE_CONSTRUCTING) - , int(TASK_STATE_WAITING) ); - - // Head of linked list of tasks waiting on this task - task_root_type * const waitTask = - *((task_root_type * volatile const *) & task->m_wait ); - - // Member of linked list of tasks waiting on some other task - task_root_type * const next = - *((task_root_type * volatile const *) & task->m_next ); - - // An incomplete and non-executing task has: - // task->m_state == TASK_STATE_CONSTRUCTING or TASK_STATE_WAITING - // task->m_wait != q_denied - // task->m_next == 0 - // - if ( ( q_denied == waitTask ) || - ( 0 != next ) || - ( old_state != int(TASK_STATE_CONSTRUCTING) && - old_state != int(TASK_STATE_WAITING) ) ) { - printf( "CudaTaskPolicyQueue::schedule_task(0x%lx) STATE ERROR: state(%d) wait(0x%lx) next(0x%lx)\n" - , (unsigned long) task - , old_state - , (unsigned long) waitTask - , (unsigned long) next ); - Kokkos::abort("CudaTaskPolicyQueue::schedule" ); - } - } - - //---------------------------------------- - - if ( initial_spawn ) { - // The initial spawn of a task increments the reference count - // for the task's existence in either a waiting or ready 
queue - // until the task has completed. - // Completing the task's execution is the matching - // decrement of the reference count. - task_root_type::assign( 0 , task ); - } - - //---------------------------------------- - // Insert this task into a dependence task that is not complete. - // Push on to that task's wait queue. - - bool attempt_insert_in_queue = true ; - - task_root_type * volatile * queue = - task->m_dep_size ? & task->m_dep[0]->m_wait : (task_root_type **) 0 ; - - for ( int i = 0 ; attempt_insert_in_queue && ( 0 != queue ) ; ) { - - task_root_type * const head_value_old = *queue ; - - if ( q_denied == head_value_old ) { - // Wait queue is closed because task is complete, - // try again with the next dependence wait queue. - ++i ; - queue = i < task->m_dep_size ? & task->m_dep[i]->m_wait - : (task_root_type **) 0 ; - } - else { - - // Wait queue is open and not denied. - // Have exclusive access to this task. - // Assign m_next assuming a successfull insertion into the queue. - // Fence the memory assignment before attempting the CAS. - - *((task_root_type * volatile *) & task->m_next ) = head_value_old ; - - memory_fence(); - - // Attempt to insert this task into the queue. - // If fails then continue the attempt. - - attempt_insert_in_queue = - head_value_old != atomic_compare_exchange(queue,head_value_old,task); - } - } - - //---------------------------------------- - // All dependences are complete, insert into the ready list - - if ( attempt_insert_in_queue ) { - - // Increment the count of ready tasks. - // Count will be decremented when task is complete. - - atomic_fetch_add( & m_count_ready , 1 ); - - queue = task->m_queue ; - - while ( attempt_insert_in_queue ) { - - // A locked queue is being popped. 
- - task_root_type * const head_value_old = *queue ; - - if ( q_lock != head_value_old ) { - // Read the head of ready queue, - // if same as previous value then CAS locks the ready queue - - // Have exclusive access to this task, - // assign to head of queue, assuming successful insert - // Fence assignment before attempting insert. - *((task_root_type * volatile *) & task->m_next ) = head_value_old ; - - memory_fence(); - - attempt_insert_in_queue = - head_value_old != atomic_compare_exchange(queue,head_value_old,task); - } - } - } -} - -void CudaTaskPolicyQueue::deallocate_task - ( CudaTaskPolicyQueue::task_root_type * const task ) -{ - m_space.deallocate( task , task->m_size_alloc ); -} - -KOKKOS_FUNCTION -CudaTaskPolicyQueue::task_root_type * -CudaTaskPolicyQueue::allocate_task - ( const unsigned arg_sizeof_task - , const unsigned arg_dep_capacity - , const unsigned arg_team_shmem - ) -{ - const unsigned base_size = arg_sizeof_task + - ( arg_sizeof_task % sizeof(task_root_type*) - ? sizeof(task_root_type*) - arg_sizeof_task % sizeof(task_root_type*) - : 0 ); - - const unsigned dep_capacity - = ~0u == arg_dep_capacity - ? m_default_dependence_capacity - : arg_dep_capacity ; - - const unsigned size_alloc = - base_size + sizeof(task_root_type*) * dep_capacity ; - - task_root_type * const task = - reinterpret_cast( m_space.allocate( size_alloc ) ); - - if ( task != 0 ) { - - // Initialize task's root and value data structure - // Calling function must copy construct the functor. 
- - new( (void*) task ) task_root_type(); - - task->m_policy = this ; - task->m_size_alloc = size_alloc ; - task->m_dep_capacity = dep_capacity ; - task->m_shmem_size = arg_team_shmem ; - - if ( dep_capacity ) { - task->m_dep = - reinterpret_cast( - reinterpret_cast(task) + base_size ); - - for ( unsigned i = 0 ; i < dep_capacity ; ++i ) - task->task_root_type::m_dep[i] = 0 ; - } - } - return task ; -} - -//---------------------------------------------------------------------------- - -void CudaTaskPolicyQueue::add_dependence - ( CudaTaskPolicyQueue::task_root_type * const after - , CudaTaskPolicyQueue::task_root_type * const before - ) -{ - if ( ( after != 0 ) && ( before != 0 ) ) { - - int const state = *((volatile const int *) & after->m_state ); - - // Only add dependence during construction or during execution. - // Both tasks must have the same policy. - // Dependence on non-full memory cannot be mixed with any other dependence. - - const bool ok_state = - Kokkos::Experimental::TASK_STATE_CONSTRUCTING == state || - Kokkos::Experimental::TASK_STATE_EXECUTING == state ; - - const bool ok_capacity = - after->m_dep_size < after->m_dep_capacity ; - - const bool ok_policy = - after->m_policy == this && before->m_policy == this ; - - if ( ok_state && ok_capacity && ok_policy ) { - - ++after->m_dep_size ; - - task_root_type::assign( after->m_dep + (after->m_dep_size-1) , before ); - - memory_fence(); - } - else { - -printf( "CudaTaskPolicyQueue::add_dependence( 0x%lx , 0x%lx ) ERROR %s\n" - , (unsigned long) after - , (unsigned long) before - , ( ! ok_state ? "Task not constructing or executing" : - ( ! ok_capacity ? 
"Task Exceeded dependence capacity" - : "Tasks from different policies" )) ); - - Kokkos::abort("CudaTaskPolicyQueue::add_dependence ERROR"); - } - } -} - -} /* namespace Impl */ -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { - -TaskPolicy< Kokkos::Cuda >::TaskPolicy - ( const unsigned arg_task_max_count - , const unsigned arg_task_max_size - , const unsigned arg_task_default_dependence_capacity - , const unsigned arg_task_team_size - ) - : m_track() - , m_policy(0) -{ - // Allocate the queue data sructure in UVM space - - typedef Kokkos::Experimental::Impl::SharedAllocationRecord - < Kokkos::CudaUVMSpace , Impl::CudaTaskPolicyQueue::Destroy > record_type ; - - record_type * record = - record_type::allocate( Kokkos::CudaUVMSpace() - , "CudaUVM task queue" - , sizeof(Impl::CudaTaskPolicyQueue) - ); - - m_policy = reinterpret_cast< Impl::CudaTaskPolicyQueue * >( record->data() ); - - // Tasks are allocated with application's task size + sizeof(task_root_type) - - const size_t full_task_size_estimate = - arg_task_max_size + - sizeof(task_root_type) + - sizeof(task_root_type*) * arg_task_default_dependence_capacity ; - - new( m_policy ) - Impl::CudaTaskPolicyQueue( arg_task_max_count - , full_task_size_estimate - , arg_task_default_dependence_capacity - , arg_task_team_size ); - - record->m_destroy.m_policy = m_policy ; - - m_track.assign_allocated_record_to_uninitialized( record ); -} - -__global__ -static void kokkos_cuda_task_policy_queue_driver - ( Kokkos::Experimental::Impl::CudaTaskPolicyQueue * queue ) -{ - queue->driver(); -} - -void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Cuda > & policy ) -{ - const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 ); - const dim3 block( 1 , policy.m_policy->m_team_size , 1 ); - - 
const int shared = 0 ; // Kokkos::Impl::CudaTraits::SharedMemoryUsage / 2 ; - const cudaStream_t stream = 0 ; - - -#ifdef DETAILED_PRINT -printf("kokkos_cuda_task_policy_queue_driver grid(%d,%d,%d) block(%d,%d,%d) shared(%d) policy(0x%lx)\n" - , grid.x , grid.y , grid.z - , block.x , block.y , block.z - , shared - , (unsigned long)( policy.m_policy ) ); -fflush(stdout); -#endif - - CUDA_SAFE_CALL( cudaDeviceSynchronize() ); - -/* - CUDA_SAFE_CALL( - cudaFuncSetCacheConfig( kokkos_cuda_task_policy_queue_driver - , cudaFuncCachePreferL1 ) ); - - CUDA_SAFE_CALL( cudaGetLastError() ); -*/ - - kokkos_cuda_task_policy_queue_driver<<< grid , block , shared , stream >>> - ( policy.m_policy ); - - CUDA_SAFE_CALL( cudaGetLastError() ); - - CUDA_SAFE_CALL( cudaDeviceSynchronize() ); - -#ifdef DETAILED_PRINT -printf("kokkos_cuda_task_policy_queue_driver end\n"); -fflush(stdout); -#endif - -} - -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -typedef TaskMember< Kokkos::Cuda , void , void > Task ; - -__host__ __device__ -Task::~TaskMember() -{ -} - -__host__ __device__ -void Task::assign( Task ** const lhs_ptr , Task * rhs ) -{ - Task * const q_denied = reinterpret_cast(QDENIED); - - // Increment rhs reference count. - if ( rhs ) { atomic_fetch_add( & rhs->m_ref_count , 1 ); } - - if ( 0 == lhs_ptr ) return ; - - // Must have exclusive access to *lhs_ptr. - // Assign the pointer and retrieve the previous value. - // Cannot use atomic exchange since *lhs_ptr may be - // in Cuda register space. 
- -#if 0 - - Task * const old_lhs = *((Task*volatile*)lhs_ptr); - - *((Task*volatile*)lhs_ptr) = rhs ; - - Kokkos::memory_fence(); - -#else - - Task * const old_lhs = *lhs_ptr ; - - *lhs_ptr = rhs ; - -#endif - - if ( old_lhs && rhs && old_lhs->m_policy != rhs->m_policy ) { - Kokkos::abort( "Kokkos::Impl::TaskMember::assign ERROR different queues"); - } - - if ( old_lhs ) { - - Kokkos::memory_fence(); - - // Decrement former lhs reference count. - // If reference count is zero task must be complete, then delete task. - // Task is ready for deletion when wait == q_denied - - int const count = atomic_fetch_add( & (old_lhs->m_ref_count) , -1 ) - 1 ; - int const state = old_lhs->m_state ; - Task * const wait = *((Task * const volatile *) & old_lhs->m_wait ); - - const bool ok_count = 0 <= count ; - - // If count == 0 then will be deleting - // and must either be constructing or complete. - const bool ok_state = 0 < count ? true : - ( ( state == int(TASK_STATE_CONSTRUCTING) && wait == 0 ) || - ( state == int(TASK_STATE_COMPLETE) && wait == q_denied ) ) - && - old_lhs->m_next == 0 && - old_lhs->m_dep_size == 0 ; - - if ( ! ok_count || ! ok_state ) { - - printf( "%s Kokkos::Impl::TaskManager::assign ERROR deleting task(0x%lx) m_ref_count(%d) m_state(%d) m_wait(0x%ld)\n" -#if defined( KOKKOS_ACTIVE_EXECUTION_SPACE_CUDA ) - , "CUDA " -#else - , "HOST " -#endif - , (unsigned long) old_lhs - , count - , state - , (unsigned long) wait ); - Kokkos::abort( "Kokkos::Impl::TaskMember::assign ERROR deleting"); - } - - if ( count == 0 ) { - // When 'count == 0' this thread has exclusive access to 'old_lhs' - -#ifdef DETAILED_PRINT -printf( "Task::assign(...) 
old_lhs(0x%lx) deallocate\n" - , (unsigned long) old_lhs - ); -#endif - - old_lhs->m_policy->deallocate_task( old_lhs ); - } - } -} - -//---------------------------------------------------------------------------- - -__device__ -int Task::get_dependence() const -{ - return m_dep_size ; -} - -__device__ -Task * Task::get_dependence( int i ) const -{ - Task * const t = ((Task*volatile*)m_dep)[i] ; - - if ( Kokkos::Experimental::TASK_STATE_EXECUTING != m_state || i < 0 || m_dep_size <= i || 0 == t ) { - -printf( "TaskMember< Cuda >::get_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) dep[%d] = %lx }\n" - , (unsigned long) this - , m_state - , m_dep_size - , i - , (unsigned long) t - ); - - Kokkos::abort("TaskMember< Cuda >::get_dependence ERROR"); - } - - return t ; -} - -//---------------------------------------------------------------------------- - -__device__ __host__ -void Task::clear_dependence() -{ - for ( int i = m_dep_size - 1 ; 0 <= i ; --i ) { - assign( m_dep + i , 0 ); - } - - *((volatile int *) & m_dep_size ) = 0 ; - - memory_fence(); -} - -//---------------------------------------------------------------------------- - - -//---------------------------------------------------------------------------- - -} /* namespace Impl */ -} /* namespace Experimental */ -} /* namespace Kokkos */ - - -#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ - diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp deleted file mode 100644 index e71512f039..0000000000 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp +++ /dev/null @@ -1,833 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -// Experimental unified task-data parallel manycore LDRD - -#ifndef KOKKOS_CUDA_TASKPOLICY_HPP -#define KOKKOS_CUDA_TASKPOLICY_HPP - -#include -#include -#include - -#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) - -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -struct CudaTaskPolicyQueue ; - -/** \brief Base class for all Kokkos::Cuda tasks */ -template<> -class TaskMember< Kokkos::Cuda , void , void > { -public: - - template< class > friend class Kokkos::Experimental::TaskPolicy ; - friend struct CudaTaskPolicyQueue ; - - typedef void (* function_single_type) ( TaskMember * ); - typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::CudaTeamMember & ); - -private: - - CudaTaskPolicyQueue * m_policy ; - TaskMember * volatile * m_queue ; - function_team_type m_team ; ///< Apply function on CUDA - function_single_type m_serial ; ///< Apply function on CUDA - TaskMember ** m_dep ; ///< Dependences - TaskMember * m_wait ; ///< Linked list of tasks waiting on this task - TaskMember * m_next ; ///< Linked list of tasks waiting on a different task - int m_dep_capacity ; ///< Capacity of dependences - int m_dep_size ; ///< Actual count of dependences - int m_size_alloc ; - int m_shmem_size ; - int m_ref_count ; ///< Reference count - int m_state ; ///< State of the task - - - TaskMember( TaskMember && ) = delete ; - TaskMember( const TaskMember & ) = delete ; - TaskMember & operator = ( TaskMember && ) = delete ; - TaskMember & operator = ( const TaskMember & ) = delete ; - -protected: - - KOKKOS_INLINE_FUNCTION - TaskMember() - : m_policy(0) - , m_queue(0) - , m_team(0) - , m_serial(0) - , m_dep(0) - , m_wait(0) - , m_next(0) - , m_size_alloc(0) - , m_dep_capacity(0) - , m_dep_size(0) - , m_shmem_size(0) 
- , m_ref_count(0) - , m_state( TASK_STATE_CONSTRUCTING ) - {} - -public: - - KOKKOS_FUNCTION - ~TaskMember(); - - KOKKOS_INLINE_FUNCTION - int reference_count() const - { return *((volatile int *) & m_ref_count ); } - - // Cannot use the function pointer to verify the type - // since the function pointer is not unique between - // Host and Cuda. Don't run verificaton for Cuda. - // Assume testing on Host-only back-end will catch such errors. - - template< typename ResultType > - KOKKOS_INLINE_FUNCTION static - TaskMember * verify_type( TaskMember * t ) { return t ; } - - //---------------------------------------- - /* Inheritence Requirements on task types: - * - * class DerivedTaskType - * : public TaskMember< Cuda , DerivedType::value_type , FunctorType > - * { ... }; - * - * class TaskMember< Cuda , DerivedType::value_type , FunctorType > - * : public TaskMember< Cuda , DerivedType::value_type , void > - * , public Functor - * { ... }; - * - * If value_type != void - * class TaskMember< Cuda , value_type , void > - * : public TaskMember< Cuda , void , void > - * - * Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ] - * - */ - //---------------------------------------- - // If after the 'apply' the task's state is waiting - // then it will be rescheduled and called again. - // Otherwise the functor must be destroyed. 
- - template< class DerivedTaskType , class Tag > - __device__ static - void apply_single( - typename std::enable_if - <( std::is_same< Tag , void >::value && - std::is_same< typename DerivedTaskType::result_type , void >::value - ), TaskMember * >::type t ) - { - typedef typename DerivedTaskType::functor_type functor_type ; - - functor_type * const f = - static_cast< functor_type * >( static_cast< DerivedTaskType * >(t) ); - - f->apply(); - - if ( t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) { - f->~functor_type(); - } - } - - template< class DerivedTaskType , class Tag > - __device__ static - void apply_single( - typename std::enable_if - <( std::is_same< Tag , void >::value && - ! std::is_same< typename DerivedTaskType::result_type , void >::value - ), TaskMember * >::type t ) - { - typedef typename DerivedTaskType::functor_type functor_type ; - - DerivedTaskType * const self = static_cast< DerivedTaskType * >(t); - functor_type * const f = static_cast< functor_type * >( self ); - - f->apply( self->m_result ); - - if ( t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) { - f->~functor_type(); - } - } - - template< class DerivedTaskType , class Tag > - __device__ - void set_apply_single() - { - m_serial = & TaskMember::template apply_single ; - } - - //---------------------------------------- - - template< class DerivedTaskType , class Tag > - __device__ static - void apply_team( - typename std::enable_if - <( std::is_same::value && - std::is_same::value - ), TaskMember * >::type t - , Kokkos::Impl::CudaTeamMember & member - ) - { - typedef typename DerivedTaskType::functor_type functor_type ; - - functor_type * const f = - static_cast< functor_type * >( static_cast< DerivedTaskType * >(t) ); - - f->apply( member ); - - __syncthreads(); // Wait for team to finish calling function - - if ( threadIdx.x == 0 && - threadIdx.y == 0 && - t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) { - f->~functor_type(); - } - } - - 
template< class DerivedTaskType , class Tag > - __device__ static - void apply_team( - typename std::enable_if - <( std::is_same::value && - ! std::is_same::value - ), TaskMember * >::type t - , Kokkos::Impl::CudaTeamMember & member - ) - { - typedef typename DerivedTaskType::functor_type functor_type ; - - DerivedTaskType * const self = static_cast< DerivedTaskType * >(t); - functor_type * const f = static_cast< functor_type * >( self ); - - f->apply( member , self->m_result ); - - __syncthreads(); // Wait for team to finish calling function - - if ( threadIdx.x == 0 && - threadIdx.y == 0 && - t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) { - f->~functor_type(); - } - } - - template< class DerivedTaskType , class Tag > - __device__ - void set_apply_team() - { - m_team = & TaskMember::template apply_team ; - } - - //---------------------------------------- - - KOKKOS_FUNCTION static - void assign( TaskMember ** const lhs , TaskMember * const rhs ); - - __device__ - TaskMember * get_dependence( int i ) const ; - - __device__ - int get_dependence() const ; - - KOKKOS_FUNCTION void clear_dependence(); - - __device__ - void latch_add( const int k ); - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION static - void construct_result( TaskMember * const ) {} - - typedef FutureValueTypeIsVoidError get_result_type ; - - KOKKOS_INLINE_FUNCTION - get_result_type get() const { return get_result_type() ; } - - KOKKOS_INLINE_FUNCTION - Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); } - -}; - -/** \brief A Future< Kokkos::Cuda , ResultType > will cast - * from TaskMember< Kokkos::Cuda , void , void > - * to TaskMember< Kokkos::Cuda , ResultType , void > - * to query the result. 
- */ -template< class ResultType > -class TaskMember< Kokkos::Cuda , ResultType , void > - : public TaskMember< Kokkos::Cuda , void , void > -{ -public: - - typedef ResultType result_type ; - - result_type m_result ; - - typedef const result_type & get_result_type ; - - KOKKOS_INLINE_FUNCTION - get_result_type get() const { return m_result ; } - - KOKKOS_INLINE_FUNCTION static - void construct_result( TaskMember * const ptr ) - { - new((void*)(& ptr->m_result)) result_type(); - } - - TaskMember() = delete ; - TaskMember( TaskMember && ) = delete ; - TaskMember( const TaskMember & ) = delete ; - TaskMember & operator = ( TaskMember && ) = delete ; - TaskMember & operator = ( const TaskMember & ) = delete ; -}; - -/** \brief Callback functions will cast - * from TaskMember< Kokkos::Cuda , void , void > - * to TaskMember< Kokkos::Cuda , ResultType , FunctorType > - * to execute work functions. - */ -template< class ResultType , class FunctorType > -class TaskMember< Kokkos::Cuda , ResultType , FunctorType > - : public TaskMember< Kokkos::Cuda , ResultType , void > - , public FunctorType -{ -public: - typedef ResultType result_type ; - typedef FunctorType functor_type ; - - KOKKOS_INLINE_FUNCTION static - void copy_construct( TaskMember * const ptr - , const functor_type & arg_functor ) - { - typedef TaskMember< Kokkos::Cuda , ResultType , void > base_type ; - - new((void*)static_cast(ptr)) functor_type( arg_functor ); - - base_type::construct_result( static_cast( ptr ) ); - } - - TaskMember() = delete ; - TaskMember( TaskMember && ) = delete ; - TaskMember( const TaskMember & ) = delete ; - TaskMember & operator = ( TaskMember && ) = delete ; - TaskMember & operator = ( const TaskMember & ) = delete ; -}; - -//---------------------------------------------------------------------------- - -namespace { - -template< class DerivedTaskType , class Tag > -__global__ -void cuda_set_apply_single( DerivedTaskType * task ) -{ - typedef Kokkos::Experimental::Impl::TaskMember< 
Kokkos::Cuda , void , void > - task_root_type ; - - task->task_root_type::template set_apply_single< DerivedTaskType , Tag >(); -} - -template< class DerivedTaskType , class Tag > -__global__ -void cuda_set_apply_team( DerivedTaskType * task ) -{ - typedef Kokkos::Experimental::Impl::TaskMember< Kokkos::Cuda , void , void > - task_root_type ; - - task->task_root_type::template set_apply_team< DerivedTaskType , Tag >(); -} - -} /* namespace */ -} /* namespace Impl */ -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -struct CudaTaskPolicyQueue { - - enum { NPRIORITY = 3 }; - - // Must use UVM so that tasks can be created in both - // Host and Cuda space. - - typedef Kokkos::Experimental::MemoryPool< Kokkos::CudaUVMSpace > - memory_space ; - - typedef Kokkos::Experimental::Impl::TaskMember< Kokkos::Cuda , void , void > - task_root_type ; - - memory_space m_space ; - task_root_type * m_team[ NPRIORITY ] ; - task_root_type * m_serial[ NPRIORITY ]; - int m_team_size ; - int m_default_dependence_capacity ; - int volatile m_count_ready ; ///< Ready plus executing tasks - - // Execute tasks until all non-waiting tasks are complete - __device__ - void driver(); - - __device__ static - task_root_type * pop_ready_task( task_root_type * volatile * const queue ); - - // When a task finishes executing. 
- __device__ - void complete_executed_task( task_root_type * ); - - KOKKOS_FUNCTION void schedule_task( task_root_type * const - , const bool initial_spawn = true ); - KOKKOS_FUNCTION void reschedule_task( task_root_type * const ); - KOKKOS_FUNCTION - void add_dependence( task_root_type * const after - , task_root_type * const before ); - - - CudaTaskPolicyQueue() = delete ; - CudaTaskPolicyQueue( CudaTaskPolicyQueue && ) = delete ; - CudaTaskPolicyQueue( const CudaTaskPolicyQueue & ) = delete ; - CudaTaskPolicyQueue & operator = ( CudaTaskPolicyQueue && ) = delete ; - CudaTaskPolicyQueue & operator = ( const CudaTaskPolicyQueue & ) = delete ; - - - ~CudaTaskPolicyQueue(); - - // Construct only on the Host - CudaTaskPolicyQueue - ( const unsigned arg_task_max_count - , const unsigned arg_task_max_size - , const unsigned arg_task_default_dependence_capacity - , const unsigned arg_task_team_size - ); - - struct Destroy { - CudaTaskPolicyQueue * m_policy ; - void destroy_shared_allocation(); - }; - - //---------------------------------------- - /** \brief Allocate and construct a task. 
- * - * Allocate space for DerivedTaskType followed - * by TaskMember*[ dependence_capacity ] - */ - KOKKOS_FUNCTION - task_root_type * - allocate_task( const unsigned arg_sizeof_task - , const unsigned arg_dep_capacity - , const unsigned arg_team_shmem = 0 ); - - KOKKOS_FUNCTION void deallocate_task( task_root_type * const ); -}; - -} /* namespace Impl */ -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { - -void wait( TaskPolicy< Kokkos::Cuda > & ); - -template<> -class TaskPolicy< Kokkos::Cuda > -{ -public: - - typedef Kokkos::Cuda execution_space ; - typedef TaskPolicy execution_policy ; - typedef Kokkos::Impl::CudaTeamMember member_type ; - -private: - - typedef Impl::TaskMember< Kokkos::Cuda , void , void > task_root_type ; - typedef Kokkos::Experimental::MemoryPool< Kokkos::CudaUVMSpace > memory_space ; - typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ; - - track_type m_track ; - Impl::CudaTaskPolicyQueue * m_policy ; - - template< class FunctorType > - KOKKOS_INLINE_FUNCTION static - const task_root_type * get_task_root( const FunctorType * f ) - { - typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ; - return static_cast< const task_root_type * >( static_cast< const task_type * >(f) ); - } - - template< class FunctorType > - KOKKOS_INLINE_FUNCTION static - task_root_type * get_task_root( FunctorType * f ) - { - typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ; - return static_cast< task_root_type * >( static_cast< task_type * >(f) ); - } - -public: - - TaskPolicy - ( const unsigned arg_task_max_count - , const unsigned arg_task_max_size - , const unsigned arg_task_default_dependence_capacity = 4 - , const unsigned 
arg_task_team_size = 0 /* choose default */ - ); - - KOKKOS_FUNCTION TaskPolicy() = default ; - KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ; - KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ; - KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ; - KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ; - - KOKKOS_FUNCTION - int allocated_task_count() const { return 0 ; } - - //---------------------------------------- - // Create serial-thread task - // Main process and tasks must use different functions - // to work around CUDA limitation where __host__ __device__ - // functions are not allowed to invoke templated __global__ functions. - - template< class FunctorType > - Future< typename FunctorType::value_type , execution_space > - proc_create( const FunctorType & arg_functor - , const unsigned arg_dep_capacity = ~0u ) const - { - typedef typename FunctorType::value_type value_type ; - - typedef Impl::TaskMember< execution_space , value_type , FunctorType > - task_type ; - - task_type * const task = - static_cast( - m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity ) ); - - if ( task ) { - // The root part of the class has been constructed. - // Must now construct the functor and result specific part. - - task_type::copy_construct( task , arg_functor ); - - // Setting the apply pointer on the device requires code - // executing on the GPU. This function is called on the - // host process so a kernel must be run. - - // Launching a kernel will cause the allocated task in - // UVM memory to be copied to the GPU. - // Synchronize to guarantee non-concurrent access - // between host and device. 
- - CUDA_SAFE_CALL( cudaDeviceSynchronize() ); - - Impl::cuda_set_apply_single<<<1,1>>>( task ); - - CUDA_SAFE_CALL( cudaGetLastError() ); - CUDA_SAFE_CALL( cudaDeviceSynchronize() ); - } - - return Future< value_type , execution_space >( task ); - } - - template< class FunctorType > - __device__ - Future< typename FunctorType::value_type , execution_space > - task_create( const FunctorType & arg_functor - , const unsigned arg_dep_capacity = ~0u ) const - { - typedef typename FunctorType::value_type value_type ; - - typedef Impl::TaskMember< execution_space , value_type , FunctorType > - task_type ; - - task_type * const task = - static_cast( - m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity ) ); - - if ( task ) { - // The root part of the class has been constructed. - // Must now construct the functor and result specific part. - - task_type::copy_construct( task , arg_functor ); - - // Setting the apply pointer on the device requires code - // executing on the GPU. If this function is called on the - // Host then a kernel must be run. - - task->task_root_type::template set_apply_single< task_type , void >(); - } - - return Future< value_type , execution_space >( task ); - } - - //---------------------------------------- - // Create thread-team task - // Main process and tasks must use different functions - // to work around CUDA limitation where __host__ __device__ - // functions are not allowed to invoke templated __global__ functions. 
- - template< class FunctorType > - Future< typename FunctorType::value_type , execution_space > - proc_create_team( const FunctorType & arg_functor - , const unsigned arg_dep_capacity = ~0u ) const - { - typedef typename FunctorType::value_type value_type ; - - typedef Impl::TaskMember< execution_space , value_type , FunctorType > - task_type ; - - const unsigned team_shmem_size = - Kokkos::Impl::FunctorTeamShmemSize< FunctorType >::value - ( arg_functor , m_policy->m_team_size ); - - task_type * const task = - static_cast( - m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity , team_shmem_size ) ); - - if ( task ) { - // The root part of the class has been constructed. - // Must now construct the functor and result specific part. - - task_type::copy_construct( task , arg_functor ); - - // Setting the apply pointer on the device requires code - // executing on the GPU. This function is called on the - // host process so a kernel must be run. - - // Launching a kernel will cause the allocated task in - // UVM memory to be copied to the GPU. - // Synchronize to guarantee non-concurrent access - // between host and device. 
- - CUDA_SAFE_CALL( cudaDeviceSynchronize() ); - - Impl::cuda_set_apply_team<<<1,1>>>( task ); - - CUDA_SAFE_CALL( cudaGetLastError() ); - CUDA_SAFE_CALL( cudaDeviceSynchronize() ); - } - - return Future< value_type , execution_space >( task ); - } - - template< class FunctorType > - __device__ - Future< typename FunctorType::value_type , execution_space > - task_create_team( const FunctorType & arg_functor - , const unsigned arg_dep_capacity = ~0u ) const - { - typedef typename FunctorType::value_type value_type ; - - typedef Impl::TaskMember< execution_space , value_type , FunctorType > - task_type ; - - const unsigned team_shmem_size = - Kokkos::Impl::FunctorTeamShmemSize< FunctorType >::value - ( arg_functor , m_policy->m_team_size ); - - task_type * const task = - static_cast( - m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity , team_shmem_size ) ); - - if ( task ) { - // The root part of the class has been constructed. - // Must now construct the functor and result specific part. - - task_type::copy_construct( task , arg_functor ); - - // Setting the apply pointer on the device requires code - // executing on the GPU. If this function is called on the - // Host then a kernel must be run. 
- - task->task_root_type::template set_apply_team< task_type , void >(); - } - - return Future< value_type , execution_space >( task ); - } - - //---------------------------------------- - - Future< Latch , execution_space > - KOKKOS_INLINE_FUNCTION - create_latch( const int N ) const - { - task_root_type * const task = - m_policy->allocate_task( sizeof(task_root_type) , 0 , 0 ); - task->m_dep_size = N ; // Using m_dep_size for latch counter - task->m_state = TASK_STATE_WAITING ; - return Future< Latch , execution_space >( task ); - } - - //---------------------------------------- - - template< class A1 , class A2 , class A3 , class A4 > - KOKKOS_INLINE_FUNCTION - void add_dependence( const Future & after - , const Future & before - , typename std::enable_if - < std::is_same< typename Future::execution_space , execution_space >::value - && - std::is_same< typename Future::execution_space , execution_space >::value - >::type * = 0 - ) const - { m_policy->add_dependence( after.m_task , before.m_task ); } - - template< class FunctorType , class A3 , class A4 > - KOKKOS_INLINE_FUNCTION - void add_dependence( FunctorType * task_functor - , const Future & before - , typename std::enable_if - < std::is_same< typename Future::execution_space , execution_space >::value - >::type * = 0 - ) const - { m_policy->add_dependence( get_task_root(task_functor) , before.m_task ); } - - - template< class ValueType > - KOKKOS_INLINE_FUNCTION - const Future< ValueType , execution_space > & - spawn( const Future< ValueType , execution_space > & f - , const bool priority = false ) const - { - if ( f.m_task ) { - f.m_task->m_queue = - ( f.m_task->m_team != 0 - ? & ( m_policy->m_team[ priority ? 0 : 1 ] ) - : & ( m_policy->m_serial[ priority ? 
0 : 1 ] ) ); - m_policy->schedule_task( f.m_task ); - } - return f ; - } - - template< class FunctorType > - KOKKOS_INLINE_FUNCTION - void respawn( FunctorType * task_functor - , const bool priority = false ) const - { - task_root_type * const t = get_task_root(task_functor); - t->m_queue = - ( t->m_team != 0 ? & ( m_policy->m_team[ priority ? 0 : 1 ] ) - : & ( m_policy->m_serial[ priority ? 0 : 1 ] ) ); - m_policy->reschedule_task( t ); - } - - // When a create method fails by returning a null Future - // the task that called the create method may respawn - // with a dependence on memory becoming available. - // This is a race as more than one task may be respawned - // with this need. - - template< class FunctorType > - KOKKOS_INLINE_FUNCTION - void respawn_needing_memory( FunctorType * task_functor ) const - { - task_root_type * const t = get_task_root(task_functor); - t->m_queue = - ( t->m_team != 0 ? & ( m_policy->m_team[ 2 ] ) - : & ( m_policy->m_serial[ 2 ] ) ); - m_policy->reschedule_task( t ); - } - - //---------------------------------------- - // Functions for an executing task functor to query dependences, - // set new dependences, and respawn itself. 
- - template< class FunctorType > - KOKKOS_INLINE_FUNCTION - Future< void , execution_space > - get_dependence( const FunctorType * task_functor , int i ) const - { - return Future( - get_task_root(task_functor)->get_dependence(i) - ); - } - - template< class FunctorType > - KOKKOS_INLINE_FUNCTION - int get_dependence( const FunctorType * task_functor ) const - { return get_task_root(task_functor)->get_dependence(); } - - template< class FunctorType > - KOKKOS_INLINE_FUNCTION - void clear_dependence( FunctorType * task_functor ) const - { get_task_root(task_functor)->clear_dependence(); } - - //---------------------------------------- - - __device__ - static member_type member_single() - { - return - member_type( 0 /* shared memory pointer */ - , 0 /* shared memory begin offset */ - , 0 /* shared memory end offset */ - , 0 /* scratch level_1 pointer */ - , 0 /* scratch level_1 size */ - , 0 /* league rank */ - , 1 /* league size */ ); - } - - friend void wait( TaskPolicy< Kokkos::Cuda > & ); -}; - -} /* namespace Experimental */ -} /* namespace Kokkos */ - - -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */ -#endif /* #ifndef KOKKOS_CUDA_TASKPOLICY_HPP */ - - diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp index 92f6fc1f5f..b505b766a0 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp @@ -41,53 +41,266 @@ //@HEADER */ -#ifndef KOKKOS_CUDA_VIEW_HPP -#define KOKKOS_CUDA_VIEW_HPP - -#include +#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP +#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP /* only compile this file if CUDA is enabled for Kokkos */ -#ifdef KOKKOS_HAVE_CUDA - -#include - -#include -#include -#include -#include +#if defined( KOKKOS_HAVE_CUDA ) //---------------------------------------------------------------------------- 
//---------------------------------------------------------------------------- namespace Kokkos { +namespace Experimental { namespace Impl { -template<> -struct AssertShapeBoundsAbort< CudaSpace > -{ - KOKKOS_INLINE_FUNCTION - static void apply( const size_t /* rank */ , - const size_t /* n0 */ , const size_t /* n1 */ , - const size_t /* n2 */ , const size_t /* n3 */ , - const size_t /* n4 */ , const size_t /* n5 */ , - const size_t /* n6 */ , const size_t /* n7 */ , +// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4) +// Via reinterpret_case this can be used to support all scalar types of those sizes. +// Any other scalar type falls back to either normal reads out of global memory, +// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0) - const size_t /* arg_rank */ , - const size_t /* i0 */ , const size_t /* i1 */ , - const size_t /* i2 */ , const size_t /* i3 */ , - const size_t /* i4 */ , const size_t /* i5 */ , - const size_t /* i6 */ , const size_t /* i7 */ ) +template< typename ValueType , typename AliasType > +struct CudaTextureFetch { + + ::cudaTextureObject_t m_obj ; + const ValueType * m_ptr ; + int m_offset ; + + // Deference operator pulls through texture object and returns by value + template< typename iType > + KOKKOS_INLINE_FUNCTION + ValueType operator[]( const iType & i ) const { - Kokkos::abort("Kokkos::View array bounds violation"); +#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) + AliasType v = tex1Dfetch( m_obj , i + m_offset ); + return *(reinterpret_cast (&v)); +#else + return m_ptr[ i ]; +#endif + } + + // Pointer to referenced memory + KOKKOS_INLINE_FUNCTION + operator const ValueType * () const { return m_ptr ; } + + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch() : m_obj() , m_ptr() , m_offset() {} + + KOKKOS_INLINE_FUNCTION + ~CudaTextureFetch() {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch( const CudaTextureFetch & rhs ) + : m_obj( rhs.m_obj ) + , m_ptr( 
rhs.m_ptr ) + , m_offset( rhs.m_offset ) + {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch( CudaTextureFetch && rhs ) + : m_obj( rhs.m_obj ) + , m_ptr( rhs.m_ptr ) + , m_offset( rhs.m_offset ) + {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) + { + m_obj = rhs.m_obj ; + m_ptr = rhs.m_ptr ; + m_offset = rhs.m_offset ; + return *this ; + } + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch & operator = ( CudaTextureFetch && rhs ) + { + m_obj = rhs.m_obj ; + m_ptr = rhs.m_ptr ; + m_offset = rhs.m_offset ; + return *this ; + } + + // Texture object spans the entire allocation. + // This handle may view a subset of the allocation, so an offset is required. + template< class CudaMemorySpace > + inline explicit + CudaTextureFetch( const ValueType * const arg_ptr + , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record + ) + : m_obj( record.template attach_texture_object< AliasType >() ) + , m_ptr( arg_ptr ) + , m_offset( record.attach_texture_object_offset( reinterpret_cast( arg_ptr ) ) ) + {} + + // Texture object spans the entire allocation. + // This handle may view a subset of the allocation, so an offset is required. 
+ KOKKOS_INLINE_FUNCTION + CudaTextureFetch( const CudaTextureFetch & rhs , size_t offset ) + : m_obj( rhs.m_obj ) + , m_ptr( rhs.m_ptr + offset) + , m_offset( offset + rhs.m_offset ) + {} +}; + +#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC ) + +template< typename ValueType , typename AliasType > +struct CudaLDGFetch { + + const ValueType * m_ptr ; + + template< typename iType > + KOKKOS_INLINE_FUNCTION + ValueType operator[]( const iType & i ) const + { + #ifdef __CUDA_ARCH__ + AliasType v = __ldg(reinterpret_cast(&m_ptr[i])); + return *(reinterpret_cast (&v)); + #else + return m_ptr[i]; + #endif + } + + KOKKOS_INLINE_FUNCTION + operator const ValueType * () const { return m_ptr ; } + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch() : m_ptr() {} + + KOKKOS_INLINE_FUNCTION + ~CudaLDGFetch() {} + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch( const CudaLDGFetch & rhs ) + : m_ptr( rhs.m_ptr ) + {} + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch( CudaLDGFetch && rhs ) + : m_ptr( rhs.m_ptr ) + {} + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch & operator = ( const CudaLDGFetch & rhs ) + { + m_ptr = rhs.m_ptr ; + return *this ; + } + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch & operator = ( CudaLDGFetch && rhs ) + { + m_ptr = rhs.m_ptr ; + return *this ; + } + + template< class CudaMemorySpace > + inline explicit + CudaLDGFetch( const ValueType * const arg_ptr + , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const & + ) + : m_ptr( arg_ptr ) + {} + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch( CudaLDGFetch const rhs ,size_t offset) + : m_ptr( rhs.m_ptr + offset ) + {} + +}; + +#endif + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization + * if 
'const' value type, CudaSpace and random access. + */ +template< class Traits > +class ViewDataHandle< Traits , + typename std::enable_if<( + // Is Cuda memory space + ( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value || + std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value ) + && + // Is a trivial const value of 4, 8, or 16 bytes + std::is_trivial::value + && + std::is_same::value + && + ( sizeof(typename Traits::const_value_type) == 4 || + sizeof(typename Traits::const_value_type) == 8 || + sizeof(typename Traits::const_value_type) == 16 ) + && + // Random access trait + ( Traits::memory_traits::RandomAccess != 0 ) + )>::type > +{ +public: + + using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ; + + using value_type = typename Traits::const_value_type ; + using return_type = typename Traits::const_value_type ; // NOT a reference + + using alias_type = typename std::conditional< ( sizeof(value_type) == 4 ) , int , + typename std::conditional< ( sizeof(value_type) == 8 ) , ::int2 , + typename std::conditional< ( sizeof(value_type) == 16 ) , ::int4 , void + >::type + >::type + >::type ; + +#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC ) + using handle_type = Kokkos::Experimental::Impl::CudaLDGFetch< value_type , alias_type > ; +#else + using handle_type = Kokkos::Experimental::Impl::CudaTextureFetch< value_type , alias_type > ; +#endif + + KOKKOS_INLINE_FUNCTION + static handle_type const & assign( handle_type const & arg_handle , track_type const & /* arg_tracker */ ) + { + return arg_handle ; + } + + KOKKOS_INLINE_FUNCTION + static handle_type const assign( handle_type const & arg_handle , size_t offset ) + { + return handle_type(arg_handle,offset) ; + } + + KOKKOS_INLINE_FUNCTION + static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker ) + { +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + // Assignment of texture = non-texture requires creation of a texture 
object + // which can only occur on the host. In addition, 'get_record' is only valid + // if called in a host execution space + return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() ); +#else + Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel"); + return handle_type(); +#endif } }; +} } } //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#endif // KOKKOS_HAVE_CUDA +#endif /* #if defined( KOKKOS_HAVE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp index deb955ccd4..60903b757f 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp @@ -47,18 +47,10 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- #include "Kokkos_Macros.hpp" -#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA ) +#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA ) #include -#if ! defined( CUDA_VERSION ) || ( CUDA_VERSION < 4010 ) -#error "Cuda version 4.1 or greater required" -#endif - -#if ( __CUDA_ARCH__ < 200 ) -#error "Cuda device capability 2.0 or greater required" -#endif - extern "C" { /* Cuda runtime function, declared in * Requires capability 2.x or better. 
@@ -90,30 +82,6 @@ void cuda_abort( const char * const message ) } // namespace Impl } // namespace Kokkos - -#else - -namespace Kokkos { -namespace Impl { -KOKKOS_INLINE_FUNCTION -void cuda_abort( const char * const ) {} -} -} - -#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) -namespace Kokkos { -__device__ inline -void abort( const char * const message ) { Kokkos::Impl::cuda_abort(message); } -} -#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - +#endif /* #if defined(__CUDACC__) && defined( KOKKOS_HAVE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */ diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp index 6d37d69a63..3102402b83 100644 --- a/lib/kokkos/core/src/Kokkos_Atomic.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp @@ -75,15 +75,16 @@ #if defined(_WIN32) #define KOKKOS_ATOMICS_USE_WINDOWS #else -#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA ) +#if defined( KOKKOS_HAVE_CUDA ) // Compiling NVIDIA device code, must use Cuda atomics: #define KOKKOS_ATOMICS_USE_CUDA +#endif -#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \ - ! defined( KOKKOS_ATOMICS_USE_INTEL ) && \ - ! defined( KOKKOS_ATOMICS_USE_OMP31 ) +#if ! defined( KOKKOS_ATOMICS_USE_GCC ) && \ + ! defined( KOKKOS_ATOMICS_USE_INTEL ) && \ + ! defined( KOKKOS_ATOMICS_USE_OMP31 ) // Compiling for non-Cuda atomic implementation has not been pre-selected. // Choose the best implementation for the detected compiler. 
@@ -91,7 +92,7 @@ #if defined( KOKKOS_COMPILER_GNU ) || \ defined( KOKKOS_COMPILER_CLANG ) || \ - ( defined ( KOKKOS_COMPILER_NVCC ) && defined ( __GNUC__ ) ) + ( defined ( KOKKOS_COMPILER_NVCC ) ) #define KOKKOS_ATOMICS_USE_GCC @@ -126,6 +127,9 @@ namespace Impl { /// This function tries to aquire the lock for the hash value derived /// from the provided ptr. If the lock is successfully aquired the /// function returns true. Otherwise it returns false. +#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE +extern +#endif __device__ inline bool lock_address_cuda_space(void* ptr); @@ -135,6 +139,9 @@ bool lock_address_cuda_space(void* ptr); /// from the provided ptr. This function should only be called /// after previously successfully aquiring a lock with /// lock_address. +#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE +extern +#endif __device__ inline void unlock_address_cuda_space(void* ptr); } @@ -287,7 +294,7 @@ const char * atomic_query_version() //---------------------------------------------------------------------------- // This atomic-style macro should be an inlined function, not a macro -#if defined( KOKKOS_COMPILER_GNU ) && !defined(__PGIC__) +#if defined( KOKKOS_COMPILER_GNU ) && !defined(__PGIC__) && !defined(__CUDA_ARCH__) #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr,0,0) #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr,1,0) diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp index 82a342eec0..af83e5cac6 100644 --- a/lib/kokkos/core/src/Kokkos_Concepts.hpp +++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp @@ -46,7 +46,14 @@ #include +// Needed for 'is_space::host_mirror_space +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + namespace Kokkos { + //Schedules for Execution Policies struct Static {}; struct Dynamic {}; @@ -59,7 +66,7 @@ struct 
Schedule || std::is_same::value , "Kokkos: Invalid Schedule<> type." ); - using schedule_type = Schedule; + using schedule_type = Schedule ; using type = T; }; @@ -68,11 +75,268 @@ template struct IndexType { static_assert(std::is_integral::value,"Kokkos: Invalid IndexType<>."); - using index_type = IndexType; + using index_type = IndexType ; using type = T; }; } // namespace Kokkos +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +#define KOKKOS_IMPL_IS_CONCEPT( CONCEPT ) \ + template< typename T > struct is_ ## CONCEPT { \ + private: \ + template< typename , typename = std::true_type > struct have : std::false_type {}; \ + template< typename U > struct have::type> : std::true_type {}; \ + public: \ + enum { value = is_ ## CONCEPT::template have::value }; \ + }; + +// Public concept: + +KOKKOS_IMPL_IS_CONCEPT( memory_space ) +KOKKOS_IMPL_IS_CONCEPT( memory_traits ) +KOKKOS_IMPL_IS_CONCEPT( execution_space ) +KOKKOS_IMPL_IS_CONCEPT( execution_policy ) +KOKKOS_IMPL_IS_CONCEPT( array_layout ) + +namespace Impl { + +// For backward compatibility: + +using Kokkos::is_memory_space ; +using Kokkos::is_memory_traits ; +using Kokkos::is_execution_space ; +using Kokkos::is_execution_policy ; +using Kokkos::is_array_layout ; + +// Implementation concept: + +KOKKOS_IMPL_IS_CONCEPT( iteration_pattern ) +KOKKOS_IMPL_IS_CONCEPT( schedule_type ) +KOKKOS_IMPL_IS_CONCEPT( index_type ) + +} + +#undef KOKKOS_IMPL_IS_CONCEPT + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< class ExecutionSpace , class MemorySpace > +struct Device { + static_assert( Kokkos::is_execution_space::value + , "Execution space is not valid" ); + static_assert( Kokkos::is_memory_space::value + , "Memory space is not valid" ); + typedef ExecutionSpace execution_space; + typedef 
MemorySpace memory_space; + typedef Device device_type; +}; + + +template< typename T > +struct is_space { +private: + + template< typename , typename = void > + struct exe : std::false_type { typedef void space ; }; + + template< typename , typename = void > + struct mem : std::false_type { typedef void space ; }; + + template< typename , typename = void > + struct dev : std::false_type { typedef void space ; }; + + template< typename U > + struct exe::type> + : std::is_same::type + { typedef typename U::execution_space space ; }; + + template< typename U > + struct mem::type> + : std::is_same::type + { typedef typename U::memory_space space ; }; + + template< typename U > + struct dev::type> + : std::is_same::type + { typedef typename U::device_type space ; }; + + typedef typename is_space::template exe is_exe ; + typedef typename is_space::template mem is_mem ; + typedef typename is_space::template dev is_dev ; + +public: + + enum { value = is_exe::value || is_mem::value || is_dev::value }; + + typedef typename is_exe::space execution_space ; + typedef typename is_mem::space memory_space ; + + // For backward compatibility, deprecated in favor of + // Kokkos::Impl::HostMirror::host_mirror_space + + typedef typename std::conditional + < std::is_same< memory_space , Kokkos::HostSpace >::value +#if defined( KOKKOS_HAVE_CUDA ) + || std::is_same< memory_space , Kokkos::CudaUVMSpace >::value + || std::is_same< memory_space , Kokkos::CudaHostPinnedSpace >::value +#endif /* #if defined( KOKKOS_HAVE_CUDA ) */ + , memory_space + , Kokkos::HostSpace + >::type host_memory_space ; + +#if defined( KOKKOS_HAVE_CUDA ) + typedef typename std::conditional + < std::is_same< execution_space , Kokkos::Cuda >::value + , Kokkos::DefaultHostExecutionSpace , execution_space + >::type host_execution_space ; +#else + typedef execution_space host_execution_space ; +#endif + + typedef typename std::conditional + < std::is_same< execution_space , host_execution_space >::value && + 
std::is_same< memory_space , host_memory_space >::value + , T , Kokkos::Device< host_execution_space , host_memory_space > + >::type host_mirror_space ; +}; + +// For backward compatiblity + +namespace Impl { + +using Kokkos::is_space ; + +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/**\brief Access relationship between DstMemorySpace and SrcMemorySpace + * + * The default case can assume accessibility for the same space. + * Specializations must be defined for different memory spaces. + */ +template< typename DstMemorySpace , typename SrcMemorySpace > +struct MemorySpaceAccess { + + static_assert( Kokkos::is_memory_space< DstMemorySpace >::value && + Kokkos::is_memory_space< SrcMemorySpace >::value + , "template arguments must be memory spaces" ); + + /**\brief Can a View (or pointer) to memory in SrcMemorySpace + * be assigned to a View (or pointer) to memory marked DstMemorySpace. + * + * 1. DstMemorySpace::execution_space == SrcMemorySpace::execution_space + * 2. All execution spaces that can access DstMemorySpace can also access + * SrcMemorySpace. + */ + enum { assignable = std::is_same::value }; + + /**\brief For all DstExecSpace::memory_space == DstMemorySpace + * DstExecSpace can access SrcMemorySpace. + */ + enum { accessible = assignable }; + + /**\brief Does a DeepCopy capability exist + * to DstMemorySpace from SrcMemorySpace + */ + enum { deepcopy = assignable }; +}; + + +/**\brief Can AccessSpace access MemorySpace ? + * + * Requires: + * Kokkos::is_space< AccessSpace >::value + * Kokkos::is_memory_space< MemorySpace >::value + * + * Can AccessSpace::execution_space access MemorySpace ? + * enum : bool { accessible }; + * + * Is View assignable from View ? + * enum : bool { assignable }; + * + * If ! 
accessible then through which intercessory memory space + * should a be used to deep copy memory for + * AccessSpace::execution_space + * to get access. + * When AccessSpace::memory_space == Kokkos::HostSpace + * then space is the View host mirror space. + */ +template< typename AccessSpace , typename MemorySpace > +struct SpaceAccessibility { +private: + + static_assert( Kokkos::is_space< AccessSpace >::value + , "template argument #1 must be a Kokkos space" ); + + static_assert( Kokkos::is_memory_space< MemorySpace >::value + , "template argument #2 must be a Kokkos memory space" ); + + // The input AccessSpace may be a Device + // verify that it is a valid combination of spaces. + static_assert( Kokkos::Impl::MemorySpaceAccess + < typename AccessSpace::execution_space::memory_space + , typename AccessSpace::memory_space + >::accessible + , "template argument #1 is an invalid space" ); + + typedef Kokkos::Impl::MemorySpaceAccess + < typename AccessSpace::execution_space::memory_space , MemorySpace > + exe_access ; + + typedef Kokkos::Impl::MemorySpaceAccess + < typename AccessSpace::memory_space , MemorySpace > + mem_access ; + +public: + + /**\brief Can AccessSpace::execution_space access MemorySpace ? + * + * Default based upon memory space accessibility. + * Specialization required for other relationships. + */ + enum { accessible = exe_access::accessible }; + + /**\brief Can assign to AccessSpace from MemorySpace ? + * + * Default based upon memory space accessibility. + * Specialization required for other relationships. + */ + enum { assignable = + is_memory_space< AccessSpace >::value && mem_access::assignable }; + + /**\brief Can deep copy to AccessSpace::memory_Space from MemorySpace ? */ + enum { deepcopy = mem_access::deepcopy }; + + // What intercessory space for AccessSpace::execution_space + // to be able to access MemorySpace? 
+ // If same memory space or not accessible use the AccessSpace + // else construct a device with execution space and memory space. + typedef typename std::conditional + < std::is_same::value || + ! exe_access::accessible + , AccessSpace + , Kokkos::Device< typename AccessSpace::execution_space , MemorySpace > + >::type space ; +}; + +}} // namespace Kokkos::Impl + +//---------------------------------------------------------------------------- + #endif // KOKKOS_CORE_CONCEPTS_HPP diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp index 7cde4610ee..266f750d37 100644 --- a/lib/kokkos/core/src/Kokkos_Core.hpp +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -72,6 +72,7 @@ #include #include #include +#include #ifdef KOKKOS_HAVE_CXX11 #include @@ -112,7 +113,6 @@ void fence(); //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { /* Allocate memory from a memory space. * The allocation is tracked in Kokkos memory tracking system, so @@ -155,18 +155,8 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size ) reallocate_tracked( arg_alloc , arg_alloc_size ); } -} // namespace Experimental } // namespace Kokkos - -namespace Kokkos { - -using Kokkos::Experimental::kokkos_malloc ; -using Kokkos::Experimental::kokkos_realloc ; -using Kokkos::Experimental::kokkos_free ; - -} - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp index e9648b59b8..0f5ef9200a 100644 --- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 
2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -49,6 +49,7 @@ // and compiler environment then sets a collection of #define macros. #include +#include //---------------------------------------------------------------------------- // Have assumed a 64bit build (8byte pointers) throughout the code base. @@ -207,7 +208,7 @@ namespace Impl { template< class Functor , class Policy - , class EnableFunctor = void + , class EnableFunctor = void , class EnablePolicy = void > struct FunctorPolicyExecutionSpace; @@ -220,7 +221,7 @@ struct FunctorPolicyExecutionSpace; /// This is an implementation detail of parallel_for. Users should /// skip this and go directly to the nonmember function parallel_for. template< class FunctorType , class ExecPolicy , class ExecutionSpace = - typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space + typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space > class ParallelFor ; /// \class ParallelReduce @@ -229,7 +230,7 @@ template< class FunctorType , class ExecPolicy , class ExecutionSpace = /// This is an implementation detail of parallel_reduce. Users should /// skip this and go directly to the nonmember function parallel_reduce. 
template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace = - typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space + typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space > class ParallelReduce ; /// \class ParallelScan @@ -238,8 +239,8 @@ template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType /// This is an implementation detail of parallel_scan. Users should /// skip this and go directly to the documentation of the nonmember /// template function Kokkos::parallel_scan. -template< class FunctorType , class ExecPolicy , class ExecutionSapce = - typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space +template< class FunctorType , class ExecPolicy , class ExecutionSapce = + typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space > class ParallelScan ; }} diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp index 3130ee3198..84ae5ee044 100644 --- a/lib/kokkos/core/src/Kokkos_Cuda.hpp +++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp @@ -56,7 +56,7 @@ #include #include -#include +#include #include #include #include @@ -229,6 +229,39 @@ private: namespace Kokkos { namespace Impl { +template<> +struct MemorySpaceAccess + < Kokkos::CudaSpace + , Kokkos::Cuda::scratch_memory_space + > +{ + enum { assignable = false }; + enum { accessible = true }; + enum { deepcopy = false }; +}; + +#if defined( KOKKOS_USE_CUDA_UVM ) + +// If forcing use of UVM everywhere +// then must assume that CudaUVMSpace +// can be a stand-in for CudaSpace. +// This will fail when a strange host-side execution space +// that defines CudaUVMSpace as its preferredmemory space. 
+ +template<> +struct MemorySpaceAccess + < Kokkos::CudaUVMSpace + , Kokkos::Cuda::scratch_memory_space + > +{ + enum { assignable = false }; + enum { accessible = true }; + enum { deepcopy = false }; +}; + +#endif + + template<> struct VerifyExecutionCanAccessMemorySpace < Kokkos::CudaSpace @@ -259,9 +292,6 @@ struct VerifyExecutionCanAccessMemorySpace #include #include - -#include - #include #include diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp index cd728895d0..fd9b0ad123 100644 --- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp @@ -88,6 +88,9 @@ public: void deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const ; + /**\brief Return Name of the MemorySpace */ + static constexpr const char* name(); + /*--------------------------------*/ /** \brief Error reporting for HostSpace attempt to access CudaSpace */ static void access_error(); @@ -97,7 +100,8 @@ private: int m_device ; ///< Which Cuda device - // friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > ; + static constexpr const char* m_name = "Cuda"; + friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > ; }; namespace Impl { @@ -156,6 +160,14 @@ public: /** \brief If UVM capability is available */ static bool available(); + + /*--------------------------------*/ + /** \brief CudaUVMSpace specific routine */ + static int number_of_allocations(); + + /*--------------------------------*/ + + /*--------------------------------*/ CudaUVMSpace(); @@ -172,11 +184,16 @@ public: void deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const ; + /**\brief Return Name of the MemorySpace */ + static constexpr const char* name(); + /*--------------------------------*/ private: - int m_device ; ///< Which Cuda device + + static constexpr const char* m_name = "CudaUVM"; + }; } // namespace Kokkos @@ -215,6 +232,13 @@ public: 
void deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const ; + /**\brief Return Name of the MemorySpace */ + static constexpr const char* name(); + +private: + + static constexpr const char* m_name = "CudaHostPinned"; + /*--------------------------------*/ }; @@ -226,6 +250,126 @@ public: namespace Kokkos { namespace Impl { +static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaSpace >::assignable , "" ); +static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaUVMSpace >::assignable , "" ); +static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaHostPinnedSpace >::assignable , "" ); + +//---------------------------------------- + +template<> +struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaSpace > { + enum { assignable = false }; + enum { accessible = false }; + enum { deepcopy = true }; +}; + +template<> +struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaUVMSpace > { + // HostSpace::execution_space != CudaUVMSpace::execution_space + enum { assignable = false }; + enum { accessible = true }; + enum { deepcopy = true }; +}; + +template<> +struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace > { + // HostSpace::execution_space == CudaHostPinnedSpace::execution_space + enum { assignable = true }; + enum { accessible = true }; + enum { deepcopy = true }; +}; + +//---------------------------------------- + +template<> +struct MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::HostSpace > { + enum { assignable = false }; + enum { accessible = false }; + enum { deepcopy = true }; +}; + +template<> +struct MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaUVMSpace > { + // CudaSpace::execution_space == CudaUVMSpace::execution_space + enum { assignable = true }; + enum { accessible = true }; + enum { deepcopy = true }; +}; + +template<> +struct MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace > { + // 
CudaSpace::execution_space != CudaHostPinnedSpace::execution_space + enum { assignable = false }; + enum { accessible = true }; // CudaSpace::execution_space + enum { deepcopy = true }; +}; + +//---------------------------------------- +// CudaUVMSpace::execution_space == Cuda +// CudaUVMSpace accessible to both Cuda and Host + +template<> +struct MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::HostSpace > { + enum { assignable = false }; + enum { accessible = false }; // Cuda cannot access HostSpace + enum { deepcopy = true }; +}; + +template<> +struct MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaSpace > { + // CudaUVMSpace::execution_space == CudaSpace::execution_space + // Can access CudaUVMSpace from Host but cannot access CudaSpace from Host + enum { assignable = false }; + + // CudaUVMSpace::execution_space can access CudaSpace + enum { accessible = true }; + enum { deepcopy = true }; +}; + +template<> +struct MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaHostPinnedSpace > { + // CudaUVMSpace::execution_space != CudaHostPinnedSpace::execution_space + enum { assignable = false }; + enum { accessible = true }; // CudaUVMSpace::execution_space + enum { deepcopy = true }; +}; + + +//---------------------------------------- +// CudaHostPinnedSpace::execution_space == HostSpace::execution_space +// CudaHostPinnedSpace accessible to both Cuda and Host + +template<> +struct MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace > { + enum { assignable = false }; // Cannot access from Cuda + enum { accessible = true }; // CudaHostPinnedSpace::execution_space + enum { deepcopy = true }; +}; + +template<> +struct MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaSpace > { + enum { assignable = false }; // Cannot access from Host + enum { accessible = false }; + enum { deepcopy = true }; +}; + +template<> +struct MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaUVMSpace > { + enum { assignable = false }; // different 
execution_space + enum { accessible = true }; // same accessibility + enum { deepcopy = true }; +}; + +//---------------------------------------- + +}} // namespace Kokkos::Impl + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + void DeepCopyAsyncCuda( void * dst , const void * src , size_t n); template<> struct DeepCopy< CudaSpace , CudaSpace , Cuda> @@ -553,7 +697,6 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaHost //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { template<> @@ -791,7 +934,6 @@ public: }; } // namespace Impl -} // namespace Experimental } // namespace Kokkos //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index 5834fc04db..db4d67ae7d 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -52,6 +52,7 @@ #include #include #include + //---------------------------------------------------------------------------- namespace Kokkos { @@ -82,7 +83,6 @@ class RangePolicy : public Impl::PolicyTraits { private: - typedef Impl::PolicyTraits traits; typename traits::execution_space m_space ; @@ -90,8 +90,8 @@ private: typename traits::index_type m_end ; typename traits::index_type m_granularity ; typename traits::index_type m_granularity_mask ; -public: +public: //! 
Tag this class as an execution policy typedef RangePolicy execution_policy; typedef typename traits::index_type member_type ; @@ -100,7 +100,6 @@ public: KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; } KOKKOS_INLINE_FUNCTION member_type end() const { return m_end ; } - //TODO: find a better workaround for Clangs weird instantiation order // This thing is here because of an instantiation error, where the RangePolicy is inserted into FunctorValue Traits, which // tries decltype on the operator. It tries to do this even though the first argument of parallel for clearly doesn't match. @@ -135,47 +134,45 @@ public: , work_begin , work_end ) {} - public: +public: + /** \brief return chunk_size */ + inline member_type chunk_size() const { + return m_granularity; + } - /** \brief return chunk_size */ - inline member_type chunk_size() const { - return m_granularity; - } + /** \brief set chunk_size to a discrete value*/ + inline RangePolicy set_chunk_size(int chunk_size_) const { + RangePolicy p = *this; + p.m_granularity = chunk_size_; + p.m_granularity_mask = p.m_granularity - 1; + return p; + } - /** \brief set chunk_size to a discrete value*/ - inline RangePolicy set_chunk_size(int chunk_size_) const { - RangePolicy p = *this; - p.m_granularity = chunk_size_; - p.m_granularity_mask = p.m_granularity - 1; - return p; - } +private: + /** \brief finalize chunk_size if it was set to AUTO*/ + inline void set_auto_chunk_size() { - private: - /** \brief finalize chunk_size if it was set to AUTO*/ - inline void set_auto_chunk_size() { + typename traits::index_type concurrency = traits::execution_space::concurrency(); + if( concurrency==0 ) concurrency=1; - typename traits::index_type concurrency = traits::execution_space::concurrency(); - if( concurrency==0 ) concurrency=1; + if(m_granularity > 0) { + if(!Impl::is_integral_power_of_two( m_granularity )) + Kokkos::abort("RangePolicy blocking granularity must be power of two" ); + } - if(m_granularity > 0) { 
- if(!Impl::is_integral_power_of_two( m_granularity )) - Kokkos::abort("RangePolicy blocking granularity must be power of two" ); - } + member_type new_chunk_size = 1; + while(new_chunk_size*100*concurrency < m_end-m_begin) + new_chunk_size *= 2; + if(new_chunk_size < 128) { + new_chunk_size = 1; + while( (new_chunk_size*40*concurrency < m_end-m_begin ) && (new_chunk_size<128) ) + new_chunk_size*=2; + } + m_granularity = new_chunk_size; + m_granularity_mask = m_granularity - 1; + } - - member_type new_chunk_size = 1; - while(new_chunk_size*100*concurrency < m_end-m_begin) - new_chunk_size *= 2; - if(new_chunk_size < 128) { - new_chunk_size = 1; - while( (new_chunk_size*40*concurrency < m_end-m_begin ) && (new_chunk_size<128) ) - new_chunk_size*=2; - } - m_granularity = new_chunk_size; - m_granularity_mask = m_granularity - 1; - } - - public: +public: /** \brief Subrange for a partition's rank and size. * * Typically used to partition a range over a group of threads. @@ -212,16 +209,15 @@ public: if ( range.end() < m_end ) m_end = range.end() ; } } - private: - member_type m_begin ; - member_type m_end ; - WorkRange(); - WorkRange & operator = ( const WorkRange & ); + private: + member_type m_begin ; + member_type m_end ; + WorkRange(); + WorkRange & operator = ( const WorkRange & ); }; }; - } // namespace Kokkos //---------------------------------------------------------------------------- @@ -231,7 +227,6 @@ namespace Kokkos { namespace Impl { - template< class ExecSpace, class ... Properties> class TeamPolicyInternal: public Impl::PolicyTraits { private: @@ -245,6 +240,10 @@ public: * This size takes into account execution space concurrency limitations and * scratch memory space limitations for reductions, team reduce/scan, and * team shared memory. + * + * This function only works for single-operator functors. + * With multi-operator functors it cannot be determined + * which operator will be called. 
*/ template< class FunctorType > static int team_size_max( const FunctorType & ); @@ -254,6 +253,10 @@ public: * This size takes into account execution space concurrency limitations and * scratch memory space limitations for reductions, team reduce/scan, and * team shared memory. + * + * This function only works for single-operator functors. + * With multi-operator functors it cannot be determined + * which operator will be called. */ template< class FunctorType > static int team_size_recommended( const FunctorType & ); @@ -344,9 +347,7 @@ public: KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const ; }; }; -} -namespace Impl { struct PerTeamValue { int value; PerTeamValue(int arg); @@ -356,12 +357,12 @@ namespace Impl { int value; PerThreadValue(int arg); }; + } Impl::PerTeamValue PerTeam(const int& arg); Impl::PerThreadValue PerThread(const int& arg); - /** \brief Execution policy for parallel work over a league of teams of threads. * * The work functor is called for each thread of each team such that @@ -443,10 +444,6 @@ public: }; -} // namespace Kokkos - -namespace Kokkos { - namespace Impl { template @@ -484,8 +481,8 @@ public: KOKKOS_INLINE_FUNCTION TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread - , const iType& arg_end - ) + , const iType& arg_end + ) : start( ibegin( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) ) , end( iend( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) ) , thread( arg_thread ) @@ -502,32 +499,33 @@ public: {} }; - template - struct ThreadVectorRangeBoundariesStruct { - typedef iType index_type; - enum {start = 0}; - const iType end; - enum {increment = 1}; +template +struct ThreadVectorRangeBoundariesStruct { + typedef iType index_type; + enum {start = 0}; + const iType end; + enum {increment = 1}; - KOKKOS_INLINE_FUNCTION - ThreadVectorRangeBoundariesStruct (const TeamMemberType& thread, const iType& count): - end( count ) - {} - }; + 
KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct ( const TeamMemberType, const iType& count ) : end( count ) {} + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct ( const iType& count ) : end( count ) {} +}; - template - struct ThreadSingleStruct { - const TeamMemberType& team_member; - KOKKOS_INLINE_FUNCTION - ThreadSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){} - }; +template +struct ThreadSingleStruct { + const TeamMemberType& team_member; + KOKKOS_INLINE_FUNCTION + ThreadSingleStruct( const TeamMemberType& team_member_ ) : team_member( team_member_ ) {} +}; + +template +struct VectorSingleStruct { + const TeamMemberType& team_member; + KOKKOS_INLINE_FUNCTION + VectorSingleStruct( const TeamMemberType& team_member_ ) : team_member( team_member_ ) {} +}; - template - struct VectorSingleStruct { - const TeamMemberType& team_member; - KOKKOS_INLINE_FUNCTION - VectorSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){} - }; } // namespace Impl /** \brief Execution policy for parallel work over a threads within a team. @@ -538,7 +536,8 @@ public: */ template KOKKOS_INLINE_FUNCTION -Impl::TeamThreadRangeBoundariesStruct TeamThreadRange(const TeamMemberType&, const iType& count); +Impl::TeamThreadRangeBoundariesStruct +TeamThreadRange( const TeamMemberType&, const iType& count ); /** \brief Execution policy for parallel work over a threads within a team. * @@ -546,9 +545,10 @@ Impl::TeamThreadRangeBoundariesStruct TeamThreadRange(cons * This policy is used together with a parallel pattern as a nested layer within a kernel launched * with the TeamPolicy. This variant expects a begin and end. So the range is (begin,end]. 
*/ -template +template KOKKOS_INLINE_FUNCTION -Impl::TeamThreadRangeBoundariesStruct TeamThreadRange(const TeamMemberType&, const iType& begin, const iType& end); +Impl::TeamThreadRangeBoundariesStruct::type, TeamMemberType> +TeamThreadRange( const TeamMemberType&, const iType1& begin, const iType2& end ); /** \brief Execution policy for a vector parallel loop. * @@ -558,13 +558,12 @@ Impl::TeamThreadRangeBoundariesStruct TeamThreadRange(cons */ template KOKKOS_INLINE_FUNCTION -Impl::ThreadVectorRangeBoundariesStruct ThreadVectorRange(const TeamMemberType&, const iType& count); +Impl::ThreadVectorRangeBoundariesStruct +ThreadVectorRange( const TeamMemberType&, const iType& count ); } // namespace Kokkos - #endif /* #define KOKKOS_EXECPOLICY_HPP */ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- - diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp index e02689b0f9..10e735fe00 100644 --- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp @@ -46,7 +46,6 @@ #include -#include /*--------------------------------------------------------------------------*/ #ifdef KOKKOS_HAVE_HBWSPACE @@ -148,11 +147,14 @@ public: void deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const ; + /**\brief Return Name of the MemorySpace */ + static constexpr const char* name(); + private: AllocationMechanism m_alloc_mech ; - - friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ; + static constexpr const char* m_name = "HBW"; + friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ; }; } // namespace Experimental @@ -162,7 +164,6 @@ private: //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { template<> @@ 
-239,9 +240,33 @@ public: }; } // namespace Impl -} // namespace Experimental } // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::Experimental::HBWSpace >::assignable , "" ); + +template<> +struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace > { + enum { assignable = true }; + enum { accessible = true }; + enum { deepcopy = true }; +}; + +template<> +struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace> { + enum { assignable = false }; + enum { accessible = true }; + enum { deepcopy = true }; +}; + +}} + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp index 5fe686559a..0292dd8a6c 100644 --- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp @@ -50,12 +50,12 @@ #include #include +#include #include #include #include - -#include +#include /*--------------------------------------------------------------------------*/ @@ -155,11 +155,14 @@ public: void deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const ; + /**\brief Return Name of the MemorySpace */ + static constexpr const char* name(); + private: AllocationMechanism m_alloc_mech ; - - friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > ; + static constexpr const char* m_name = "Host"; + friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > ; }; } // namespace Kokkos @@ -168,7 +171,47 @@ private: //---------------------------------------------------------------------------- 
namespace Kokkos { -namespace Experimental { +namespace Impl { + +static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::HostSpace >::assignable , "" ); + + +template< typename S > +struct HostMirror { +private: + + // If input execution space can access HostSpace then keep it. + // Example: Kokkos::OpenMP can access, Kokkos::Cuda cannot + enum { keep_exe = Kokkos::Impl::MemorySpaceAccess + < typename S::execution_space::memory_space , Kokkos::HostSpace > + ::accessible }; + + // If HostSpace can access memory space then keep it. + // Example: Cannot access Kokkos::CudaSpace, can access Kokkos::CudaUVMSpace + enum { keep_mem = Kokkos::Impl::MemorySpaceAccess + < Kokkos::HostSpace , typename S::memory_space >::accessible }; + +public: + + typedef typename std::conditional + < keep_exe && keep_mem /* Can keep whole space */ + , S + , typename std::conditional + < keep_mem /* Can keep memory space, use default Host execution space */ + , Kokkos::Device< Kokkos::HostSpace::execution_space + , typename S::memory_space > + , Kokkos::HostSpace + >::type + >::type Space ; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { namespace Impl { template<> @@ -245,7 +288,6 @@ public: }; } // namespace Impl -} // namespace Experimental } // namespace Kokkos //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp index c77c33703b..8ffbc8bb03 100644 --- a/lib/kokkos/core/src/Kokkos_Layout.hpp +++ b/lib/kokkos/core/src/Kokkos_Layout.hpp @@ -82,7 +82,7 @@ struct LayoutLeft { LayoutLeft & operator = ( LayoutLeft && ) = default ; KOKKOS_INLINE_FUNCTION - constexpr + explicit constexpr LayoutLeft( size_t N0 = 0 , size_t N1 = 0 , size_t N2 = 0 , size_t N3 = 0 , size_t N4 = 0 , 
size_t N5 = 0 , size_t N6 = 0 , size_t N7 = 0 ) : dimension { N0 , N1 , N2 , N3 , N4 , N5 , N6 , N7 } {} @@ -114,7 +114,7 @@ struct LayoutRight { LayoutRight & operator = ( LayoutRight && ) = default ; KOKKOS_INLINE_FUNCTION - constexpr + explicit constexpr LayoutRight( size_t N0 = 0 , size_t N1 = 0 , size_t N2 = 0 , size_t N3 = 0 , size_t N4 = 0 , size_t N5 = 0 , size_t N6 = 0 , size_t N7 = 0 ) : dimension { N0 , N1 , N2 , N3 , N4 , N5 , N6 , N7 } {} @@ -132,6 +132,11 @@ struct LayoutStride { size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ; size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ; + LayoutStride( LayoutStride const & ) = default ; + LayoutStride( LayoutStride && ) = default ; + LayoutStride & operator = ( LayoutStride const & ) = default ; + LayoutStride & operator = ( LayoutStride && ) = default ; + /** \brief Compute strides from ordered dimensions. * * Values of order uniquely form the set [0..rank) @@ -164,7 +169,8 @@ struct LayoutStride { return tmp ; } - KOKKOS_INLINE_FUNCTION constexpr + KOKKOS_INLINE_FUNCTION + explicit constexpr LayoutStride( size_t N0 = 0 , size_t S0 = 0 , size_t N1 = 0 , size_t S1 = 0 , size_t N2 = 0 , size_t S2 = 0 @@ -220,7 +226,7 @@ struct LayoutTileLeft { LayoutTileLeft & operator = ( LayoutTileLeft && ) = default ; KOKKOS_INLINE_FUNCTION - constexpr + explicit constexpr LayoutTileLeft( size_t argN0 = 0 , size_t argN1 = 0 , size_t argN2 = 0 , size_t argN3 = 0 , size_t argN4 = 0 , size_t argN5 = 0 , size_t argN6 = 0 , size_t argN7 = 0 ) diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp index 7d1e59af5e..fbe699deb8 100644 --- a/lib/kokkos/core/src/Kokkos_Macros.hpp +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -114,11 +114,11 @@ #error "#include did not define CUDA_VERSION" #endif -#if ( CUDA_VERSION < 6050 ) -// CUDA supports (inofficially) C++11 in device code starting with -// version 6.5. 
This includes auto type and device code internal +#if ( CUDA_VERSION < 7000 ) +// CUDA supports C++11 in device code starting with +// version 7.0. This includes auto type and device code internal // lambdas. -#error "Cuda version 6.5 or greater required" +#error "Cuda version 7.0 or greater required" #endif #if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 ) @@ -127,16 +127,19 @@ #endif #ifdef KOKKOS_CUDA_USE_LAMBDA -#if ( CUDA_VERSION < 7000 ) -// CUDA supports C++11 lambdas generated in host code to be given -// to the device starting with version 7.5. But the release candidate (7.5.6) -// still identifies as 7.0 -#error "Cuda version 7.5 or greater required for host-to-device Lambda support" +#if ( CUDA_VERSION < 7050 ) + // CUDA supports C++11 lambdas generated in host code to be given + // to the device starting with version 7.5. But the release candidate (7.5.6) + // still identifies as 7.0 + #error "Cuda version 7.5 or greater required for host-to-device Lambda support" #endif -#if ( CUDA_VERSION < 8000 ) -#define KOKKOS_LAMBDA [=]__device__ +#if ( CUDA_VERSION < 8000 ) && defined(__NVCC__) + #define KOKKOS_LAMBDA [=]__device__ #else -#define KOKKOS_LAMBDA [=]__host__ __device__ + #define KOKKOS_LAMBDA [=]__host__ __device__ + #if defined( KOKKOS_HAVE_CXX1Z ) + #define KOKKOS_CLASS_LAMBDA [=,*this] __host__ __device__ + #endif #endif #define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1 #endif @@ -145,7 +148,7 @@ #if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) // Cuda version 8.0 still needs the functor wrapper - #if (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ ) + #if (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ ) && defined(__NVCC__) #define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER #endif #endif @@ -153,13 +156,12 @@ /*--------------------------------------------------------------------------*/ /* Language info: C++, CUDA, OPENMP */ -#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA ) +#if defined( KOKKOS_HAVE_CUDA ) // 
Compiling Cuda code to 'ptx' #define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ #define KOKKOS_INLINE_FUNCTION __device__ __host__ inline #define KOKKOS_FUNCTION __device__ __host__ - #endif /* #if defined( __CUDA_ARCH__ ) */ #if defined( _OPENMP ) @@ -184,10 +186,12 @@ #else #if defined( KOKKOS_HAVE_CXX11 ) && ! defined( KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA ) + #if !defined (KOKKOS_HAVE_CUDA) // Compiling with clang for Cuda does not work with LAMBDAs either // CUDA (including version 6.5) does not support giving lambdas as // arguments to global functions. Thus its not currently possible // to dispatch lambdas from the host. #define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1 + #endif #endif #endif /* #if defined( __NVCC__ ) */ @@ -195,7 +199,11 @@ #define KOKKOS_LAMBDA [=] #endif -#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */ +#if defined( KOKKOS_HAVE_CXX1Z ) && !defined (KOKKOS_CLASS_LAMBDA) + #define KOKKOS_CLASS_LAMBDA [=,*this] +#endif + +//#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */ /* Intel compiler for host code */ @@ -243,7 +251,7 @@ #endif #endif -#endif /* #if ! defined( __CUDA_ARCH__ ) */ +//#endif /* #if ! defined( __CUDA_ARCH__ ) */ /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ @@ -257,6 +265,20 @@ #define KOKKOS_HAVE_PRAGMA_VECTOR 1 #define KOKKOS_HAVE_PRAGMA_SIMD 1 + #define KOKKOS_RESTRICT __restrict__ + + #ifndef KOKKOS_ALIGN + #define KOKKOS_ALIGN(size) __attribute__((aligned(size))) + #endif + + #ifndef KOKKOS_ALIGN_PTR + #define KOKKOS_ALIGN_PTR(size) __attribute__((align_value(size))) + #endif + + #ifndef KOKKOS_ALIGN_SIZE + #define KOKKOS_ALIGN_SIZE 64 + #endif + #if ( 1400 > KOKKOS_COMPILER_INTEL ) #if ( 1300 > KOKKOS_COMPILER_INTEL ) #error "Compiling with Intel version earlier than 13.0 is not supported. Official minimal version is 14.0." 
@@ -264,11 +286,11 @@ #warning "Compiling with Intel version 13.x probably works but is not officially supported. Official minimal version is 14.0." #endif #endif - #if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_ENABLE_ASM ) && ! defined( _WIN32 ) + #if ! defined( KOKKOS_ENABLE_ASM ) && ! defined( _WIN32 ) #define KOKKOS_ENABLE_ASM 1 #endif - #if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_FORCEINLINE_FUNCTION ) + #if ! defined( KOKKOS_FORCEINLINE_FUNCTION ) #if !defined (_WIN32) #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline)) #else @@ -335,14 +357,11 @@ #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline)) #endif - #if ! defined( KOKKOS_ENABLE_ASM ) && \ - ! ( defined( __powerpc) || \ - defined(__powerpc__) || \ - defined(__powerpc64__) || \ - defined(__POWERPC__) || \ - defined(__ppc__) || \ - defined(__ppc64__) || \ - defined(__PGIC__) ) + #if ! defined( KOKKOS_ENABLE_ASM ) && ! defined( __PGIC__ ) && \ + ( defined( __amd64 ) || \ + defined( __amd64__ ) || \ + defined( __x86_64 ) || \ + defined( __x86_64__ ) ) #define KOKKOS_ENABLE_ASM 1 #endif @@ -385,10 +404,30 @@ #define KOKKOS_FUNCTION /**/ #endif + +//---------------------------------------------------------------------------- +///** Define empty macro for restrict if necessary: */ + +#if ! defined(KOKKOS_RESTRICT) +#define KOKKOS_RESTRICT +#endif + //---------------------------------------------------------------------------- /** Define Macro for alignment: */ +#if ! defined KOKKOS_ALIGN_SIZE +#define KOKKOS_ALIGN_SIZE 16 +#endif + +#if ! defined(KOKKOS_ALIGN) +#define KOKKOS_ALIGN(size) __attribute__((aligned(size))) +#endif + +#if ! defined(KOKKOS_ALIGN_PTR) +#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size))) +#endif + #if ! 
defined(KOKKOS_ALIGN_16) -#define KOKKOS_ALIGN_16 __attribute__((aligned(16))) +#define KOKKOS_ALIGN_16 KOKKOS_ALIGN(16) #endif //---------------------------------------------------------------------------- @@ -456,10 +495,6 @@ * are no longer supported. */ -#if defined( KOKKOS_USING_DEPRECATED_VIEW ) -#error "Kokkos deprecated View has been removed" -#endif - #define KOKKOS_USING_EXP_VIEW 1 #define KOKKOS_USING_EXPERIMENTAL_VIEW diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp index d843f7c9a1..e4f895b7d3 100644 --- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp +++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp @@ -49,7 +49,7 @@ #include #include #include -#include +#include #include #include @@ -70,12 +70,6 @@ //#define KOKKOS_MEMPOOL_PRINT_PAGE_INFO //#define KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO -// A superblock is considered full when this percentage of its pages are full. -#define KOKKOS_MEMPOOL_SB_FULL_FRACTION 0.80 - -// A page is considered full when this percentage of its blocks are full. 
-#define KOKKOS_MEMPOOL_PAGE_FULL_FRACTION 0.875 // 28 / 32 - //---------------------------------------------------------------------------- namespace Kokkos { @@ -128,7 +122,7 @@ struct bitset_count { dst += src; } KOKKOS_INLINE_FUNCTION - void operator()( size_type i, value_type & count) const + void operator()( size_type i, value_type & count ) const { count += Kokkos::Impl::bit_count( m_words[i] ); } @@ -183,7 +177,7 @@ public: size_type count() const { - size_type val; + size_type val = 0; bitset_count< Bitset > bc( m_words, m_num_words, val ); return val; } @@ -232,6 +226,20 @@ public: return atomic_fetch_and( &m_words[ word_pos ], ~mask ) & mask; } + KOKKOS_FORCEINLINE_FUNCTION + Kokkos::pair< bool, word_type > + fetch_word_set( size_type i ) const + { + size_type word_pos = i >> LG_WORD_SIZE; + word_type mask = word_type(1) << ( i & WORD_MASK ); + + Kokkos::pair result; + result.second = atomic_fetch_or( &m_words[ word_pos ], mask ); + result.first = !( result.second & mask ); + + return result; + } + KOKKOS_FORCEINLINE_FUNCTION Kokkos::pair< bool, word_type > fetch_word_reset( size_type i ) const @@ -247,12 +255,10 @@ public: } KOKKOS_FORCEINLINE_FUNCTION - Kokkos::pair< bool, size_type > - set_any_in_word( size_type i, word_type & prev_val ) const + Kokkos::pair< bool, word_type > + set_any_in_word( size_type & pos ) const { - prev_val = 0; - - size_type word_pos = i >> LG_WORD_SIZE; + size_type word_pos = pos >> LG_WORD_SIZE; word_type word = volatile_load( &m_words[ word_pos ] ); // Loop until there are no more unset bits in the word. @@ -261,28 +267,26 @@ public: size_type bit = Kokkos::Impl::bit_scan_forward( ~word ); // Try to set the bit. - word_type mask = word_type(1) << bit; + word_type mask = word_type(1) << bit; word = atomic_fetch_or( &m_words[ word_pos ], mask ); if ( !( word & mask ) ) { // Successfully set the bit. 
- prev_val = word; + pos = ( word_pos << LG_WORD_SIZE ) + bit; - return Kokkos::pair( true, ( word_pos << LG_WORD_SIZE ) + bit ); + return Kokkos::pair( true, word ); } } // Didn't find a free bit in this word. - return Kokkos::pair( false, i ); + return Kokkos::pair( false, word_type(0) ); } KOKKOS_FORCEINLINE_FUNCTION - Kokkos::pair< bool, size_type > - set_any_in_word( size_type i, word_type & prev_val, word_type word_mask ) const + Kokkos::pair< bool, word_type > + set_any_in_word( size_type & pos, word_type word_mask ) const { - prev_val = 0; - - size_type word_pos = i >> LG_WORD_SIZE; + size_type word_pos = pos >> LG_WORD_SIZE; word_type word = volatile_load( &m_words[ word_pos ] ); word = ( ~word ) & word_mask; @@ -292,30 +296,28 @@ public: size_type bit = Kokkos::Impl::bit_scan_forward( word ); // Try to set the bit. - word_type mask = word_type(1) << bit; + word_type mask = word_type(1) << bit; word = atomic_fetch_or( &m_words[ word_pos ], mask ); if ( !( word & mask ) ) { // Successfully set the bit. - prev_val = word; + pos = ( word_pos << LG_WORD_SIZE ) + bit; - return Kokkos::pair( true, ( word_pos << LG_WORD_SIZE ) + bit ); + return Kokkos::pair( true, word ); } word = ( ~word ) & word_mask; } // Didn't find a free bit in this word. - return Kokkos::pair( false, i ); + return Kokkos::pair( false, word_type(0) ); } KOKKOS_FORCEINLINE_FUNCTION - Kokkos::pair< bool, size_type > - reset_any_in_word( size_type i, word_type & prev_val ) const + Kokkos::pair< bool, word_type > + reset_any_in_word( size_type & pos ) const { - prev_val = 0; - - size_type word_pos = i >> LG_WORD_SIZE; + size_type word_pos = pos >> LG_WORD_SIZE; word_type word = volatile_load( &m_words[ word_pos ] ); // Loop until there are no more set bits in the word. @@ -324,28 +326,26 @@ public: size_type bit = Kokkos::Impl::bit_scan_forward( word ); // Try to reset the bit. 
- word_type mask = word_type(1) << bit; + word_type mask = word_type(1) << bit; word = atomic_fetch_and( &m_words[ word_pos ], ~mask ); if ( word & mask ) { // Successfully reset the bit. - prev_val = word; + pos = ( word_pos << LG_WORD_SIZE ) + bit; - return Kokkos::pair( true, ( word_pos << LG_WORD_SIZE ) + bit ); + return Kokkos::pair( true, word ); } } // Didn't find a free bit in this word. - return Kokkos::pair( false, i ); + return Kokkos::pair( false, word_type(0) ); } KOKKOS_FORCEINLINE_FUNCTION - Kokkos::pair< bool, size_type > - reset_any_in_word( size_type i, word_type & prev_val, word_type word_mask ) const + Kokkos::pair< bool, word_type > + reset_any_in_word( size_type & pos, word_type word_mask ) const { - prev_val = 0; - - size_type word_pos = i >> LG_WORD_SIZE; + size_type word_pos = pos >> LG_WORD_SIZE; word_type word = volatile_load( &m_words[ word_pos ] ); word = word & word_mask; @@ -355,21 +355,21 @@ public: size_type bit = Kokkos::Impl::bit_scan_forward( word ); // Try to reset the bit. - word_type mask = word_type(1) << bit; + word_type mask = word_type(1) << bit; word = atomic_fetch_and( &m_words[ word_pos ], ~mask ); if ( word & mask ) { // Successfully reset the bit. - prev_val = word; + pos = ( word_pos << LG_WORD_SIZE ) + bit; - return Kokkos::pair( true, ( word_pos << LG_WORD_SIZE ) + bit ); + return Kokkos::pair( true, word ); } word = word & word_mask; } // Didn't find a free bit in this word. 
- return Kokkos::pair( false, i ); + return Kokkos::pair( false, word_type(0) ); } }; @@ -442,7 +442,7 @@ struct create_histogram { total_allocated_blocks += page_allocated_blocks; - atomic_fetch_add( &m_page_histogram(page_allocated_blocks), 1 ); + atomic_increment( &m_page_histogram(page_allocated_blocks) ); } r.first += double(total_allocated_blocks) / blocks_per_sb; @@ -609,7 +609,7 @@ public: }; private: - typedef Impl::SharedAllocationTracker Tracker; + typedef Kokkos::Impl::SharedAllocationTracker Tracker; typedef View< uint32_t *, device_type > UInt32View; typedef View< SuperblockHeader *, device_type > SBHeaderView; @@ -726,11 +726,11 @@ public: // Allocate memory for Views. This is done here instead of at construction // so that the runtime checks can be performed before allocating memory. - resize(m_active, m_num_block_size ); - resize(m_sb_header, m_num_sb ); + resize( m_active, m_num_block_size ); + resize( m_sb_header, m_num_sb ); // Allocate superblock memory. - typedef Impl::SharedAllocationRecord< backend_memory_space, void > SharedRecord; + typedef Kokkos::Impl::SharedAllocationRecord< backend_memory_space, void > SharedRecord; SharedRecord * rec = SharedRecord::allocate( memspace, "mempool", m_total_size ); @@ -751,10 +751,15 @@ public: m_ceil_num_sb * m_num_block_size ); // Initialize all active superblocks to be invalid. - typename UInt32View::HostMirror host_active = create_mirror_view(m_active); - for (size_t i = 0; i < m_num_block_size; ++i) host_active(i) = INVALID_SUPERBLOCK; + typename UInt32View::HostMirror host_active = create_mirror_view( m_active ); + for ( size_t i = 0; i < m_num_block_size; ++i ) host_active(i) = INVALID_SUPERBLOCK; + deep_copy( m_active, host_active ); - deep_copy(m_active, host_active); + // A superblock is considered full when this percentage of its pages are full. + const double superblock_full_fraction = .8; + + // A page is considered full when this percentage of its blocks are full. 
+ const double page_full_fraction = .875; // Initialize the blocksize info. for ( size_t i = 0; i < m_num_block_size; ++i ) { @@ -767,7 +772,7 @@ public: // Set the full level for the superblock. m_blocksize_info[i].m_sb_full_level = - static_cast( pages_per_sb * KOKKOS_MEMPOOL_SB_FULL_FRACTION ); + static_cast( pages_per_sb * superblock_full_fraction ); if ( m_blocksize_info[i].m_sb_full_level == 0 ) { m_blocksize_info[i].m_sb_full_level = 1; @@ -778,7 +783,7 @@ public: blocks_per_sb < BLOCKS_PER_PAGE ? blocks_per_sb : BLOCKS_PER_PAGE; m_blocksize_info[i].m_page_full_level = - static_cast( blocks_per_page * KOKKOS_MEMPOOL_PAGE_FULL_FRACTION ); + static_cast( blocks_per_page * page_full_fraction ); if ( m_blocksize_info[i].m_page_full_level == 0 ) { m_blocksize_info[i].m_page_full_level = 1; @@ -820,7 +825,7 @@ public: /// \brief The actual block size allocated given alloc_size. KOKKOS_INLINE_FUNCTION size_t allocate_block_size( const size_t alloc_size ) const - { return size_t(1) << ( get_block_size_index( alloc_size ) + LG_MIN_BLOCK_SIZE); } + { return size_t(1) << ( get_block_size_index( alloc_size ) + LG_MIN_BLOCK_SIZE ); } /// \brief Allocate a chunk of memory. /// \param alloc_size Size of the requested allocated in number of bytes. @@ -834,27 +839,41 @@ public: // Only support allocations up to the superblock size. Just return 0 // (failed allocation) for any size above this. - if (alloc_size <= m_sb_size ) + if ( alloc_size <= m_sb_size ) { int block_size_id = get_block_size_index( alloc_size ); uint32_t blocks_per_sb = m_blocksize_info[block_size_id].m_blocks_per_sb; uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb; + +#ifdef KOKKOS_CUDA_CLANG_WORKAROUND + // Without this test it looks like pages_per_sb might come back wrong. + if ( pages_per_sb == 0 ) return NULL; +#endif + unsigned word_size = blocks_per_sb > 32 ? 
32 : blocks_per_sb; unsigned word_mask = ( uint64_t(1) << word_size ) - 1; + // Instead of forcing an atomic read to guarantee the updated value, + // reading the old value is actually beneficial because more threads will + // attempt allocations on the old active superblock instead of waiting on + // the new active superblock. This will help hide the latency of + // switching the active superblock. uint32_t sb_id = volatile_load( &m_active(block_size_id) ); - // If the active is locked, keep reading it until the lock is released. + // If the active is locked, keep reading it atomically until the lock is + // released. while ( sb_id == SUPERBLOCK_LOCK ) { - sb_id = volatile_load( &m_active(block_size_id) ); + sb_id = atomic_fetch_or( &m_active(block_size_id), uint32_t(0) ); } + load_fence(); + bool allocation_done = false; - while (!allocation_done) { + while ( !allocation_done ) { bool need_new_sb = false; - if (sb_id != INVALID_SUPERBLOCK) { + if ( sb_id != INVALID_SUPERBLOCK ) { // Use the value from the clock register as the hash value. uint64_t hash_val = get_clock_register(); @@ -875,12 +894,11 @@ public: bool search_done = false; - while (!search_done) { - bool success; - unsigned prev_val; + while ( !search_done ) { + bool success = false; + unsigned prev_val = 0; - Kokkos::tie( success, pos ) = - m_sb_blocks.set_any_in_word( pos, prev_val, word_mask ); + Kokkos::tie( success, prev_val ) = m_sb_blocks.set_any_in_word( pos, word_mask ); if ( !success ) { if ( ++pages_searched >= pages_per_sb ) { @@ -905,6 +923,8 @@ public: } else { // Reserved a memory location to allocate. + memory_fence(); + search_done = true; allocation_done = true; @@ -918,7 +938,7 @@ public: if ( used_bits == 0 ) { // This page was empty. Decrement the number of empty pages for // the superblock. 
- atomic_fetch_sub( &m_sb_header(sb_id).m_empty_pages, 1 ); + atomic_decrement( &m_sb_header(sb_id).m_empty_pages ); } else if ( used_bits == m_blocksize_info[block_size_id].m_page_full_level - 1 ) { @@ -962,7 +982,7 @@ public: #ifdef KOKKOS_MEMPOOL_PRINT_INFO else { printf( "** Requested allocation size (%zu) larger than superblock size (%lu). **\n", - alloc_size, m_sb_size); + alloc_size, m_sb_size ); #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST fflush( stdout ); #endif @@ -997,8 +1017,10 @@ public: uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE; uint32_t pos_rel = offset >> lg_block_size; - bool success; - unsigned prev_val; + bool success = false; + unsigned prev_val = 0; + + memory_fence(); Kokkos::tie( success, prev_val ) = m_sb_blocks.fetch_word_reset( pos_base + pos_rel ); @@ -1023,7 +1045,7 @@ public: volatile_store( &m_sb_header(sb_id).m_empty_pages, uint32_t(0) ); volatile_store( &m_sb_header(sb_id).m_lg_block_size, uint32_t(0) ); - memory_fence(); + store_fence(); m_empty_sb.set( sb_id ); } @@ -1088,7 +1110,7 @@ public: printf( "\n" ); #ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO - typename SBHeaderView::HostMirror host_sb_header = create_mirror_view(m_sb_header); + typename SBHeaderView::HostMirror host_sb_header = create_mirror_view( m_sb_header ); deep_copy( host_sb_header, m_sb_header ); UInt32View num_allocated_blocks( "Allocated Blocks", m_num_sb ); @@ -1101,7 +1123,7 @@ public: } typename UInt32View::HostMirror host_num_allocated_blocks = - create_mirror_view(num_allocated_blocks); + create_mirror_view( num_allocated_blocks ); deep_copy( host_num_allocated_blocks, num_allocated_blocks ); // Print header info of all superblocks. 
@@ -1135,7 +1157,7 @@ public: m_lg_max_sb_blocks, LG_MIN_BLOCK_SIZE, BLOCKS_PER_PAGE, result ); } - typename UInt32View::HostMirror host_page_histogram = create_mirror_view(page_histogram); + typename UInt32View::HostMirror host_page_histogram = create_mirror_view( page_histogram ); deep_copy( host_page_histogram, page_histogram ); // Find the used and total pages and blocks. @@ -1158,8 +1180,8 @@ public: double percent_used_blocks = total_blocks == 0 ? 0.0 : double(used_blocks) / total_blocks; // Count active superblocks. - typename UInt32View::HostMirror host_active = create_mirror_view(m_active); - deep_copy(host_active, m_active); + typename UInt32View::HostMirror host_active = create_mirror_view( m_active ); + deep_copy( host_active, m_active ); unsigned num_active_sb = 0; for ( size_t i = 0; i < m_num_block_size; ++i ) { @@ -1224,6 +1246,7 @@ public: // Print the blocks used for each page of a few individual superblocks. for ( uint32_t i = 0; i < num_sb_id; ++i ) { uint32_t lg_block_size = host_sb_header(sb_id[i]).m_lg_block_size; + if ( lg_block_size != 0 ) { printf( "SB_ID BLOCK ID USED_BLOCKS\n" ); @@ -1249,16 +1272,16 @@ public: #endif printf( " Used blocks: %10u / %10u = %10.6lf\n", used_blocks, total_blocks, - percent_used_blocks ); + percent_used_blocks ); printf( " Used pages: %10u / %10u = %10.6lf\n", used_pages, total_pages, - percent_used_pages ); + percent_used_pages ); printf( " Used SB: %10zu / %10zu = %10.6lf\n", m_num_sb - num_empty_sb, m_num_sb, - percent_used_sb ); + percent_used_sb ); printf( " Active SB: %10u\n", num_active_sb ); printf( " Empty SB: %10u\n", num_empty_sb ); printf( " Partfull SB: %10u\n", num_partfull_sb ); printf( " Full SB: %10lu\n", - m_num_sb - num_active_sb - num_empty_sb - num_partfull_sb ); + m_num_sb - num_active_sb - num_empty_sb - num_partfull_sb ); printf( "Ave. 
SB Full %%: %10.6lf\n", ave_sb_full ); printf( "\n" ); fflush( stdout ); @@ -1316,6 +1339,8 @@ private: uint32_t lock_sb = Kokkos::atomic_compare_exchange( &m_active(block_size_id), old_sb, SUPERBLOCK_LOCK ); + load_fence(); + // Initialize the new superblock to be the previous one so the previous // superblock is returned if a new superblock can't be found. uint32_t new_sb = lock_sb; @@ -1334,11 +1359,11 @@ private: // size's bitset. unsigned pos = block_size_id * m_ceil_num_sb; - while (!search_done) { + while ( !search_done ) { bool success = false; - unsigned prev_val; + unsigned prev_val = 0; - Kokkos::tie( success, pos ) = m_partfull_sb.reset_any_in_word( pos, prev_val ); + Kokkos::tie( success, prev_val ) = m_partfull_sb.reset_any_in_word( pos ); if ( !success ) { if ( ++tries >= max_tries ) { @@ -1351,22 +1376,21 @@ private: } else { // Found a superblock. + + // It is possible that the newly found superblock is the same as the + // old superblock. In this case putting the old value back in yields + // correct behavior. This could happen as follows. This thread + // grabs the lock and transitions the superblock to the full state. + // Before it searches for a new superblock, other threads perform + // enough deallocations to transition the superblock to the partially + // full state. This thread then searches for a partially full + // superblock and finds the one it removed. There's potential for + // this to cause a performance issue if the same superblock keeps + // being removed and added due to the right mix and ordering of + // allocations and deallocations. search_done = true; new_sb = pos - block_size_id * m_ceil_num_sb; - // Assertions: - // 1. A different superblock than the current should be found. 
-#ifdef KOKKOS_MEMPOOL_PRINTERR - if ( new_sb == lock_sb ) { - printf( "\n** MemoryPool::find_superblock() FOUND_SAME_SUPERBLOCK: %u **\n", - new_sb); -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - fflush( stdout ); -#endif - Kokkos::abort( "" ); - } -#endif - // Set the head status for the superblock. volatile_store( &m_sb_header(new_sb).m_is_active, uint32_t(true) ); @@ -1376,7 +1400,7 @@ private: volatile_store( &m_sb_header(lock_sb).m_is_active, uint32_t(false) ); } - memory_fence(); + store_fence(); } } @@ -1389,11 +1413,11 @@ private: // size's bitset. pos = 0; - while (!search_done) { + while ( !search_done ) { bool success = false; - unsigned prev_val; + unsigned prev_val = 0; - Kokkos::tie( success, pos ) = m_empty_sb.reset_any_in_word( pos, prev_val ); + Kokkos::tie( success, prev_val ) = m_empty_sb.reset_any_in_word( pos ); if ( !success ) { if ( ++tries >= max_tries ) { @@ -1406,22 +1430,22 @@ private: } else { // Found a superblock. + + // It is possible that the newly found superblock is the same as + // the old superblock. In this case putting the old value back in + // yields correct behavior. This could happen as follows. This + // thread grabs the lock and transitions the superblock to the full + // state. Before it searches for a new superblock, other threads + // perform enough deallocations to transition the superblock to the + // partially full state and then the empty state. This thread then + // searches for a partially full superblock and none exist. This + // thread then searches for an empty superblock and finds the one + // it removed. The likelihood of this happening is so remote that + // the potential for this to cause a performance issue is + // infinitesimal. search_done = true; new_sb = pos; - // Assertions: - // 1. A different superblock than the current should be found. 
-#ifdef KOKKOS_MEMPOOL_PRINTERR - if ( new_sb == lock_sb ) { - printf( "\n** MemoryPool::find_superblock() FOUND_SAME_SUPERBLOCK: %u **\n", - new_sb); -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - fflush( stdout ); -#endif - Kokkos::abort( "" ); - } -#endif - // Set the empty pages, block size, and head status for the // superblock. volatile_store( &m_sb_header(new_sb).m_empty_pages, @@ -1436,7 +1460,7 @@ private: volatile_store( &m_sb_header(lock_sb).m_is_active, uint32_t(false) ); } - memory_fence(); + store_fence(); } } } @@ -1445,14 +1469,17 @@ private: atomic_exchange( &m_active(block_size_id), new_sb ); } else { - // Either another thread has the lock and is switching the active superblock for - // this block size or another thread has already changed the active superblock - // since this thread read its value. Keep reading the active superblock until - // it isn't locked to get the new active superblock. + // Either another thread has the lock and is switching the active + // superblock for this block size or another thread has already changed + // the active superblock since this thread read its value. Keep + // atomically reading the active superblock until it isn't locked to get + // the new active superblock. do { - new_sb = volatile_load( &m_active(block_size_id) ); + new_sb = atomic_fetch_or( &m_active(block_size_id), uint32_t(0) ); } while ( new_sb == SUPERBLOCK_LOCK ); + load_fence(); + // Assertions: // 1. An invalid superblock should never be found here. // 2. If the new superblock is the same as the previous superblock, the @@ -1477,14 +1504,25 @@ private: { #if defined( __CUDA_ARCH__ ) // Return value of 64-bit hi-res clock register. - return clock64(); + return clock64(); #elif defined( __i386__ ) || defined( __x86_64 ) // Return value of 64-bit hi-res clock register. 
- unsigned a, d; - __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); - return ( (uint64_t) a) | ( ( (uint64_t) d ) << 32 ); + unsigned a = 0, d = 0; + + __asm__ volatile( "rdtsc" : "=a" (a), "=d" (d) ); + + return ( (uint64_t) a ) | ( ( (uint64_t) d ) << 32 ); +#elif defined( __powerpc ) || defined( __powerpc__ ) || defined( __powerpc64__ ) || \ + defined( __POWERPC__ ) || defined( __ppc__ ) || defined( __ppc64__ ) + unsigned int cycles = 0; + + asm volatile( "mftb %0" : "=r" (cycles) ); + + return (uint64_t) cycles; #else - const uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + const uint64_t ticks = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); + return ticks; #endif } @@ -1517,7 +1555,4 @@ private: #undef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO #endif -#undef KOKKOS_MEMPOOL_SB_FULL_FRACTION -#undef KOKKOS_MEMPOOL_PAGE_FULL_FRACTION - #endif // KOKKOS_MEMORYPOOL_HPP diff --git a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp index 5ee1f16fec..94b58b8aff 100644 --- a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp +++ b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp @@ -63,6 +63,8 @@ enum MemoryTraitsFlags { Unmanaged = 0x01 , RandomAccess = 0x02 , Atomic = 0x04 + , Restrict = 0x08 + , Aligned = 0x10 }; template < unsigned T > @@ -73,6 +75,8 @@ struct MemoryTraits { enum { Unmanaged = T & unsigned(Kokkos::Unmanaged) }; enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) }; enum { Atomic = T & unsigned(Kokkos::Atomic) }; + enum { Restrict = T & unsigned(Kokkos::Restrict) }; + enum { Aligned = T & unsigned(Kokkos::Aligned) }; }; diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp index 7be4f8245f..0e6c6d84fe 100644 --- a/lib/kokkos/core/src/Kokkos_OpenMP.hpp +++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp @@ -58,7 +58,7 @@ #endif #include #include -#include +#include #include #include @@ -160,6 +160,17 @@ public: namespace 
Kokkos { namespace Impl { +template<> +struct MemorySpaceAccess + < Kokkos::OpenMP::memory_space + , Kokkos::OpenMP::scratch_memory_space + > +{ + enum { assignable = false }; + enum { accessible = true }; + enum { deepcopy = false }; +}; + template<> struct VerifyExecutionCanAccessMemorySpace < Kokkos::OpenMP::memory_space diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp index 695bc79a1a..3a73e8a817 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -53,7 +53,8 @@ struct is_reducer_type { template struct is_reducer_type::value + std::is_same::type, + typename std::remove_cv::type>::value >::type> { enum { value = 1 }; }; @@ -726,6 +727,119 @@ public: } }; +template +struct MinMaxScalar { + Scalar min_val,max_val; + + KOKKOS_INLINE_FUNCTION + void operator = (const MinMaxScalar& rhs) { + min_val = rhs.min_val; + max_val = rhs.max_val; + } + + KOKKOS_INLINE_FUNCTION + void operator = (const volatile MinMaxScalar& rhs) volatile { + min_val = rhs.min_val; + max_val = rhs.max_val; + } +}; + +template +struct MinMax { +private: + typedef typename std::remove_cv::type scalar_type; + +public: + //Required + typedef MinMax reducer_type; + typedef MinMaxScalar value_type; + + typedef Kokkos::View > result_view_type; + + scalar_type min_init_value; + scalar_type max_init_value; + +private: + result_view_type result; + + template::value > + struct MinInitWrapper; + + template + struct MinInitWrapper { + static ValueType value() { + return std::numeric_limits::max(); + } + }; + + template + struct MinInitWrapper { + static ValueType value() { + return scalar_type(); + } + }; + + template::value > + struct MaxInitWrapper; + + template + struct MaxInitWrapper { + static ValueType value() { + return std::numeric_limits::min(); + } + }; + + template + struct MaxInitWrapper { + static ValueType value() { + return scalar_type(); + } + }; + +public: + + 
MinMax(value_type& result_): + min_init_value(MinInitWrapper::value()),max_init_value(MaxInitWrapper::value()),result(&result_) {} + MinMax(const result_view_type& result_): + min_init_value(MinInitWrapper::value()),max_init_value(MaxInitWrapper::value()),result(result_) {} + MinMax(value_type& result_, const scalar_type& min_init_value_, const scalar_type& max_init_value_): + min_init_value(min_init_value_),max_init_value(max_init_value_),result(&result_) {} + MinMax(const result_view_type& result_, const scalar_type& min_init_value_, const scalar_type& max_init_value_): + min_init_value(min_init_value_),max_init_value(max_init_value_),result(result_) {} + + //Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + if ( src.min_val < dest.min_val ) { + dest.min_val = src.min_val; + } + if ( src.max_val > dest.max_val ) { + dest.max_val = src.max_val; + } + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + if ( src.min_val < dest.min_val ) { + dest.min_val = src.min_val; + } + if ( src.max_val > dest.max_val ) { + dest.max_val = src.max_val; + } + } + + //Optional + KOKKOS_INLINE_FUNCTION + void init( value_type& val) const { + val.min_val = min_init_value; + val.max_val = max_init_value; + } + + result_view_type result_view() const { + return result; + } +}; + template struct MinMaxLocScalar { Scalar min_val,max_val; @@ -1124,7 +1238,8 @@ void parallel_reduce(const PolicyType& policy, typename Impl::enable_if< Kokkos::Impl::is_execution_policy::value >::type * = 0) { - Impl::ParallelReduceAdaptor::execute("",policy,functor,return_value); + ReturnType return_value_impl = return_value; + Impl::ParallelReduceAdaptor::execute("",policy,functor,return_value_impl); } template< class FunctorType, class ReturnType > @@ -1133,8 +1248,8 @@ void parallel_reduce(const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { typedef typename 
Impl::ParallelReducePolicyType::policy_type policy_type; - - Impl::ParallelReduceAdaptor::execute("",policy_type(0,policy),functor,return_value); + ReturnType return_value_impl = return_value; + Impl::ParallelReduceAdaptor::execute("",policy_type(0,policy),functor,return_value_impl); } template< class FunctorType, class ReturnType > @@ -1144,7 +1259,8 @@ void parallel_reduce(const std::string& label, const FunctorType& functor, const ReturnType& return_value) { typedef typename Impl::ParallelReducePolicyType::policy_type policy_type; - Impl::ParallelReduceAdaptor::execute(label,policy_type(0,policy),functor,return_value); + ReturnType return_value_impl = return_value; + Impl::ParallelReduceAdaptor::execute(label,policy_type(0,policy),functor,return_value_impl); } // No Return Argument diff --git a/lib/kokkos/core/src/Kokkos_Qthread.hpp b/lib/kokkos/core/src/Kokkos_Qthread.hpp index d61f8d518e..c58518b065 100644 --- a/lib/kokkos/core/src/Kokkos_Qthread.hpp +++ b/lib/kokkos/core/src/Kokkos_Qthread.hpp @@ -144,6 +144,17 @@ public: namespace Kokkos { namespace Impl { +template<> +struct MemorySpaceAccess + < Kokkos::Qthread::memory_space + , Kokkos::Qthread::scratch_memory_space + > +{ + enum { assignable = false }; + enum { accessible = true }; + enum { deepcopy = false }; +}; + template<> struct VerifyExecutionCanAccessMemorySpace < Kokkos::Qthread::memory_space diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp index 233b56c939..914edbc7c4 100644 --- a/lib/kokkos/core/src/Kokkos_Serial.hpp +++ b/lib/kokkos/core/src/Kokkos_Serial.hpp @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #include #include @@ -59,7 +59,6 @@ #include #include - #include #if defined( KOKKOS_HAVE_SERIAL ) @@ -192,6 +191,17 @@ public: namespace Kokkos { namespace Impl { +template<> +struct MemorySpaceAccess + < Kokkos::Serial::memory_space + , Kokkos::Serial::scratch_memory_space + > +{ + enum { assignable = false }; + enum { 
accessible = true }; + enum { deepcopy = false }; +}; + template<> struct VerifyExecutionCanAccessMemorySpace < Kokkos::Serial::memory_space @@ -250,7 +260,6 @@ public: const scratch_memory_space & thread_scratch(int) const { return m_space ; } - KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; } KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; } KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; } @@ -306,10 +315,9 @@ public: } // namespace Impl - /* * < Kokkos::Serial , WorkArgTag > - * < WorkArgTag , Impl::enable_if< Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type > + * < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type > * */ namespace Impl { @@ -402,7 +410,6 @@ public: , m_chunk_size ( 32 ) {} - inline int chunk_size() const { return m_chunk_size ; } /** \brief set chunk_size to a discrete value*/ @@ -525,7 +532,6 @@ private: const ReducerType m_reducer ; const pointer_type m_result_ptr ; - template< class TagType > inline typename std::enable_if< std::is_same< TagType , void >::value >::type @@ -895,20 +901,22 @@ struct TeamThreadRangeBoundariesStruct { } // namespace Impl -template +template< typename iType > KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & count ) { - return Impl::TeamThreadRangeBoundariesStruct(thread,count); + return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SerialTeamMember >( thread, count ); } -template +template< typename iType1, typename iType2 > KOKKOS_INLINE_FUNCTION -Impl::TeamThreadRangeBoundariesStruct -TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & begin , const iType & end ) +Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type, + Impl::SerialTeamMember > +TeamThreadRange( const Impl::SerialTeamMember& thread, const iType1 & begin, const 
iType2 & end ) { - return Impl::TeamThreadRangeBoundariesStruct(thread,begin,end); + typedef typename std::common_type< iType1, iType2 >::type iType; + return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SerialTeamMember >( thread, iType(begin), iType(end) ); } template @@ -1113,4 +1121,3 @@ void single(const Impl::ThreadSingleStruct& , const Func //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- - diff --git a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp index fc9113b750..05ed5103b8 100644 --- a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp @@ -41,1069 +41,7 @@ //@HEADER */ -// Experimental unified task-data parallel manycore LDRD +// For backward compatibility: -#ifndef KOKKOS_TASKPOLICY_HPP -#define KOKKOS_TASKPOLICY_HPP - -//---------------------------------------------------------------------------- - -#include - -// If compiling with CUDA then must be using CUDA 8 or better -// and use relocateable device code to enable the task policy. 
-// nvcc relocatable device code option: --relocatable-device-code=true - -#if ( defined( KOKKOS_COMPILER_NVCC ) ) - #if ( 8000 <= CUDA_VERSION ) && \ - defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE ) - - #define KOKKOS_ENABLE_TASKPOLICY - - #endif -#else - -#define KOKKOS_ENABLE_TASKPOLICY - -#endif - - -#if defined( KOKKOS_ENABLE_TASKPOLICY ) - -//---------------------------------------------------------------------------- - -#include -#include -#include - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -enum TaskType { TaskTeam = Impl::TaskBase::TaskTeam - , TaskSingle = Impl::TaskBase::TaskSingle }; - -enum TaskPriority { TaskHighPriority = 0 - , TaskRegularPriority = 1 - , TaskLowPriority = 2 }; - -template< typename Space > -class TaskPolicy ; - -template< typename Space > -void wait( TaskPolicy< Space > const & ); - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/*\brief Implementation data for task data management, access, and execution. - * - * CRTP Inheritance structure to allow static_cast from the - * task root type and a task's FunctorType. - * - * TaskBase< Space , ResultType , FunctorType > - * : TaskBase< Space , ResultType , void > - * , FunctorType - * { ... }; - * - * TaskBase< Space , ResultType , void > - * : TaskBase< Space , void , void > - * { ... 
}; - */ -template< typename Space , typename ResultType , typename FunctorType > -class TaskBase ; - -template< typename Space > -class TaskExec ; - -}} // namespace Kokkos::Impl - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -/** - * - * Future< space > // value_type == void - * Future< value > // space == Default - * Future< value , space > - * - */ -template< typename Arg1 /* = void */ , typename Arg2 /* = void */ > -class Future { -private: - - template< typename > friend class TaskPolicy ; - template< typename , typename > friend class Future ; - template< typename , typename , typename > friend class Impl::TaskBase ; - - enum { Arg1_is_space = Kokkos::Impl::is_space< Arg1 >::value }; - enum { Arg2_is_space = Kokkos::Impl::is_space< Arg2 >::value }; - enum { Arg1_is_value = ! Arg1_is_space && - ! std::is_same< Arg1 , void >::value }; - enum { Arg2_is_value = ! Arg2_is_space && - ! std::is_same< Arg2 , void >::value }; - - static_assert( ! ( Arg1_is_space && Arg2_is_space ) - , "Future cannot be given two spaces" ); - - static_assert( ! 
( Arg1_is_value && Arg2_is_value ) - , "Future cannot be given two value types" ); - - using ValueType = - typename std::conditional< Arg1_is_value , Arg1 , - typename std::conditional< Arg2_is_value , Arg2 , void - >::type >::type ; - - using Space = - typename std::conditional< Arg1_is_space , Arg1 , - typename std::conditional< Arg2_is_space , Arg2 , void - >::type >::type ; - - using task_base = Impl::TaskBase< Space , ValueType , void > ; - using queue_type = Impl::TaskQueue< Space > ; - - task_base * m_task ; - - KOKKOS_INLINE_FUNCTION explicit - Future( task_base * task ) : m_task(0) - { if ( task ) queue_type::assign( & m_task , task ); } - - //---------------------------------------- - -public: - - using execution_space = typename Space::execution_space ; - using value_type = ValueType ; - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - bool is_null() const { return 0 == m_task ; } - - KOKKOS_INLINE_FUNCTION - int reference_count() const - { return 0 != m_task ? 
m_task->reference_count() : 0 ; } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - ~Future() { if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - constexpr Future() noexcept : m_task(0) {} - - KOKKOS_INLINE_FUNCTION - Future( Future && rhs ) - : m_task( rhs.m_task ) { rhs.m_task = 0 ; } - - KOKKOS_INLINE_FUNCTION - Future( const Future & rhs ) - : m_task(0) - { if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); } - - KOKKOS_INLINE_FUNCTION - Future & operator = ( Future && rhs ) - { - if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); - m_task = rhs.m_task ; - rhs.m_task = 0 ; - return *this ; - } - - KOKKOS_INLINE_FUNCTION - Future & operator = ( const Future & rhs ) - { - if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); - return *this ; - } - - //---------------------------------------- - - template< class A1 , class A2 > - KOKKOS_INLINE_FUNCTION - Future( Future && rhs ) - : m_task( rhs.m_task ) - { - static_assert - ( std::is_same< Space , void >::value || - std::is_same< Space , typename Future::Space >::value - , "Assigned Futures must have the same space" ); - - static_assert - ( std::is_same< value_type , void >::value || - std::is_same< value_type , typename Future::value_type >::value - , "Assigned Futures must have the same value_type" ); - - rhs.m_task = 0 ; - } - - template< class A1 , class A2 > - KOKKOS_INLINE_FUNCTION - Future( const Future & rhs ) - : m_task(0) - { - static_assert - ( std::is_same< Space , void >::value || - std::is_same< Space , typename Future::Space >::value - , "Assigned Futures must have the same space" ); - - static_assert - ( std::is_same< value_type , void >::value || - std::is_same< value_type , typename Future::value_type >::value - , "Assigned Futures must have the same value_type" ); - - if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); - } - - 
template< class A1 , class A2 > - KOKKOS_INLINE_FUNCTION - Future & operator = ( const Future & rhs ) - { - static_assert - ( std::is_same< Space , void >::value || - std::is_same< Space , typename Future::Space >::value - , "Assigned Futures must have the same space" ); - - static_assert - ( std::is_same< value_type , void >::value || - std::is_same< value_type , typename Future::value_type >::value - , "Assigned Futures must have the same value_type" ); - - if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); - return *this ; - } - - template< class A1 , class A2 > - KOKKOS_INLINE_FUNCTION - Future & operator = ( Future && rhs ) - { - static_assert - ( std::is_same< Space , void >::value || - std::is_same< Space , typename Future::Space >::value - , "Assigned Futures must have the same space" ); - - static_assert - ( std::is_same< value_type , void >::value || - std::is_same< value_type , typename Future::value_type >::value - , "Assigned Futures must have the same value_type" ); - - if ( m_task ) queue_type::assign( & m_task , (task_base*) 0 ); - m_task = rhs.m_task ; - rhs.m_task = 0 ; - return *this ; - } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - typename task_base::get_return_type - get() const - { - if ( 0 == m_task ) { - Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()"); - } - return m_task->get(); - } -}; - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template< typename ExecSpace > -class TaskPolicy -{ -private: - - using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ; - using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ; - using task_base = Impl::TaskBase< ExecSpace , void , void > ; - - track_type m_track ; - queue_type * m_queue ; - - //---------------------------------------- - // Process optional arguments 
to spawn and respawn functions - - KOKKOS_INLINE_FUNCTION static - void assign( task_base * const ) {} - - // TaskTeam or TaskSingle - template< typename ... Options > - KOKKOS_INLINE_FUNCTION static - void assign( task_base * const task - , TaskType const & arg - , Options const & ... opts ) - { - task->m_task_type = arg ; - assign( task , opts ... ); - } - - // TaskHighPriority or TaskRegularPriority or TaskLowPriority - template< typename ... Options > - KOKKOS_INLINE_FUNCTION static - void assign( task_base * const task - , TaskPriority const & arg - , Options const & ... opts ) - { - task->m_priority = arg ; - assign( task , opts ... ); - } - - // Future for a dependence - template< typename A1 , typename A2 , typename ... Options > - KOKKOS_INLINE_FUNCTION static - void assign( task_base * const task - , Future< A1 , A2 > const & arg - , Options const & ... opts ) - { - // Assign dependence to task->m_next - // which will be processed within subsequent call to schedule. - // Error if the dependence is reset. - - if ( 0 != Kokkos::atomic_exchange(& task->m_next, arg.m_task) ) { - Kokkos::abort("TaskPolicy ERROR: resetting task dependence"); - } - - if ( 0 != arg.m_task ) { - // The future may be destroyed upon returning from this call - // so increment reference count to track this assignment. - Kokkos::atomic_fetch_add( &(arg.m_task->m_ref_count) , 1 ); - } - - assign( task , opts ... 
); - } - - //---------------------------------------- - -public: - - using execution_policy = TaskPolicy ; - using execution_space = ExecSpace ; - using memory_space = typename queue_type::memory_space ; - using member_type = Kokkos::Impl::TaskExec< ExecSpace > ; - - KOKKOS_INLINE_FUNCTION - TaskPolicy() : m_track(), m_queue(0) {} - - KOKKOS_INLINE_FUNCTION - TaskPolicy( TaskPolicy && rhs ) = default ; - - KOKKOS_INLINE_FUNCTION - TaskPolicy( TaskPolicy const & rhs ) = default ; - - KOKKOS_INLINE_FUNCTION - TaskPolicy & operator = ( TaskPolicy && rhs ) = default ; - - KOKKOS_INLINE_FUNCTION - TaskPolicy & operator = ( TaskPolicy const & rhs ) = default ; - - TaskPolicy( memory_space const & arg_memory_space - , unsigned const arg_memory_pool_capacity - , unsigned const arg_memory_pool_log2_superblock = 12 ) - : m_track() - , m_queue(0) - { - typedef Kokkos::Experimental::Impl::SharedAllocationRecord - < memory_space , typename queue_type::Destroy > - record_type ; - - record_type * record = - record_type::allocate( arg_memory_space - , "TaskQueue" - , sizeof(queue_type) - ); - - m_queue = new( record->data() ) - queue_type( arg_memory_space - , arg_memory_pool_capacity - , arg_memory_pool_log2_superblock ); - - record->m_destroy.m_queue = m_queue ; - - m_track.assign_allocated_record_to_uninitialized( record ); - } - - //---------------------------------------- - /**\brief Allocation size for a spawned task */ - template< typename FunctorType > - KOKKOS_FUNCTION - size_t spawn_allocation_size() const - { - using task_type = Impl::TaskBase< execution_space - , typename FunctorType::value_type - , FunctorType > ; - - return m_queue->allocate_block_size( sizeof(task_type) ); - } - - /**\brief Allocation size for a when_all aggregate */ - KOKKOS_FUNCTION - size_t when_all_allocation_size( int narg ) const - { - using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ; - - return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) ); 
- } - - //---------------------------------------- - - /**\brief A task spawns a task with options - * - * 1) High, Normal, or Low priority - * 2) With or without dependence - * 3) Team or Serial - */ - template< typename FunctorType , typename ... Options > - KOKKOS_FUNCTION - Future< typename FunctorType::value_type , ExecSpace > - task_spawn( FunctorType const & arg_functor - , Options const & ... arg_options - ) const - { - using value_type = typename FunctorType::value_type ; - using future_type = Future< value_type , execution_space > ; - using task_type = Impl::TaskBase< execution_space - , value_type - , FunctorType > ; - - //---------------------------------------- - // Give single-thread back-ends an opportunity to clear - // queue of ready tasks before allocating a new task - - m_queue->iff_single_thread_recursive_execute(); - - //---------------------------------------- - - future_type f ; - - // Allocate task from memory pool - f.m_task = - reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type))); - - if ( f.m_task ) { - - // Placement new construction - new ( f.m_task ) task_type( arg_functor ); - - // Reference count starts at two - // +1 for matching decrement when task is complete - // +1 for future - f.m_task->m_queue = m_queue ; - f.m_task->m_ref_count = 2 ; - f.m_task->m_alloc_size = sizeof(task_type); - - assign( f.m_task , arg_options... ); - - // Spawning from within the execution space so the - // apply function pointer is guaranteed to be valid - f.m_task->m_apply = task_type::apply ; - - m_queue->schedule( f.m_task ); - // this task may be updated or executed at any moment - } - - return f ; - } - - /**\brief The host process spawns a task with options - * - * 1) High, Normal, or Low priority - * 2) With or without dependence - * 3) Team or Serial - */ - template< typename FunctorType , typename ... 
Options > - inline - Future< typename FunctorType::value_type , ExecSpace > - host_spawn( FunctorType const & arg_functor - , Options const & ... arg_options - ) const - { - using value_type = typename FunctorType::value_type ; - using future_type = Future< value_type , execution_space > ; - using task_type = Impl::TaskBase< execution_space - , value_type - , FunctorType > ; - - future_type f ; - - // Allocate task from memory pool - f.m_task = - reinterpret_cast( m_queue->allocate(sizeof(task_type)) ); - - if ( f.m_task ) { - - // Placement new construction - new( f.m_task ) task_type( arg_functor ); - - // Reference count starts at two: - // +1 to match decrement when task completes - // +1 for the future - f.m_task->m_queue = m_queue ; - f.m_task->m_ref_count = 2 ; - f.m_task->m_alloc_size = sizeof(task_type); - - assign( f.m_task , arg_options... ); - - // Potentially spawning outside execution space so the - // apply function pointer must be obtained from execution space. - // Required for Cuda execution space function pointer. - queue_type::specialization::template - proc_set_apply< FunctorType >( & f.m_task->m_apply ); - - m_queue->schedule( f.m_task ); - } - return f ; - } - - /**\brief Return a future that is complete - * when all input futures are complete. 
- */ - template< typename A1 , typename A2 > - KOKKOS_FUNCTION - Future< ExecSpace > - when_all( int narg , Future< A1 , A2 > const * const arg ) const - { - static_assert - ( std::is_same< execution_space - , typename Future< A1 , A2 >::execution_space - >::value - , "Future must have same execution space" ); - - using future_type = Future< ExecSpace > ; - using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ; - - future_type f ; - - size_t const size = sizeof(task_base) + narg * sizeof(task_base*); - - f.m_task = - reinterpret_cast< task_base * >( m_queue->allocate( size ) ); - - if ( f.m_task ) { - - new( f.m_task ) task_base(); - - // Reference count starts at two: - // +1 to match decrement when task completes - // +1 for the future - f.m_task->m_queue = m_queue ; - f.m_task->m_ref_count = 2 ; - f.m_task->m_alloc_size = size ; - f.m_task->m_dep_count = narg ; - f.m_task->m_task_type = task_base::Aggregate ; - - task_base ** const dep = f.m_task->aggregate_dependences(); - - // Assign dependences to increment their reference count - // The futures may be destroyed upon returning from this call - // so increment reference count to track this assignment. - - for ( int i = 0 ; i < narg ; ++i ) { - task_base * const t = dep[i] = arg[i].m_task ; - if ( 0 != t ) { - Kokkos::atomic_fetch_add( &(t->m_ref_count) , 1 ); - } - } - - m_queue->schedule( f.m_task ); - // this when_all may be processed at any moment - } - - return f ; - } - - /**\brief An executing task respawns itself with options - * - * 1) High, Normal, or Low priority - * 2) With or without dependence - */ - template< class FunctorType , typename ... Options > - KOKKOS_FUNCTION - void respawn( FunctorType * task_self - , Options const & ... 
arg_options ) const - { - using value_type = typename FunctorType::value_type ; - using task_type = Impl::TaskBase< execution_space - , value_type - , FunctorType > ; - - task_base * const zero = (task_base *) 0 ; - task_base * const lock = (task_base *) task_base::LockTag ; - task_type * const task = static_cast< task_type * >( task_self ); - - // Precondition: - // task is in Executing state - // therefore m_next == LockTag - // - // Change to m_next == 0 for no dependence - - if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) { - Kokkos::abort("TaskPolicy::respawn ERROR: already respawned"); - } - - assign( task , arg_options... ); - - // Postcondition: - // task is in Executing-Respawn state - // therefore m_next == dependece or 0 - } - - //---------------------------------------- - - template< typename S > - friend - void Kokkos::wait( Kokkos::TaskPolicy< S > const & ); - - //---------------------------------------- - - inline - int allocation_capacity() const noexcept - { return m_queue->m_memory.get_mem_size(); } - - KOKKOS_INLINE_FUNCTION - int allocated_task_count() const noexcept - { return m_queue->m_count_alloc ; } - - KOKKOS_INLINE_FUNCTION - int allocated_task_count_max() const noexcept - { return m_queue->m_max_alloc ; } - - KOKKOS_INLINE_FUNCTION - long allocated_task_count_accum() const noexcept - { return m_queue->m_accum_alloc ; } - -}; - -template< typename ExecSpace > -inline -void wait( TaskPolicy< ExecSpace > const & policy ) -{ policy.m_queue->execute(); } - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -struct FutureValueTypeIsVoidError {}; - -template < class 
ExecSpace , class ResultType , class FunctorType > -class TaskMember ; - -} /* namespace Impl */ -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { - -/**\brief States of a task */ -enum TaskState - { TASK_STATE_NULL = 0 ///< Does not exist - , TASK_STATE_CONSTRUCTING = 1 ///< Is under construction - , TASK_STATE_WAITING = 2 ///< Is waiting for execution - , TASK_STATE_EXECUTING = 4 ///< Is executing - , TASK_STATE_COMPLETE = 8 ///< Execution is complete - }; - -/**\brief Tag for Future - */ -struct Latch {}; - -/** - * - * Future< space > // value_type == void - * Future< value > // space == Default - * Future< value , space > - * - */ -template< class Arg1 = void , class Arg2 = void > -class Future { -private: - - template< class , class , class > friend class Impl::TaskMember ; - template< class > friend class TaskPolicy ; - template< class , class > friend class Future ; - - // Argument #2, if not void, must be the space. - enum { Arg1_is_space = Kokkos::Impl::is_execution_space< Arg1 >::value }; - enum { Arg2_is_space = Kokkos::Impl::is_execution_space< Arg2 >::value }; - enum { Arg2_is_void = std::is_same< Arg2 , void >::value }; - - struct ErrorNoExecutionSpace {}; - - enum { Opt1 = Arg1_is_space && Arg2_is_void - , Opt2 = ! Arg1_is_space && Arg2_is_void - , Opt3 = ! 
Arg1_is_space && Arg2_is_space - , OptOK = Kokkos::Impl::StaticAssert< Opt1 || Opt2 || Opt3 , ErrorNoExecutionSpace >::value - }; - - typedef typename - Kokkos::Impl::if_c< Opt2 || Opt3 , Arg1 , void >::type - ValueType ; - - typedef typename - Kokkos::Impl::if_c< Opt1 , Arg1 , typename - Kokkos::Impl::if_c< Opt2 , Kokkos::DefaultExecutionSpace , typename - Kokkos::Impl::if_c< Opt3 , Arg2 , void - >::type >::type >::type - ExecutionSpace ; - - typedef Impl::TaskMember< ExecutionSpace , void , void > TaskRoot ; - typedef Impl::TaskMember< ExecutionSpace , ValueType , void > TaskValue ; - - TaskRoot * m_task ; - - KOKKOS_INLINE_FUNCTION explicit - Future( TaskRoot * task ) - : m_task(0) - { TaskRoot::assign( & m_task , TaskRoot::template verify_type< ValueType >( task ) ); } - - //---------------------------------------- - -public: - - typedef ValueType value_type; - typedef ExecutionSpace execution_space ; - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - TaskState get_task_state() const - { return 0 != m_task ? m_task->get_state() : TASK_STATE_NULL ; } - - KOKKOS_INLINE_FUNCTION - bool is_null() const { return 0 == m_task ; } - - KOKKOS_INLINE_FUNCTION - int reference_count() const - { return 0 != m_task ? 
m_task->reference_count() : 0 ; } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - ~Future() { TaskRoot::assign( & m_task , 0 ); } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - Future() : m_task(0) {} - - KOKKOS_INLINE_FUNCTION - Future( const Future & rhs ) - : m_task(0) - { TaskRoot::assign( & m_task , rhs.m_task ); } - - KOKKOS_INLINE_FUNCTION - Future & operator = ( const Future & rhs ) - { TaskRoot::assign( & m_task , rhs.m_task ); return *this ; } - - //---------------------------------------- - - template< class A1 , class A2 > - KOKKOS_INLINE_FUNCTION - Future( const Future & rhs ) - : m_task(0) - { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); } - - template< class A1 , class A2 > - KOKKOS_INLINE_FUNCTION - Future & operator = ( const Future & rhs ) - { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); return *this ; } - - //---------------------------------------- - - typedef typename TaskValue::get_result_type get_result_type ; - - KOKKOS_INLINE_FUNCTION - get_result_type get() const - { - if ( 0 == m_task ) { - Kokkos::abort( "Kokkos::Experimental::Future::get ERROR: is_null()"); - } - return static_cast( m_task )->get(); - } - - //---------------------------------------- -}; - -template< class Arg2 > -class Future< Latch , Arg2 > { -private: - - template< class , class , class > friend class Impl::TaskMember ; - template< class > friend class TaskPolicy ; - template< class , class > friend class Future ; - - // Argument #2, if not void, must be the space. 
- enum { Arg2_is_space = Kokkos::Impl::is_execution_space< Arg2 >::value }; - enum { Arg2_is_void = std::is_same< Arg2 , void >::value }; - - static_assert( Arg2_is_space || Arg2_is_void - , "Future template argument #2 must be a space" ); - - typedef typename - std::conditional< Arg2_is_space , Arg2 , Kokkos::DefaultExecutionSpace > - ::type ExecutionSpace ; - - typedef Impl::TaskMember< ExecutionSpace , void , void > TaskRoot ; - - TaskRoot * m_task ; - - KOKKOS_INLINE_FUNCTION explicit - Future( TaskRoot * task ) - : m_task(0) - { TaskRoot::assign( & m_task , task ); } - - //---------------------------------------- - -public: - - typedef void value_type; - typedef ExecutionSpace execution_space ; - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - void add( const int k ) const - { if ( 0 != m_task ) m_task->latch_add(k); } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - TaskState get_task_state() const - { return 0 != m_task ? m_task->get_state() : TASK_STATE_NULL ; } - - KOKKOS_INLINE_FUNCTION - bool is_null() const { return 0 == m_task ; } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - ~Future() { TaskRoot::assign( & m_task , 0 ); } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - Future() : m_task(0) {} - - KOKKOS_INLINE_FUNCTION - Future( const Future & rhs ) - : m_task(0) - { TaskRoot::assign( & m_task , rhs.m_task ); } - - KOKKOS_INLINE_FUNCTION - Future & operator = ( const Future & rhs ) - { TaskRoot::assign( & m_task , rhs.m_task ); return *this ; } - - //---------------------------------------- - - typedef void get_result_type ; - - KOKKOS_INLINE_FUNCTION - void get() const {} - - //---------------------------------------- - -}; - -namespace Impl { - -template< class T > -struct is_future : public std::false_type {}; - -template< class Arg0 , class Arg1 > -struct is_future< Kokkos::Experimental::Future > - : public std::true_type {}; - -} /* 
namespace Impl */ -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { - -/** \brief If the argument is an execution space then a serial task in that space */ -template< class Arg0 = Kokkos::DefaultExecutionSpace > -class TaskPolicy { -public: - - typedef typename Arg0::execution_space execution_space ; - - //---------------------------------------- - - TaskPolicy - ( const unsigned arg_task_max_count - , const unsigned arg_task_max_size - , const unsigned arg_task_default_dependence_capacity = 4 - , const unsigned arg_task_team_size = 0 /* choose default */ - ); - - TaskPolicy() = default ; - TaskPolicy( TaskPolicy && rhs ) = default ; - TaskPolicy( const TaskPolicy & rhs ) = default ; - TaskPolicy & operator = ( TaskPolicy && rhs ) = default ; - TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ; - - //---------------------------------------- - /** \brief Create a serial task with storage for dependences. - * - * Postcondition: Task is in the 'constructing' state. - */ - template< class FunctorType > - Future< typename FunctorType::value_type , execution_space > - create( const FunctorType & functor - , const unsigned dependence_capacity /* = default */ ); - - template< class FunctorType > - KOKKOS_INLINE_FUNCTION - Future< typename FunctorType::value_type , execution_space > - create_team( const FunctorType & functor - , const unsigned dependence_capacity /* = default */ ); - - /** \brief Set dependence that 'after' cannot start execution - * until 'before' has completed. - * - * Precondition: The 'after' task must be in then 'Constructing' state. 
- */ - template< class TA , class TB > - void add_dependence( const Future & after - , const Future & before ) const ; - - /** \brief Spawn a task in the 'Constructing' state - * - * Precondition: Task is in the 'constructing' state. - * Postcondition: Task is waiting, executing, or complete. - */ - template< class T > - const Future & - spawn( const Future & ) const ; - - //---------------------------------------- - /** \brief Query dependence of an executing task */ - - template< class FunctorType > - Future< execution_space > - get_dependence( FunctorType * , const int ) const ; - - //---------------------------------------- - /** \brief Clear current dependences of an executing task - * in preparation for setting new dependences and - * respawning. - * - * Precondition: The functor must be a task in the executing state. - */ - template< class FunctorType > - void clear_dependence( FunctorType * ) const ; - - /** \brief Set dependence that 'after' cannot resume execution - * until 'before' has completed. - * - * The 'after' functor must be in the executing state - */ - template< class FunctorType , class TB > - void add_dependence( FunctorType * after - , const Future & before ) const ; - - /** \brief Respawn (reschedule) an executing task to be called again - * after all dependences have completed. 
- */ - template< class FunctorType > - void respawn( FunctorType * ) const ; -}; - -//---------------------------------------------------------------------------- -/** \brief Create and spawn a single-thread task */ -template< class ExecSpace , class FunctorType > -inline -Future< typename FunctorType::value_type , ExecSpace > -spawn( TaskPolicy & policy , const FunctorType & functor ) -{ return policy.spawn( policy.create( functor ) ); } - -/** \brief Create and spawn a single-thread task with dependences */ -template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 > -inline -Future< typename FunctorType::value_type , ExecSpace > -spawn( TaskPolicy & policy - , const FunctorType & functor - , const Future & before_0 - , const Future & before_1 ) -{ - Future< typename FunctorType::value_type , ExecSpace > f ; - f = policy.create( functor , 2 ); - policy.add_dependence( f , before_0 ); - policy.add_dependence( f , before_1 ); - policy.spawn( f ); - return f ; -} - -//---------------------------------------------------------------------------- -/** \brief Create and spawn a parallel_for task */ -template< class ExecSpace , class ParallelPolicyType , class FunctorType > -inline -Future< typename FunctorType::value_type , ExecSpace > -spawn_foreach( TaskPolicy & task_policy - , const ParallelPolicyType & parallel_policy - , const FunctorType & functor ) -{ return task_policy.spawn( task_policy.create_foreach( parallel_policy , functor ) ); } - -/** \brief Create and spawn a parallel_reduce task */ -template< class ExecSpace , class ParallelPolicyType , class FunctorType > -inline -Future< typename FunctorType::value_type , ExecSpace > -spawn_reduce( TaskPolicy & task_policy - , const ParallelPolicyType & parallel_policy - , const FunctorType & functor ) -{ return task_policy.spawn( task_policy.create_reduce( parallel_policy , functor ) ); } - -//---------------------------------------------------------------------------- -/** \brief Respawn a task 
functor with dependences */ -template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 > -inline -void respawn( TaskPolicy & policy - , FunctorType * functor - , const Future & before_0 - , const Future & before_1 - ) -{ - policy.clear_dependence( functor ); - policy.add_dependence( functor , before_0 ); - policy.add_dependence( functor , before_1 ); - policy.respawn( functor ); -} - -//---------------------------------------------------------------------------- - -template< class ExecSpace > -void wait( TaskPolicy< ExecSpace > & ); - -} /* namespace Experimental */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ -#endif /* #ifndef KOKKOS_TASKPOLICY_HPP */ +#include diff --git a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp new file mode 100644 index 0000000000..0de926aa12 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp @@ -0,0 +1,700 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TASKSCHEDULER_HPP +#define KOKKOS_TASKSCHEDULER_HPP + +//---------------------------------------------------------------------------- + +#include + +// If compiling with CUDA then must be using CUDA 8 or better +// and use relocateable device code to enable the task policy. 
+// nvcc relocatable device code option: --relocatable-device-code=true + +#if ( defined( KOKKOS_HAVE_CUDA ) ) + #if ( 8000 <= CUDA_VERSION ) && \ + defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE ) + + #define KOKKOS_ENABLE_TASKDAG + + #endif +#else + #define KOKKOS_ENABLE_TASKDAG +#endif + + +#if defined( KOKKOS_ENABLE_TASKDAG ) + +//---------------------------------------------------------------------------- + +#include +#include + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +// Forward declarations used in Impl::TaskQueue + +template< typename Arg1 = void , typename Arg2 = void > +class Future ; + +template< typename Space > +class TaskScheduler ; + +} // namespace Kokkos + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** + * + * Future< space > // value_type == void + * Future< value > // space == Default + * Future< value , space > + * + */ +template< typename Arg1 , typename Arg2 > +class Future { +private: + + template< typename > friend class TaskScheduler ; + template< typename , typename > friend class Future ; + template< typename , typename , typename > friend class Impl::TaskBase ; + + enum { Arg1_is_space = Kokkos::is_space< Arg1 >::value }; + enum { Arg2_is_space = Kokkos::is_space< Arg2 >::value }; + enum { Arg1_is_value = ! Arg1_is_space && + ! std::is_same< Arg1 , void >::value }; + enum { Arg2_is_value = ! Arg2_is_space && + ! std::is_same< Arg2 , void >::value }; + + static_assert( ! ( Arg1_is_space && Arg2_is_space ) + , "Future cannot be given two spaces" ); + + static_assert( ! 
( Arg1_is_value && Arg2_is_value ) + , "Future cannot be given two value types" ); + + using ValueType = + typename std::conditional< Arg1_is_value , Arg1 , + typename std::conditional< Arg2_is_value , Arg2 , void + >::type >::type ; + + using Space = + typename std::conditional< Arg1_is_space , Arg1 , + typename std::conditional< Arg2_is_space , Arg2 , void + >::type >::type ; + + using task_base = Impl::TaskBase< Space , ValueType , void > ; + using queue_type = Impl::TaskQueue< Space > ; + + task_base * m_task ; + + KOKKOS_INLINE_FUNCTION explicit + Future( task_base * task ) : m_task(0) + { if ( task ) queue_type::assign( & m_task , task ); } + + //---------------------------------------- + +public: + + using execution_space = typename Space::execution_space ; + using value_type = ValueType ; + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + bool is_null() const { return 0 == m_task ; } + + KOKKOS_INLINE_FUNCTION + int reference_count() const + { return 0 != m_task ? 
m_task->reference_count() : 0 ; } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + void clear() + { if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + ~Future() { clear(); } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + constexpr Future() noexcept : m_task(0) {} + + KOKKOS_INLINE_FUNCTION + Future( Future && rhs ) + : m_task( rhs.m_task ) { rhs.m_task = 0 ; } + + KOKKOS_INLINE_FUNCTION + Future( const Future & rhs ) + : m_task(0) + { if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); } + + KOKKOS_INLINE_FUNCTION + Future & operator = ( Future && rhs ) + { + clear(); + m_task = rhs.m_task ; + rhs.m_task = 0 ; + return *this ; + } + + KOKKOS_INLINE_FUNCTION + Future & operator = ( const Future & rhs ) + { + if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); + return *this ; + } + + //---------------------------------------- + + template< class A1 , class A2 > + KOKKOS_INLINE_FUNCTION + Future( Future && rhs ) + : m_task( rhs.m_task ) + { + static_assert + ( std::is_same< Space , void >::value || + std::is_same< Space , typename Future::Space >::value + , "Assigned Futures must have the same space" ); + + static_assert + ( std::is_same< value_type , void >::value || + std::is_same< value_type , typename Future::value_type >::value + , "Assigned Futures must have the same value_type" ); + + rhs.m_task = 0 ; + } + + template< class A1 , class A2 > + KOKKOS_INLINE_FUNCTION + Future( const Future & rhs ) + : m_task(0) + { + static_assert + ( std::is_same< Space , void >::value || + std::is_same< Space , typename Future::Space >::value + , "Assigned Futures must have the same space" ); + + static_assert + ( std::is_same< value_type , void >::value || + std::is_same< value_type , typename Future::value_type >::value + , "Assigned Futures must have the same value_type" ); + + if ( rhs.m_task ) 
queue_type::assign( & m_task , rhs.m_task ); + } + + template< class A1 , class A2 > + KOKKOS_INLINE_FUNCTION + Future & operator = ( const Future & rhs ) + { + static_assert + ( std::is_same< Space , void >::value || + std::is_same< Space , typename Future::Space >::value + , "Assigned Futures must have the same space" ); + + static_assert + ( std::is_same< value_type , void >::value || + std::is_same< value_type , typename Future::value_type >::value + , "Assigned Futures must have the same value_type" ); + + if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); + return *this ; + } + + template< class A1 , class A2 > + KOKKOS_INLINE_FUNCTION + Future & operator = ( Future && rhs ) + { + static_assert + ( std::is_same< Space , void >::value || + std::is_same< Space , typename Future::Space >::value + , "Assigned Futures must have the same space" ); + + static_assert + ( std::is_same< value_type , void >::value || + std::is_same< value_type , typename Future::value_type >::value + , "Assigned Futures must have the same value_type" ); + + clear(); + m_task = rhs.m_task ; + rhs.m_task = 0 ; + return *this ; + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + typename task_base::get_return_type + get() const + { + if ( 0 == m_task ) { + Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()"); + } + return m_task->get(); + } +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +enum TaskType { TaskTeam = Impl::TaskBase::TaskTeam + , TaskSingle = Impl::TaskBase::TaskSingle }; + +enum TaskPriority { TaskHighPriority = 0 + , TaskRegularPriority = 1 + , TaskLowPriority = 2 }; + +template< typename Space > +void wait( TaskScheduler< Space > const & ); + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + 
+namespace Kokkos {
+
+
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/**\brief  Front end for spawning tasks into, and waiting on, a task queue.
+ *
+ *  The memory-space constructor allocates a
+ *  Kokkos::Impl::TaskQueue< ExecSpace > inside a shared-allocation record
+ *  and tracks that record through m_track.  The defaulted copy and move
+ *  members copy both m_track and m_queue.  A default-constructed scheduler
+ *  has a null queue; host_spawn aborts when called on one.
+ */
+template< typename ExecSpace >
+class TaskScheduler
+{
+private:
+
+  using track_type = Kokkos::Impl::SharedAllocationTracker ;
+  using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ;
+  using task_base  = Impl::TaskBase< ExecSpace , void , void > ;
+
+  track_type   m_track ;  // tracks the allocation record that holds *m_queue
+  queue_type * m_queue ;  // 0 until the memory-space constructor has run
+
+  //----------------------------------------
+  // Process optional arguments to spawn and respawn functions.
+  // Each overload consumes the leading option and recurses on the rest;
+  // the overload below ends the recursion once no options remain.
+
+  KOKKOS_INLINE_FUNCTION static
+  void assign( task_base * const ) {}
+
+  // TaskTeam or TaskSingle
+  template< typename ... Options >
+  KOKKOS_INLINE_FUNCTION static
+  void assign( task_base * const task
+             , TaskType const & arg
+             , Options const & ... opts )
+    {
+      task->m_task_type = arg ;
+      assign( task , opts ... );
+    }
+
+  // TaskHighPriority or TaskRegularPriority or TaskLowPriority
+  template< typename ... Options >
+  KOKKOS_INLINE_FUNCTION static
+  void assign( task_base * const task
+             , TaskPriority const & arg
+             , Options const & ... opts )
+    {
+      task->m_priority = arg ;
+      assign( task , opts ... );
+    }
+
+  // Future for a dependence
+  template< typename A1 , typename A2 , typename ... Options >
+  KOKKOS_INLINE_FUNCTION static
+  void assign( task_base * const task
+             , Future< A1 , A2 > const & arg
+             , Options const & ... opts )
+    {
+      // Assign dependence to task->m_next
+      // which will be processed within subsequent call to schedule.
+      // Error if the dependence is reset.
+
+      if ( 0 != Kokkos::atomic_exchange(& task->m_next, arg.m_task) ) {
+        Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
+      }
+
+      if ( 0 != arg.m_task ) {
+        // The future may be destroyed upon returning from this call
+        // so increment reference count to track this assignment.
+        Kokkos::atomic_increment( &(arg.m_task->m_ref_count) );
+      }
+
+      assign( task , opts ... );
+    }
+
+  //----------------------------------------
+
+public:
+
+  using execution_policy = TaskScheduler ;
+  using execution_space  = ExecSpace ;
+  using memory_space     = typename queue_type::memory_space ;
+  using member_type      = Kokkos::Impl::TaskExec< ExecSpace > ;
+
+  /**\brief  Construct a scheduler with no queue (m_queue == 0). */
+  KOKKOS_INLINE_FUNCTION
+  TaskScheduler() : m_track(), m_queue(0) {}
+
+  KOKKOS_INLINE_FUNCTION
+  TaskScheduler( TaskScheduler && rhs ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskScheduler( TaskScheduler const & rhs ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskScheduler & operator = ( TaskScheduler && rhs ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskScheduler & operator = ( TaskScheduler const & rhs ) = default ;
+
+  /**\brief  Allocate and construct the task queue.
+   *
+   *  \param arg_memory_space  passed to the record allocation and to the
+   *                           queue constructor
+   *  \param arg_memory_pool_capacity         passed to the queue constructor
+   *  \param arg_memory_pool_log2_superblock  passed to the queue constructor
+   */
+  TaskScheduler( memory_space const & arg_memory_space
+               , unsigned const arg_memory_pool_capacity
+               , unsigned const arg_memory_pool_log2_superblock = 12 )
+    : m_track()
+    , m_queue(0)
+    {
+      typedef Kokkos::Impl::SharedAllocationRecord
+        < memory_space , typename queue_type::Destroy >
+          record_type ;
+
+      record_type * record =
+        record_type::allocate( arg_memory_space
+                             , "TaskQueue"
+                             , sizeof(queue_type)
+                             );
+
+      // Construct the queue in place inside the record's storage.
+      m_queue = new( record->data() )
+        queue_type( arg_memory_space
+                  , arg_memory_pool_capacity
+                  , arg_memory_pool_log2_superblock );
+
+      // Hand the queue pointer to the record's Destroy functor.
+      record->m_destroy.m_queue = m_queue ;
+
+      m_track.assign_allocated_record_to_uninitialized( record );
+    }
+
+  //----------------------------------------
+  /**\brief  Allocation size for a spawned task */
+  template< typename FunctorType >
+  KOKKOS_FUNCTION
+  size_t spawn_allocation_size() const
+    {
+      using task_type  = Impl::TaskBase< execution_space
+                                       , typename FunctorType::value_type
+                                       , FunctorType > ;
+
+      return m_queue->allocate_block_size( sizeof(task_type) );
+    }
+
+  /**\brief  Allocation size for a when_all aggregate of narg futures */
+  KOKKOS_FUNCTION
+  size_t when_all_allocation_size( int narg ) const
+    {
+      // One task_base header followed by narg dependence pointers.
+      return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) );
+    }
+
+  //----------------------------------------
+
+  /**\brief  A task spawns a task with options
+   *
+   *  1) High, Normal, or Low priority
+   *  2) With or without dependence
+   *  3) Team or Serial
+   *
+   *  \return a future referencing the new task; its m_task is left null
+   *          when the queue's allocate() returns null.
+   */
+  template< typename FunctorType , typename ... Options >
+  KOKKOS_FUNCTION
+  Future< typename FunctorType::value_type , ExecSpace >
+  task_spawn( FunctorType const & arg_functor
+            , Options const & ... arg_options
+            ) const
+    {
+      using value_type  = typename FunctorType::value_type ;
+      using future_type = Future< value_type , execution_space > ;
+      using task_type   = Impl::TaskBase< execution_space
+                                        , value_type
+                                        , FunctorType > ;
+
+      //----------------------------------------
+      // Give single-thread back-ends an opportunity to clear
+      // queue of ready tasks before allocating a new task
+
+      m_queue->iff_single_thread_recursive_execute();
+
+      //----------------------------------------
+
+      future_type f ;
+
+      // Allocate task from memory pool
+      f.m_task =
+        reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
+
+      if ( f.m_task ) {
+
+        // Placement new construction
+        new ( f.m_task ) task_type( arg_functor );
+
+        // Reference count starts at two
+        // +1 for matching decrement when task is complete
+        // +1 for future
+        f.m_task->m_queue      = m_queue ;
+        f.m_task->m_ref_count  = 2 ;
+        f.m_task->m_alloc_size = sizeof(task_type);
+
+        assign( f.m_task , arg_options... );
+
+        // Spawning from within the execution space so the
+        // apply function pointer is guaranteed to be valid
+        f.m_task->m_apply = task_type::apply ;
+
+        m_queue->schedule( f.m_task );
+        // this task may be updated or executed at any moment
+      }
+
+      return f ;
+    }
+
+  /**\brief  The host process spawns a task with options
+   *
+   *  1) High, Normal, or Low priority
+   *  2) With or without dependence
+   *  3) Team or Serial
+   *
+   *  Aborts when the scheduler has no queue.
+   *
+   *  \return a future referencing the new task; its m_task is left null
+   *          when the queue's allocate() returns null.
+   */
+  template< typename FunctorType , typename ... Options >
+  inline
+  Future< typename FunctorType::value_type , ExecSpace >
+  host_spawn( FunctorType const & arg_functor
+            , Options const & ... arg_options
+            ) const
+    {
+      using value_type  = typename FunctorType::value_type ;
+      using future_type = Future< value_type , execution_space > ;
+      using task_type   = Impl::TaskBase< execution_space
+                                        , value_type
+                                        , FunctorType > ;
+
+      if ( m_queue == 0 ) {
+        Kokkos::abort("Kokkos::TaskScheduler not initialized");
+      }
+
+      future_type f ;
+
+      // Allocate task from memory pool; the raw allocation is viewed as
+      // the concrete task type, the same way task_spawn does it.
+      f.m_task =
+        reinterpret_cast< task_type * >( m_queue->allocate(sizeof(task_type)) );
+
+      if ( f.m_task ) {
+
+        // Placement new construction
+        new( f.m_task ) task_type( arg_functor );
+
+        // Reference count starts at two:
+        // +1 to match decrement when task completes
+        // +1 for the future
+        f.m_task->m_queue      = m_queue ;
+        f.m_task->m_ref_count  = 2 ;
+        f.m_task->m_alloc_size = sizeof(task_type);
+
+        assign( f.m_task , arg_options... );
+
+        // Potentially spawning outside execution space so the
+        // apply function pointer must be obtained from execution space.
+        // Required for Cuda execution space function pointer.
+        queue_type::specialization::template
+          proc_set_apply< FunctorType >( & f.m_task->m_apply );
+
+        m_queue->schedule( f.m_task );
+      }
+      return f ;
+    }
+
+  /**\brief  Return a future that is complete
+   *         when all input futures are complete.
+   *
+   *  \param narg  number of futures in arg
+   *  \param arg   array of narg futures; null futures are stored as null
+   *               dependences without touching a reference count
+   */
+  template< typename A1 , typename A2 >
+  KOKKOS_FUNCTION
+  Future< ExecSpace >
+  when_all( int narg , Future< A1 , A2 > const * const arg ) const
+    {
+      static_assert
+        ( std::is_same< execution_space
+                      , typename Future< A1 , A2 >::execution_space
+                      >::value
+        , "Future must have same execution space" );
+
+      using future_type = Future< ExecSpace > ;
+
+      future_type f ;
+
+      // One task_base header followed by narg dependence pointers.
+      size_t const size  = sizeof(task_base) + narg * sizeof(task_base*);
+
+      f.m_task =
+        reinterpret_cast< task_base * >( m_queue->allocate( size ) );
+
+      if ( f.m_task ) {
+
+        new( f.m_task ) task_base();
+
+        // Reference count starts at two:
+        // +1 to match decrement when task completes
+        // +1 for the future
+        f.m_task->m_queue      = m_queue ;
+        f.m_task->m_ref_count  = 2 ;
+        f.m_task->m_alloc_size = size ;
+        f.m_task->m_dep_count  = narg ;
+        f.m_task->m_task_type  = task_base::Aggregate ;
+
+        task_base ** const dep = f.m_task->aggregate_dependences();
+
+        // Assign dependences to increment their reference count
+        // The futures may be destroyed upon returning from this call
+        // so increment reference count to track this assignment.
+
+        for ( int i = 0 ; i < narg ; ++i ) {
+          task_base * const t = dep[i] = arg[i].m_task ;
+          if ( 0 != t ) {
+            Kokkos::atomic_increment( &(t->m_ref_count) );
+          }
+        }
+
+        m_queue->schedule( f.m_task );
+        // this when_all may be processed at any moment
+      }
+
+      return f ;
+    }
+
+  /**\brief  An executing task respawns itself with options
+   *
+   *  1) High, Normal, or Low priority
+   *  2) With or without dependence
+   *
+   *  Aborts when the task's m_next is not the lock tag on entry.
+   */
+  template< class FunctorType , typename ... Options >
+  KOKKOS_FUNCTION
+  void respawn( FunctorType * task_self
+              , Options const & ... arg_options ) const
+    {
+      using value_type  = typename FunctorType::value_type ;
+      using task_type   = Impl::TaskBase< execution_space
+                                        , value_type
+                                        , FunctorType > ;
+
+      task_base * const zero = (task_base *) 0 ;
+      task_base * const lock = (task_base *) task_base::LockTag ;
+      task_type * const task = static_cast< task_type * >( task_self );
+
+      // Precondition:
+      //   task is in Executing state
+      //   therefore  m_next == LockTag
+      //
+      // Change to m_next == 0 for no dependence
+
+      if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
+        Kokkos::abort("TaskScheduler::respawn ERROR: already respawned");
+      }
+
+      assign( task , arg_options... );
+
+      // Postcondition:
+      //   task is in Executing-Respawn state
+      //   therefore  m_next == dependence or 0
+    }
+
+  //----------------------------------------
+
+  template< typename S >
+  friend
+  void Kokkos::wait( Kokkos::TaskScheduler< S > const & );
+
+  //----------------------------------------
+  // Accessors forwarding counters kept by the queue.
+
+  inline
+  int allocation_capacity() const noexcept
+    { return m_queue->m_memory.get_mem_size(); }
+
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count() const noexcept
+    { return m_queue->m_count_alloc ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count_max() const noexcept
+    { return m_queue->m_max_alloc ; }
+
+  KOKKOS_INLINE_FUNCTION
+  long allocated_task_count_accum() const noexcept
+    { return m_queue->m_accum_alloc ; }
+
+};
+
+/**\brief  Run the scheduler's queue by calling its execute() member. */
+template< typename ExecSpace >
+inline
+void wait( TaskScheduler< ExecSpace > const & policy )
+{ policy.m_queue->execute(); }
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
+#endif /* #ifndef KOKKOS_TASKSCHEDULER_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_Threads.hpp b/lib/kokkos/core/src/Kokkos_Threads.hpp
index c9ebbf9265..f01b14724a 100644
--- a/lib/kokkos/core/src/Kokkos_Threads.hpp
+++
b/lib/kokkos/core/src/Kokkos_Threads.hpp
@@ -189,6 +189,17 @@ public:
 namespace Kokkos {
 namespace Impl {
 
+template<>
+struct MemorySpaceAccess
+  < Kokkos::Threads::memory_space
+  , Kokkos::Threads::scratch_memory_space
+  >
+{
+  enum { assignable = false };
+  enum { accessible = true };
+  enum { deepcopy   = false };
+};
+
 template<>
 struct VerifyExecutionCanAccessMemorySpace
   < Kokkos::Threads::memory_space
diff --git a/lib/kokkos/core/src/Kokkos_Timer.hpp b/lib/kokkos/core/src/Kokkos_Timer.hpp
new file mode 100644
index 0000000000..4eca5037e4
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Timer.hpp
@@ -0,0 +1,96 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TIMER_HPP
+#define KOKKOS_TIMER_HPP
+
+#include <chrono>
+
+// KOKKOS_USE_LIBRT is cleared under MSVC, as this header has always done.
+#ifdef _MSC_VER
+#undef KOKKOS_USE_LIBRT
+#endif
+
+namespace Kokkos {
+
+/** \brief  Stopwatch reporting the time since construction or the last reset().
+ *
+ *  Time points are taken from std::chrono::steady_clock, which is monotonic,
+ *  so seconds() cannot go negative or jump when the system clock is adjusted
+ *  in the middle of a measurement.
+ */
+class Timer {
+private:
+  typedef std::chrono::steady_clock clock_type ;
+
+  clock_type::time_point m_old ; // start of the current measurement
+
+  // Not copyable.
+  Timer( const Timer & ) = delete ;
+  Timer & operator = ( const Timer & ) = delete ;
+public:
+
+  /** \brief  Restart the measurement from the current instant. */
+  inline
+  void reset() { m_old = clock_type::now(); }
+
+  inline
+  ~Timer() {}
+
+  /** \brief  Begin timing at construction. */
+  inline
+  Timer() { reset(); }
+
+  /** \brief  Elapsed seconds since construction or the last reset(). */
+  inline
+  double seconds() const
+  {
+    typedef std::chrono::duration< double > seconds_type ;
+
+    return seconds_type( clock_type::now() - m_old ).count();
+  }
+};
+
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_TIMER_HPP */
diff --git a/lib/kokkos/core/src/Kokkos_View.hpp
b/lib/kokkos/core/src/Kokkos_View.hpp index 1cc8b03381..b728b36492 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -61,9 +61,6 @@ namespace Kokkos { namespace Experimental { namespace Impl { -template< class DstMemorySpace , class SrcMemorySpace > -struct DeepCopy ; - template< class DataType > struct ViewArrayAnalysis ; @@ -76,31 +73,23 @@ struct ViewDataAnalysis ; template< class , class ... > class ViewMapping { public: enum { is_assignable = false }; }; -template< class MemorySpace > -struct ViewOperatorBoundsErrorAbort ; - -template<> -struct ViewOperatorBoundsErrorAbort< Kokkos::HostSpace > { - static void apply( const size_t rank - , const size_t n0 , const size_t n1 - , const size_t n2 , const size_t n3 - , const size_t n4 , const size_t n5 - , const size_t n6 , const size_t n7 - , const size_t i0 , const size_t i1 - , const size_t i2 , const size_t i3 - , const size_t i4 , const size_t i5 - , const size_t i6 , const size_t i7 ); -}; - } /* namespace Impl */ } /* namespace Experimental */ } /* namespace Kokkos */ +namespace Kokkos { +namespace Impl { + +using Kokkos::Experimental::Impl::ViewMapping ; +using Kokkos::Experimental::Impl::ViewDataAnalysis ; + +} /* namespace Impl */ +} /* namespace Kokkos */ + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { /** \class ViewTraits * \brief Traits class for accessing attributes of a View. 
@@ -168,8 +157,7 @@ struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_space::value typedef typename Space::execution_space execution_space ; typedef typename Space::memory_space memory_space ; - typedef typename Kokkos::Impl::is_space< Space >::host_mirror_space - HostMirrorSpace ; + typedef typename Kokkos::Impl::HostMirror< Space >::Space HostMirrorSpace ; typedef typename execution_space::array_layout array_layout ; typedef typename ViewTraits::memory_traits memory_traits ; }; @@ -225,7 +213,7 @@ private: std::conditional < ! std::is_same< typename prop::HostMirrorSpace , void >::value , typename prop::HostMirrorSpace - , typename Kokkos::Impl::is_space< ExecutionSpace >::host_mirror_space + , typename Kokkos::Impl::HostMirror< ExecutionSpace >::Space >::type HostMirrorSpace ; @@ -238,7 +226,7 @@ private: // Analyze data type's properties, // May be specialized based upon the layout and value type - typedef Kokkos::Experimental::Impl::ViewDataAnalysis< DataType , ArrayLayout > data_analysis ; + typedef Kokkos::Impl::ViewDataAnalysis< DataType , ArrayLayout > data_analysis ; public: @@ -376,31 +364,29 @@ public: template< class DataType , class ... 
Properties > class View ; -} /* namespace Experimental */ } /* namespace Kokkos */ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#include -#include +#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace { -constexpr Kokkos::Experimental::Impl::ALL_t - ALL = Kokkos::Experimental::Impl::ALL_t(); +constexpr Kokkos::Impl::ALL_t + ALL = Kokkos::Impl::ALL_t(); -constexpr Kokkos::Experimental::Impl::WithoutInitializing_t - WithoutInitializing = Kokkos::Experimental::Impl::WithoutInitializing_t(); +constexpr Kokkos::Impl::WithoutInitializing_t + WithoutInitializing = Kokkos::Impl::WithoutInitializing_t(); -constexpr Kokkos::Experimental::Impl::AllowPadding_t - AllowPadding = Kokkos::Experimental::Impl::AllowPadding_t(); +constexpr Kokkos::Impl::AllowPadding_t + AllowPadding = Kokkos::Impl::AllowPadding_t(); } @@ -446,14 +432,12 @@ view_wrap( Args const & ... args ) return return_type( args... ); } -} /* namespace Experimental */ } /* namespace Kokkos */ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { template< class DataType , class ... Properties > class View ; @@ -471,7 +455,7 @@ class View : public ViewTraits< DataType , Properties ... > { private: template< class , class ... > friend class View ; - template< class , class ... > friend class Impl::ViewMapping ; + template< class , class ... 
> friend class Kokkos::Impl::ViewMapping ; public: @@ -479,8 +463,8 @@ public: private: - typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ; - typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ; + typedef Kokkos::Impl::ViewMapping< traits , void > map_type ; + typedef Kokkos::Impl::SharedAllocationTracker track_type ; track_type m_track ; map_type m_map ; @@ -607,7 +591,7 @@ public: // Allow specializations to query their specialized map KOKKOS_INLINE_FUNCTION - const Kokkos::Experimental::Impl::ViewMapping< traits , void > & + const Kokkos::Impl::ViewMapping< traits , void > & implementation_map() const { return m_map ; } //---------------------------------------- @@ -629,18 +613,24 @@ private: ( is_layout_left || is_layout_right || is_layout_stride ) }; + template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space + { KOKKOS_FORCEINLINE_FUNCTION static void check() {} }; + + template< class Space > struct verify_space + { KOKKOS_FORCEINLINE_FUNCTION static void check() + { Kokkos::abort("Kokkos::View ERROR: attempt to access inaccessible memory space"); }; + }; + #if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) #define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \ - < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); \ - Kokkos::Experimental::Impl::view_verify_operator_bounds ARG ; + View::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); \ + Kokkos::Impl::view_verify_operator_bounds ARG ; #else #define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \ - < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); + View::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); #endif @@ -656,7 +646,11 @@ public: ), reference_type >::type operator()( 
Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,args...) ) + #endif return m_map.reference(); } @@ -675,7 +669,11 @@ public: operator()( const I0 & i0 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,args...) ) + #endif return m_map.reference(i0); } @@ -692,7 +690,12 @@ public: operator()( const I0 & i0 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) ) + + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,args...) ) + #endif return m_map.m_handle[ i0 ]; } @@ -709,7 +712,11 @@ public: operator()( const I0 & i0 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,args...) 
) + #endif return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ]; } @@ -726,7 +733,11 @@ public: ), reference_type >::type operator[]( const I0 & i0 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0) ) + #endif return m_map.reference(i0); } @@ -741,7 +752,11 @@ public: ), reference_type >::type operator[]( const I0 & i0 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0) ) + #endif return m_map.m_handle[ i0 ]; } @@ -756,7 +771,11 @@ public: ), reference_type >::type operator[]( const I0 & i0 ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0) ) + #endif return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ]; } @@ -775,7 +794,11 @@ public: operator()( const I0 & i0 , const I1 & i1 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,args...) ) + #endif return m_map.reference(i0,i1); } @@ -792,7 +815,11 @@ public: operator()( const I0 & i0 , const I1 & i1 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,args...) 
) + #endif return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ]; } @@ -809,7 +836,11 @@ public: operator()( const I0 & i0 , const I1 & i1 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,args...) ) + #endif return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ]; } @@ -826,7 +857,11 @@ public: operator()( const I0 & i0 , const I1 & i1 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,args...) ) + #endif return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ]; } @@ -843,7 +878,11 @@ public: operator()( const I0 & i0 , const I1 & i1 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,args...) ) + #endif return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ]; } @@ -860,7 +899,11 @@ public: operator()( const I0 & i0 , const I1 & i1 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,args...) ) + #endif return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 + i1 * m_map.m_offset.m_stride.S1 ]; @@ -880,7 +923,11 @@ public: operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , Args ... 
args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,i2,args...) ) + #endif return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ]; } @@ -896,7 +943,11 @@ public: operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,i2,args...) ) + #endif return m_map.reference(i0,i1,i2); } @@ -915,7 +966,11 @@ public: operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,args...) ) + #endif return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ]; } @@ -931,7 +986,11 @@ public: operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,args...) ) + #endif return m_map.reference(i0,i1,i2,i3); } @@ -952,7 +1011,11 @@ public: , const I4 & i4 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,args...) 
) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4,args...) ) + #endif return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ]; } @@ -970,7 +1033,11 @@ public: , const I4 & i4 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4,args...) ) + #endif return m_map.reference(i0,i1,i2,i3,i4); } @@ -991,7 +1058,11 @@ public: , const I4 & i4 , const I5 & i5 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4,i5,args...) ) + #endif return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ]; } @@ -1009,7 +1080,11 @@ public: , const I4 & i4 , const I5 & i5 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4,i5,args...) ) + #endif return m_map.reference(i0,i1,i2,i3,i4,i5); } @@ -1030,7 +1105,11 @@ public: , const I4 & i4 , const I5 & i5 , const I6 & i6 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,args...) 
) + #endif return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ]; } @@ -1048,7 +1127,11 @@ public: , const I4 & i4 , const I5 & i5 , const I6 & i6 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,args...) ) + #endif return m_map.reference(i0,i1,i2,i3,i4,i5,i6); } @@ -1069,7 +1152,11 @@ public: , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) ) + #endif return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ]; } @@ -1087,7 +1174,11 @@ public: , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 , Args ... args ) const { - KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) ) + #ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + KOKKOS_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) ) + #else + KOKKOS_VIEW_OPERATOR_VERIFY( (m_track.template get_label().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) 
) + #endif return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7); } @@ -1126,7 +1217,7 @@ public: , m_map() { typedef typename View::traits SrcTraits ; - typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; + typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; static_assert( Mapping::is_assignable , "Incompatible View copy construction" ); Mapping::assign( m_map , rhs.m_map , rhs.m_track ); } @@ -1136,7 +1227,7 @@ public: View & operator = ( const View & rhs ) { typedef typename View::traits SrcTraits ; - typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; + typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ; static_assert( Mapping::is_assignable , "Incompatible View copy assignment" ); Mapping::assign( m_map , rhs.m_map , rhs.m_track ); m_track.assign( rhs.m_track , traits::is_managed ); @@ -1156,14 +1247,14 @@ public: { typedef View< RT , RP... > SrcType ; - typedef Kokkos::Experimental::Impl::ViewMapping + typedef Kokkos::Impl::ViewMapping < void /* deduce destination view type from source view traits */ , typename SrcType::traits , Arg0 , Args... > Mapping ; typedef typename Mapping::type DstType ; - static_assert( Kokkos::Experimental::Impl::ViewMapping< traits , typename DstType::traits , void >::is_assignable + static_assert( Kokkos::Impl::ViewMapping< traits , typename DstType::traits , void >::is_assignable , "Subview construction requires compatible view and subview arguments" ); Mapping::assign( m_map, src_view.m_map, arg0 , args... 
); @@ -1243,7 +1334,7 @@ public: #endif //------------------------------------------------------------ - Kokkos::Experimental::Impl::SharedAllocationRecord<> * + Kokkos::Impl::SharedAllocationRecord<> * record = m_map.allocate_shared( prop , arg_layout ); //------------------------------------------------------------ @@ -1324,7 +1415,7 @@ public: explicit inline View( const Label & arg_label , typename std::enable_if< - Kokkos::Experimental::Impl::is_view_label