//@HEADER // ************************************************************************ // // Kokkos v. 4.0 // Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. // See https://kokkos.org/LICENSE for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER #include #include #include #include #include #include #include #include // FIXME: Benchmark_Context.hpp should be moved to a common location #include "../../core/perf_test/Benchmark_Context.hpp" namespace { namespace KE = Kokkos::Experimental; using ExecSpace = Kokkos::DefaultExecutionSpace; using HostExecSpace = Kokkos::DefaultHostExecutionSpace; // A tag struct to identify when inclusive scan with the implicit sum // based binary operation needs to be called. template struct ImpSumBinOp; template struct SumFunctor { KOKKOS_FUNCTION ValueType operator()(const ValueType& a, const ValueType& b) const { return (a + b); } }; template struct MaxFunctor { KOKKOS_FUNCTION ValueType operator()(const ValueType& a, const ValueType& b) const { if (a > b) return a; else return b; } }; // Helper to obtain last element of a view template T obtain_last_elem(const Kokkos::View& v) { T last_element; Kokkos::deep_copy(last_element, Kokkos::subview(v, v.extent(0) - 1)); return last_element; } // Helper to allocate input and output views template auto prepare_views(const std::size_t kProbSize) { Kokkos::View in{"input", kProbSize}; Kokkos::View out{"output", kProbSize}; auto h_in = Kokkos::create_mirror_view(in); for (std::size_t i = 0; i < kProbSize; ++i) { h_in(i) = i; } Kokkos::deep_copy(in, h_in); return std::make_tuple(in, out, h_in); } // Perform scan with a reference implementation template > T ref_scan(const ViewType& h_in, ScanFunctor scan_functor = ScanFunctor()) { std::size_t view_size = h_in.extent(0); Kokkos::View h_out("output", view_size); // FIXME: We have GCC 8.4.0 based check in our ORNL Jenkins CI. // std::inclusive_scan is available only from GCC 9.3. Since, GCC 9.1 // std::inclusive_scan that takes execution policy is available. However, // there is error with header before GCC 10.1. h_out(0) = h_in(0); for (std::size_t i = 1; i < view_size; ++i) { h_out(i) = scan_functor(h_in(i), h_out(i - 1)); } return h_out(view_size - 1); } // Inclusive Scan with default binary operation (sum) or user provided functor // Note: The nature of the functor must be compatible with the // elements in the input and output views template class ScanFunctor = ImpSumBinOp> auto inclusive_scan(const Kokkos::View& in, const Kokkos::View& out, T res_check) { ExecSpace().fence(); Kokkos::Timer timer; if constexpr (std::is_same_v, ImpSumBinOp>) { KE::inclusive_scan("Default scan", ExecSpace(), KE::cbegin(in), KE::cend(in), KE::begin(out)); } else { KE::inclusive_scan("Scan using a functor", ExecSpace(), KE::cbegin(in), KE::cend(in), KE::begin(out), ScanFunctor()); } ExecSpace().fence(); double time_scan = timer.seconds(); T res_scan = obtain_last_elem(out); bool passed = (res_check == res_scan); return std::make_tuple(time_scan, passed); } // Benchmark: Inclusive Scan with default binary operation (sum) // or user provided functor template class ScanFunctor = ImpSumBinOp> void BM_inclusive_scan(benchmark::State& state) { const std::size_t kProbSize = state.range(0); auto [in, out, h_in] = prepare_views(kProbSize); T res_check; if constexpr (std::is_same_v, ImpSumBinOp>) { res_check = ref_scan(h_in); } else { res_check = ref_scan(h_in, ScanFunctor()); } double time_scan = 0.; bool passed = false; for (auto _ : state) { if constexpr (std::is_same_v, ImpSumBinOp>) { std::tie(time_scan, passed) = inclusive_scan(in, out, res_check); } else { std::tie(time_scan, passed) = inclusive_scan(in, out, res_check); } KokkosBenchmark::report_results(state, in, 2, time_scan); state.counters["Passed"] = passed; } } constexpr std::size_t PROB_SIZE = 100'000'000; } // anonymous namespace // FIXME: Add logic to pass min. warm-up time. Also, the value should be set // by the user. Say, via the environment variable BENCHMARK_MIN_WARMUP_TIME. BENCHMARK(BM_inclusive_scan)->Arg(PROB_SIZE)->UseManualTime(); BENCHMARK(BM_inclusive_scan)->Arg(PROB_SIZE)->UseManualTime(); BENCHMARK(BM_inclusive_scan)->Arg(PROB_SIZE)->UseManualTime(); BENCHMARK(BM_inclusive_scan) ->Arg(PROB_SIZE) ->UseManualTime(); BENCHMARK(BM_inclusive_scan) ->Arg(PROB_SIZE) ->UseManualTime(); BENCHMARK(BM_inclusive_scan) ->Arg(PROB_SIZE) ->UseManualTime(); BENCHMARK(BM_inclusive_scan) ->Arg(PROB_SIZE) ->UseManualTime(); BENCHMARK(BM_inclusive_scan) ->Arg(PROB_SIZE) ->UseManualTime(); BENCHMARK(BM_inclusive_scan) ->Arg(PROB_SIZE) ->UseManualTime();