Update Kokkos library in LAMMPS to v4.1.0
This commit is contained in:
@ -16,15 +16,20 @@
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
#include <Benchmark_Context.hpp>
|
||||
#include "Benchmark_Context.hpp"
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include "PerfTest_Category.hpp"
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
Kokkos::initialize(argc, argv);
|
||||
benchmark::Initialize(&argc, argv);
|
||||
benchmark::SetDefaultTimeUnit(benchmark::kSecond);
|
||||
KokkosBenchmark::add_benchmark_context(true);
|
||||
|
||||
(void)Test::command_line_num_args(argc);
|
||||
(void)Test::command_line_arg(0, argv);
|
||||
|
||||
benchmark::RunSpecifiedBenchmarks();
|
||||
|
||||
benchmark::Shutdown();
|
||||
|
||||
81
lib/kokkos/core/perf_test/Benchmark_Context.cpp
Normal file
81
lib/kokkos/core/perf_test/Benchmark_Context.cpp
Normal file
@ -0,0 +1,81 @@
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 4.0
|
||||
// Copyright (2022) National Technology & Engineering
|
||||
// Solutions of Sandia, LLC (NTESS).
|
||||
//
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://kokkos.org/LICENSE for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include "Benchmark_Context.hpp"
|
||||
|
||||
namespace KokkosBenchmark {
|
||||
|
||||
/**
 * \brief Remove unwanted spaces and colon signs from both ends of the input
 * string. Characters in the interior of the string are left untouched.
 *
 * \param str text to be trimmed
 * \return the part of \p str between its first and last character that is
 * neither a space nor a colon; an empty string if \p str is empty or holds
 * nothing but spaces and colons.
 */
std::string remove_unwanted_characters(const std::string& str) {
  const auto from = str.find_first_not_of(" :");
  if (from == std::string::npos) {
    // empty input, or nothing but unwanted characters
    return "";
  }

  // At least one wanted character exists, so this search cannot fail.
  const auto to = str.find_last_not_of(" :");

  // substr() expects (position, count) rather than (first, last). The count
  // has to be measured from `from`; passing `to + 1` would keep trailing
  // unwanted characters whenever leading ones were skipped.
  return str.substr(from, to - from + 1);
}
|
||||
|
||||
/**
|
||||
* \brief Extract all key:value pairs from kokkos configuration and add it to
|
||||
* the benchmark context
|
||||
*/
|
||||
void add_kokkos_configuration(bool verbose) {
|
||||
std::ostringstream msg;
|
||||
Kokkos::print_configuration(msg, verbose);
|
||||
|
||||
// Iterate over lines returned from kokkos and extract key:value pairs
|
||||
std::stringstream ss{msg.str()};
|
||||
for (std::string line; std::getline(ss, line, '\n');) {
|
||||
auto found = line.find_first_of(':');
|
||||
if (found != std::string::npos) {
|
||||
auto val = remove_unwanted_characters(line.substr(found + 1));
|
||||
// Ignore line without value, for example a category name
|
||||
if (!val.empty()) {
|
||||
benchmark::AddCustomContext(
|
||||
remove_unwanted_characters(line.substr(0, found)), val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void add_git_info() {
|
||||
if (!Kokkos::Impl::GIT_BRANCH.empty()) {
|
||||
benchmark::AddCustomContext("GIT_BRANCH", Kokkos::Impl::GIT_BRANCH);
|
||||
benchmark::AddCustomContext("GIT_COMMIT_HASH",
|
||||
Kokkos::Impl::GIT_COMMIT_HASH);
|
||||
benchmark::AddCustomContext("GIT_CLEAN_STATUS",
|
||||
Kokkos::Impl::GIT_CLEAN_STATUS);
|
||||
benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION",
|
||||
Kokkos::Impl::GIT_COMMIT_DESCRIPTION);
|
||||
benchmark::AddCustomContext("GIT_COMMIT_DATE",
|
||||
Kokkos::Impl::GIT_COMMIT_DATE);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * \brief Fill the benchmark context data: the Kokkos configuration first,
 * the git information second.
 *
 * \param verbose passed on to add_kokkos_configuration()
 */
void add_benchmark_context(bool verbose) {
  add_kokkos_configuration(verbose);  // Kokkos configuration entries
  add_git_info();                     // git related entries
}
|
||||
|
||||
} // namespace KokkosBenchmark
|
||||
@ -26,62 +26,34 @@
|
||||
|
||||
namespace KokkosBenchmark {
|
||||
|
||||
/// \brief Remove unwanted spaces and colon signs from input string. In case of
|
||||
/// invalid input it will return an empty string.
|
||||
std::string remove_unwanted_characters(std::string str) {
|
||||
auto from = str.find_first_not_of(" :");
|
||||
auto to = str.find_last_not_of(" :");
|
||||
/**
|
||||
* \brief Gather all context information and add it to benchmark context data
|
||||
*/
|
||||
void add_benchmark_context(bool verbose = false);
|
||||
|
||||
if (from == std::string::npos || to == std::string::npos) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// return extracted part of string without unwanted spaces and colon signs
|
||||
return str.substr(from, to + 1);
|
||||
/**
 * \brief Build the figure-of-merit form of a label by putting the "FOM: "
 * prefix in front of it.
 */
inline std::string benchmark_fom(const std::string& label) {
  std::string marked{"FOM: "};
  marked += label;
  return marked;
}
|
||||
|
||||
/// \brief Extract all key:value pairs from kokkos configuration and add it to
|
||||
/// the benchmark context
|
||||
void add_kokkos_configuration(bool verbose) {
|
||||
std::ostringstream msg;
|
||||
Kokkos::print_configuration(msg, verbose);
|
||||
/**
|
||||
* \brief Report throughput and amount of data processed for simple View
|
||||
* operations
|
||||
*/
|
||||
template <class ViewType>
|
||||
void report_results(benchmark::State& state, ViewType view, int data_ratio,
|
||||
double time) {
|
||||
// data processed in megabytes
|
||||
const double data_processed = data_ratio * view.size() *
|
||||
sizeof(typename ViewType::value_type) /
|
||||
1'000'000;
|
||||
|
||||
// Iterate over lines returned from kokkos and extract key:value pairs
|
||||
std::stringstream ss{msg.str()};
|
||||
for (std::string line; std::getline(ss, line, '\n');) {
|
||||
auto found = line.find_first_of(':');
|
||||
if (found != std::string::npos) {
|
||||
auto val = remove_unwanted_characters(line.substr(found + 1));
|
||||
// Ignore line without value, for example a category name
|
||||
if (!val.empty()) {
|
||||
benchmark::AddCustomContext(
|
||||
remove_unwanted_characters(line.substr(0, found)), val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Add all data related to git to benchmark context
|
||||
void add_git_info() {
|
||||
if (!Kokkos::Impl::GIT_BRANCH.empty()) {
|
||||
benchmark::AddCustomContext("GIT_BRANCH", Kokkos::Impl::GIT_BRANCH);
|
||||
benchmark::AddCustomContext("GIT_COMMIT_HASH",
|
||||
Kokkos::Impl::GIT_COMMIT_HASH);
|
||||
benchmark::AddCustomContext("GIT_CLEAN_STATUS",
|
||||
Kokkos::Impl::GIT_CLEAN_STATUS);
|
||||
benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION",
|
||||
Kokkos::Impl::GIT_COMMIT_DESCRIPTION);
|
||||
benchmark::AddCustomContext("GIT_COMMIT_DATE",
|
||||
Kokkos::Impl::GIT_COMMIT_DATE);
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Gather all context information and add it to benchmark context data
|
||||
void add_benchmark_context(bool verbose = false) {
|
||||
// Add Kokkos configuration to benchmark context data
|
||||
add_kokkos_configuration(verbose);
|
||||
// Add git information to benchmark context data
|
||||
add_git_info();
|
||||
state.SetIterationTime(time);
|
||||
state.counters["MB"] = benchmark::Counter(data_processed);
|
||||
state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(
|
||||
data_processed / 1'000, benchmark::Counter::kIsIterationInvariantRate);
|
||||
}
|
||||
|
||||
} // namespace KokkosBenchmark
|
||||
|
||||
@ -1,108 +1,31 @@
|
||||
|
||||
#INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
#INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
|
||||
# warning: PerfTest_CustomReduction.cpp uses
|
||||
# ../../algorithms/src/Kokkos_Random.hpp
|
||||
# we'll just allow it to be included, but note
|
||||
# that in TriBITS KokkosAlgorithms can be disabled...
|
||||
#INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")
|
||||
|
||||
# FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests.
|
||||
# FIXME_OPENACC - temporarily disabled due to unimplemented features
|
||||
IF ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
|
||||
RETURN()
|
||||
ENDIF()
|
||||
|
||||
# all PerformanceTest_* executables are part of regular tests
|
||||
# TODO: finish converting these into benchmarks (in progress)
|
||||
IF(KOKKOS_ENABLE_TESTS)
|
||||
IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL)
|
||||
KOKKOS_ADD_EXECUTABLE (
|
||||
PerformanceTest_SharedSpace
|
||||
SOURCES test_sharedSpace.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
SET(SOURCES
|
||||
PerfTestMain.cpp
|
||||
PerfTestGramSchmidt.cpp
|
||||
PerfTestHexGrad.cpp
|
||||
PerfTest_CustomReduction.cpp
|
||||
PerfTest_ExecSpacePartitioning.cpp
|
||||
PerfTest_ViewAllocate.cpp
|
||||
PerfTest_ViewFill_123.cpp
|
||||
PerfTest_ViewFill_45.cpp
|
||||
PerfTest_ViewFill_6.cpp
|
||||
PerfTest_ViewFill_7.cpp
|
||||
PerfTest_ViewFill_8.cpp
|
||||
PerfTest_ViewResize_123.cpp
|
||||
PerfTest_ViewResize_45.cpp
|
||||
PerfTest_ViewResize_6.cpp
|
||||
PerfTest_ViewResize_7.cpp
|
||||
PerfTest_ViewResize_8.cpp
|
||||
)
|
||||
KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
IF(Kokkos_ENABLE_OPENMPTARGET)
|
||||
# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction
|
||||
LIST(REMOVE_ITEM SOURCES
|
||||
PerfTestGramSchmidt.cpp
|
||||
PerfTest_CustomReduction.cpp
|
||||
PerfTest_ExecSpacePartitioning.cpp
|
||||
)
|
||||
IF(NOT Kokkos_ENABLE_OPENMPTARGET)
|
||||
# FIXME OPENMPTARGET needs tasking
|
||||
KOKKOS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerformanceTest_TaskDag
|
||||
SOURCES test_taskdag.cpp
|
||||
CATEGORIES PERFORMANCE
|
||||
)
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL)
|
||||
KOKKOS_ADD_EXECUTABLE (
|
||||
PerformanceTest_SharedSpace
|
||||
SOURCES test_sharedSpace.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
# Per #374, we always want to build this test, but we only want to run
|
||||
# it as a PERFORMANCE test. That's why we separate building the test
|
||||
# from running the test.
|
||||
|
||||
#leave these as basic includes for now
|
||||
#I don't need anything transitive
|
||||
KOKKOS_INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")
|
||||
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
# This test currently times out for MSVC
|
||||
IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC")
|
||||
KOKKOS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerfTestExec
|
||||
SOURCES ${SOURCES}
|
||||
CATEGORIES PERFORMANCE
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
KOKKOS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerformanceTest_Atomic
|
||||
SOURCES test_atomic.cpp
|
||||
CATEGORIES PERFORMANCE
|
||||
)
|
||||
|
||||
IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA)
|
||||
KOKKOS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerformanceTest_Atomic_MinMax
|
||||
SOURCES test_atomic_minmax_simple.cpp
|
||||
CATEGORIES PERFORMANCE
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
# FIXME_NVHPC
|
||||
IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
|
||||
KOKKOS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerformanceTest_Mempool
|
||||
SOURCES test_mempool.cpp
|
||||
CATEGORIES PERFORMANCE
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF(NOT Kokkos_ENABLE_OPENMPTARGET)
|
||||
# FIXME OPENMPTARGET needs tasking
|
||||
KOKKOS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerformanceTest_TaskDag
|
||||
SOURCES test_taskdag.cpp
|
||||
CATEGORIES PERFORMANCE
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
|
||||
IF(NOT Kokkos_ENABLE_BENCHMARKS)
|
||||
RETURN()
|
||||
ENDIF()
|
||||
@ -111,6 +34,7 @@ IF (KOKKOS_HAS_TRILINOS)
|
||||
message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos")
|
||||
ENDIF()
|
||||
|
||||
# Find or download google/benchmark library
|
||||
find_package(benchmark QUIET)
|
||||
IF(benchmark_FOUND)
|
||||
MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}")
|
||||
@ -119,7 +43,7 @@ ELSE()
|
||||
include(FetchContent)
|
||||
SET(BENCHMARK_ENABLE_TESTING OFF)
|
||||
|
||||
list(APPEND CMAKE_MESSAGE_INDENT " ")
|
||||
list(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ")
|
||||
FetchContent_Declare(
|
||||
googlebenchmark
|
||||
URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz
|
||||
@ -128,8 +52,6 @@ ELSE()
|
||||
FetchContent_MakeAvailable(googlebenchmark)
|
||||
list(POP_BACK CMAKE_MESSAGE_INDENT)
|
||||
|
||||
include_directories(${benchmark_SOURCE_DIR}/include)
|
||||
|
||||
# Suppress clang-tidy diagnostics on code that we do not have control over
|
||||
IF(CMAKE_CXX_CLANG_TIDY)
|
||||
SET_TARGET_PROPERTIES(benchmark PROPERTIES CXX_CLANG_TIDY "")
|
||||
@ -157,6 +79,10 @@ FUNCTION(KOKKOS_ADD_BENCHMARK NAME)
|
||||
ENDIF()
|
||||
|
||||
SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME})
|
||||
LIST(APPEND BENCHMARK_SOURCES
|
||||
BenchmarkMain.cpp
|
||||
Benchmark_Context.cpp
|
||||
)
|
||||
|
||||
ADD_EXECUTABLE(
|
||||
${BENCHMARK_NAME}
|
||||
@ -166,6 +92,11 @@ FUNCTION(KOKKOS_ADD_BENCHMARK NAME)
|
||||
${BENCHMARK_NAME}
|
||||
PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version
|
||||
)
|
||||
TARGET_INCLUDE_DIRECTORIES(
|
||||
${BENCHMARK_NAME}
|
||||
SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include
|
||||
)
|
||||
|
||||
FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES})
|
||||
SET_SOURCE_FILES_PROPERTIES(
|
||||
${SOURCE_FILE}
|
||||
@ -188,7 +119,11 @@ ENDFUNCTION()
|
||||
|
||||
SET(
|
||||
BENCHMARK_SOURCES
|
||||
BenchmarkMain.cpp
|
||||
PerfTestGramSchmidt.cpp
|
||||
PerfTest_CustomReduction.cpp
|
||||
PerfTest_ExecSpacePartitioning.cpp
|
||||
PerfTestHexGrad.cpp
|
||||
PerfTest_ViewAllocate.cpp
|
||||
PerfTest_ViewCopy_a123.cpp
|
||||
PerfTest_ViewCopy_b123.cpp
|
||||
PerfTest_ViewCopy_c123.cpp
|
||||
@ -210,9 +145,50 @@ SET(
|
||||
PerfTest_ViewCopy_c8.cpp
|
||||
PerfTest_ViewCopy_d8.cpp
|
||||
PerfTest_ViewCopy_Raw.cpp
|
||||
PerfTest_ViewFill_123.cpp
|
||||
PerfTest_ViewFill_45.cpp
|
||||
PerfTest_ViewFill_6.cpp
|
||||
PerfTest_ViewFill_7.cpp
|
||||
PerfTest_ViewFill_8.cpp
|
||||
PerfTest_ViewFill_Raw.cpp
|
||||
PerfTest_ViewResize_123.cpp
|
||||
PerfTest_ViewResize_45.cpp
|
||||
PerfTest_ViewResize_6.cpp
|
||||
PerfTest_ViewResize_7.cpp
|
||||
PerfTest_ViewResize_8.cpp
|
||||
PerfTest_ViewResize_Raw.cpp
|
||||
)
|
||||
|
||||
IF(Kokkos_ENABLE_OPENMPTARGET)
|
||||
# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction
|
||||
LIST(REMOVE_ITEM BENCHMARK_SOURCES
|
||||
PerfTestGramSchmidt.cpp
|
||||
PerfTest_CustomReduction.cpp
|
||||
PerfTest_ExecSpacePartitioning.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
KOKKOS_ADD_BENCHMARK(
|
||||
PerformanceTest_Benchmark
|
||||
SOURCES ${BENCHMARK_SOURCES}
|
||||
)
|
||||
|
||||
IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA)
|
||||
KOKKOS_ADD_BENCHMARK(
|
||||
Benchmark_Atomic_MinMax
|
||||
SOURCES test_atomic_minmax_simple.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
# FIXME_NVHPC
|
||||
IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
|
||||
KOKKOS_ADD_BENCHMARK(
|
||||
PerformanceTest_Mempool
|
||||
SOURCES test_mempool.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
KOKKOS_ADD_BENCHMARK(
|
||||
PerformanceTest_Atomic
|
||||
SOURCES test_atomic.cpp
|
||||
)
|
||||
|
||||
@ -14,7 +14,7 @@ else
|
||||
CXX = g++
|
||||
endif
|
||||
|
||||
CXXFLAGS = -O3
|
||||
CXXFLAGS = -O3
|
||||
#CXXFLAGS += -DGENERIC_REDUCER
|
||||
LINK ?= $(CXX)
|
||||
LDFLAGS ?=
|
||||
@ -29,43 +29,12 @@ TARGETS =
|
||||
|
||||
#
|
||||
|
||||
OBJ_PERF = PerfTestMain.o gtest-all.o
|
||||
OBJ_PERF += PerfTest_ExecSpacePartitioning.o
|
||||
OBJ_PERF += PerfTestGramSchmidt.o
|
||||
OBJ_PERF += PerfTestHexGrad.o
|
||||
OBJ_PERF += PerfTest_CustomReduction.o
|
||||
OBJ_PERF += PerfTest_ViewAllocate.o
|
||||
OBJ_PERF += PerfTest_ViewFill_123.o PerfTest_ViewFill_45.o PerfTest_ViewFill_6.o PerfTest_ViewFill_7.o PerfTest_ViewFill_8.o
|
||||
OBJ_PERF += PerfTest_ViewResize_123.o PerfTest_ViewResize_45.o PerfTest_ViewResize_6.o PerfTest_ViewResize_7.o PerfTest_ViewResize_8.o
|
||||
TARGETS += KokkosCore_PerformanceTest
|
||||
TEST_TARGETS += test-performance
|
||||
|
||||
#
|
||||
|
||||
OBJ_ATOMICS = test_atomic.o
|
||||
TARGETS += KokkosCore_PerformanceTest_Atomics
|
||||
TEST_TARGETS += test-atomic
|
||||
|
||||
#
|
||||
|
||||
OBJ_MEMPOOL = test_mempool.o
|
||||
TARGETS += KokkosCore_PerformanceTest_Mempool
|
||||
TEST_TARGETS += test-mempool
|
||||
|
||||
#
|
||||
|
||||
OBJ_TASKDAG = test_taskdag.o
|
||||
OBJ_TASKDAG = test_taskdag.o
|
||||
TARGETS += KokkosCore_PerformanceTest_TaskDAG
|
||||
TEST_TARGETS += test-taskdag
|
||||
|
||||
#
|
||||
|
||||
OBJ_ATOMICS_MINMAX = test_atomic_minmax_simple.o
|
||||
TARGETS += KokkosCore_PerformanceTest_Atomics_MinMax
|
||||
TEST_TARGETS += test-atomic-minmax
|
||||
|
||||
#
|
||||
|
||||
KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest
|
||||
|
||||
|
||||
@ -25,8 +25,8 @@ template <class Type>
|
||||
struct Dot {
|
||||
using execution_space = typename Type::execution_space;
|
||||
|
||||
static_assert(static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
|
||||
"Dot static_assert Fail: Rank != 1");
|
||||
static_assert(static_cast<unsigned>(Type::rank) == static_cast<unsigned>(1),
|
||||
"Dot static_assert Fail: rank != 1");
|
||||
|
||||
using value_type = double;
|
||||
|
||||
@ -56,8 +56,8 @@ template <class Type>
|
||||
struct DotSingle {
|
||||
using execution_space = typename Type::execution_space;
|
||||
|
||||
static_assert(static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
|
||||
"DotSingle static_assert Fail: Rank != 1");
|
||||
static_assert(static_cast<unsigned>(Type::rank) == static_cast<unsigned>(1),
|
||||
"DotSingle static_assert Fail: rank != 1");
|
||||
|
||||
using value_type = double;
|
||||
|
||||
@ -88,13 +88,13 @@ template <class ScalarType, class VectorType>
|
||||
struct Scale {
|
||||
using execution_space = typename VectorType::execution_space;
|
||||
|
||||
static_assert(static_cast<unsigned>(ScalarType::Rank) ==
|
||||
static_assert(static_cast<unsigned>(ScalarType::rank) ==
|
||||
static_cast<unsigned>(0),
|
||||
"Scale static_assert Fail: ScalarType::Rank != 0");
|
||||
"Scale static_assert Fail: ScalarType::rank != 0");
|
||||
|
||||
static_assert(static_cast<unsigned>(VectorType::Rank) ==
|
||||
static_assert(static_cast<unsigned>(VectorType::rank) ==
|
||||
static_cast<unsigned>(1),
|
||||
"Scale static_assert Fail: VectorType::Rank != 1");
|
||||
"Scale static_assert Fail: VectorType::rank != 1");
|
||||
|
||||
#if 1
|
||||
typename ScalarType::const_type alpha;
|
||||
@ -115,17 +115,17 @@ template <class ScalarType, class ConstVectorType, class VectorType>
|
||||
struct AXPBY {
|
||||
using execution_space = typename VectorType::execution_space;
|
||||
|
||||
static_assert(static_cast<unsigned>(ScalarType::Rank) ==
|
||||
static_assert(static_cast<unsigned>(ScalarType::rank) ==
|
||||
static_cast<unsigned>(0),
|
||||
"AXPBY static_assert Fail: ScalarType::Rank != 0");
|
||||
"AXPBY static_assert Fail: ScalarType::rank != 0");
|
||||
|
||||
static_assert(static_cast<unsigned>(ConstVectorType::Rank) ==
|
||||
static_assert(static_cast<unsigned>(ConstVectorType::rank) ==
|
||||
static_cast<unsigned>(1),
|
||||
"AXPBY static_assert Fail: ConstVectorType::Rank != 1");
|
||||
"AXPBY static_assert Fail: ConstVectorType::rank != 1");
|
||||
|
||||
static_assert(static_cast<unsigned>(VectorType::Rank) ==
|
||||
static_assert(static_cast<unsigned>(VectorType::rank) ==
|
||||
static_cast<unsigned>(1),
|
||||
"AXPBY static_assert Fail: VectorType::Rank != 1");
|
||||
"AXPBY static_assert Fail: VectorType::rank != 1");
|
||||
|
||||
#if 1
|
||||
typename ScalarType::const_type alpha, beta;
|
||||
|
||||
@ -15,11 +15,11 @@
|
||||
//@HEADER
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
#include <PerfTest_Category.hpp>
|
||||
#include <benchmark/benchmark.h>
|
||||
#include "PerfTest_Category.hpp"
|
||||
|
||||
#include <cmath>
|
||||
#include <PerfTestBlasKernels.hpp>
|
||||
#include "PerfTestBlasKernels.hpp"
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
@ -137,87 +137,61 @@ struct ModifiedGramSchmidt {
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
static double test(const size_type length, const size_type count,
|
||||
const size_t iter = 1) {
|
||||
static double test(const size_type length, const size_type count) {
|
||||
multivector_type Q_("Q", length, count);
|
||||
multivector_type R_("R", count, count);
|
||||
|
||||
typename multivector_type::HostMirror A = Kokkos::create_mirror(Q_);
|
||||
|
||||
// Create and fill A on the host
|
||||
|
||||
for (size_type j = 0; j < count; ++j) {
|
||||
for (size_type i = 0; i < length; ++i) {
|
||||
A(i, j) = (i + 1) * (j + 1);
|
||||
}
|
||||
}
|
||||
|
||||
double dt_min = 0;
|
||||
Kokkos::deep_copy(Q_, A);
|
||||
|
||||
for (size_t i = 0; i < iter; ++i) {
|
||||
Kokkos::deep_copy(Q_, A);
|
||||
// A = Q * R
|
||||
const double dt = factorization(Q_, R_);
|
||||
|
||||
// A = Q * R
|
||||
|
||||
const double dt = factorization(Q_, R_);
|
||||
|
||||
if (0 == i)
|
||||
dt_min = dt;
|
||||
else
|
||||
dt_min = dt < dt_min ? dt : dt_min;
|
||||
}
|
||||
|
||||
return dt_min;
|
||||
return dt;
|
||||
}
|
||||
};
|
||||
|
||||
template <class DeviceType>
|
||||
void run_test_gramschmidt(int exp_beg, int exp_end, int num_trials,
|
||||
const char deviceTypeName[]) {
|
||||
std::string label_gramschmidt;
|
||||
label_gramschmidt.append("\"GramSchmidt< double , ");
|
||||
label_gramschmidt.append(deviceTypeName);
|
||||
label_gramschmidt.append(" >\"");
|
||||
template <class Scalar>
|
||||
static void GramSchmidt(benchmark::State& state) {
|
||||
const int parallel_work_length = state.range(0);
|
||||
|
||||
for (int i = exp_beg; i < exp_end; ++i) {
|
||||
double min_seconds = 0.0;
|
||||
double max_seconds = 0.0;
|
||||
double avg_seconds = 0.0;
|
||||
for (auto _ : state) {
|
||||
const double seconds =
|
||||
ModifiedGramSchmidt<Scalar, Kokkos::DefaultExecutionSpace>::test(
|
||||
parallel_work_length, 32);
|
||||
|
||||
const int parallel_work_length = 1 << i;
|
||||
|
||||
for (int j = 0; j < num_trials; ++j) {
|
||||
const double seconds = ModifiedGramSchmidt<double, DeviceType>::test(
|
||||
parallel_work_length, 32);
|
||||
|
||||
if (0 == j) {
|
||||
min_seconds = seconds;
|
||||
max_seconds = seconds;
|
||||
} else {
|
||||
if (seconds < min_seconds) min_seconds = seconds;
|
||||
if (seconds > max_seconds) max_seconds = seconds;
|
||||
}
|
||||
avg_seconds += seconds;
|
||||
}
|
||||
avg_seconds /= num_trials;
|
||||
|
||||
std::cout << label_gramschmidt << " , " << parallel_work_length << " , "
|
||||
<< min_seconds << " , " << (min_seconds / parallel_work_length)
|
||||
<< ", " << avg_seconds << std::endl;
|
||||
state.SetIterationTime(seconds);
|
||||
state.counters["Count"] = benchmark::Counter(parallel_work_length);
|
||||
state.counters["Time normalized"] =
|
||||
benchmark::Counter(seconds / parallel_work_length);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(default_exec, gramschmidt) {
|
||||
int exp_beg = 10;
|
||||
int exp_end = 20;
|
||||
int num_trials = 5;
|
||||
|
||||
if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1));
|
||||
if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2));
|
||||
if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
|
||||
|
||||
EXPECT_NO_THROW(run_test_gramschmidt<Kokkos::DefaultExecutionSpace>(
|
||||
exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name()));
|
||||
}
|
||||
// FIXME_SYCL SYCL+Cuda reports "an illegal memory access was encountered"
|
||||
#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
|
||||
BENCHMARK(GramSchmidt<double>)
|
||||
->ArgName("Count")
|
||||
->ArgsProduct({
|
||||
benchmark::CreateRange(1 << 10, 1 << 18, 2),
|
||||
})
|
||||
->UseManualTime()
|
||||
->Iterations(5);
|
||||
#else
|
||||
BENCHMARK(GramSchmidt<double>)
|
||||
->ArgName("Count")
|
||||
->ArgsProduct({
|
||||
benchmark::CreateRange(1 << 10, 1 << 19, 2),
|
||||
})
|
||||
->UseManualTime()
|
||||
->Iterations(5);
|
||||
#endif
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -15,8 +15,9 @@
|
||||
//@HEADER
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
#include <PerfTest_Category.hpp>
|
||||
#include <benchmark/benchmark.h>
|
||||
#include "Benchmark_Context.hpp"
|
||||
#include "PerfTest_Category.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
@ -195,78 +196,43 @@ struct HexGrad {
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
static double test(const int count, const int iter = 1) {
|
||||
static double test(const int count) {
|
||||
elem_coord_type coord("coord", count);
|
||||
elem_grad_type grad("grad", count);
|
||||
|
||||
// Execute the parallel kernels on the arrays:
|
||||
|
||||
double dt_min = 0;
|
||||
|
||||
Kokkos::parallel_for(count, Init(coord));
|
||||
execution_space().fence();
|
||||
|
||||
for (int i = 0; i < iter; ++i) {
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::parallel_for(count, HexGrad<execution_space>(coord, grad));
|
||||
execution_space().fence();
|
||||
const double dt = timer.seconds();
|
||||
if (0 == i)
|
||||
dt_min = dt;
|
||||
else
|
||||
dt_min = dt < dt_min ? dt : dt_min;
|
||||
}
|
||||
|
||||
return dt_min;
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::parallel_for(count, HexGrad<execution_space>(coord, grad));
|
||||
execution_space().fence();
|
||||
return timer.seconds();
|
||||
}
|
||||
};
|
||||
|
||||
template <class DeviceType>
|
||||
void run_test_hexgrad(int exp_beg, int exp_end, int num_trials,
|
||||
const char deviceTypeName[]) {
|
||||
std::string label_hexgrad;
|
||||
label_hexgrad.append("\"HexGrad< double , ");
|
||||
label_hexgrad.append(deviceTypeName);
|
||||
label_hexgrad.append(" >\"");
|
||||
template <class CoordScalarType>
|
||||
static void HexGrad_Benchmark(benchmark::State& state) {
|
||||
const auto parallel_work_length = state.range(0);
|
||||
|
||||
for (int i = exp_beg; i < exp_end; ++i) {
|
||||
double min_seconds = 0.0;
|
||||
double max_seconds = 0.0;
|
||||
double avg_seconds = 0.0;
|
||||
for (auto _ : state) {
|
||||
const auto time =
|
||||
HexGrad<Kokkos::DefaultExecutionSpace, CoordScalarType>::test(
|
||||
parallel_work_length);
|
||||
|
||||
const int parallel_work_length = 1 << i;
|
||||
|
||||
for (int j = 0; j < num_trials; ++j) {
|
||||
const double seconds = HexGrad<DeviceType>::test(parallel_work_length);
|
||||
|
||||
if (0 == j) {
|
||||
min_seconds = seconds;
|
||||
max_seconds = seconds;
|
||||
} else {
|
||||
if (seconds < min_seconds) min_seconds = seconds;
|
||||
if (seconds > max_seconds) max_seconds = seconds;
|
||||
}
|
||||
avg_seconds += seconds;
|
||||
}
|
||||
avg_seconds /= num_trials;
|
||||
|
||||
std::cout << label_hexgrad << " , " << parallel_work_length << " , "
|
||||
<< min_seconds << " , " << (min_seconds / parallel_work_length)
|
||||
<< avg_seconds << std::endl;
|
||||
state.SetIterationTime(time);
|
||||
state.counters["Count"] = benchmark::Counter(parallel_work_length);
|
||||
state.counters["Time normalized"] =
|
||||
benchmark::Counter(time / parallel_work_length);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(default_exec, hexgrad) {
|
||||
int exp_beg = 10;
|
||||
int exp_end = 20;
|
||||
int num_trials = 5;
|
||||
|
||||
if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1));
|
||||
if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2));
|
||||
if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
|
||||
|
||||
EXPECT_NO_THROW(run_test_hexgrad<Kokkos::DefaultExecutionSpace>(
|
||||
exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name()));
|
||||
}
|
||||
BENCHMARK(HexGrad_Benchmark<double>)
|
||||
->ArgName("count")
|
||||
->ArgsProduct({
|
||||
benchmark::CreateRange(1 << 10, 1 << 19, 2),
|
||||
})
|
||||
->UseManualTime()
|
||||
->Iterations(5);
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -18,24 +18,7 @@
|
||||
#include <cstdlib>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
namespace Test {
|
||||
int command_line_num_args(int n = 0) {
|
||||
static int n_args = 0;
|
||||
if (n > 0) n_args = n;
|
||||
return n_args;
|
||||
}
|
||||
|
||||
const char* command_line_arg(int k, char** input_args = nullptr) {
|
||||
static char** args;
|
||||
if (input_args != nullptr) args = input_args;
|
||||
if (command_line_num_args() > k)
|
||||
return args[k];
|
||||
else
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
} // namespace Test
|
||||
#include <PerfTest_Category.hpp>
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
|
||||
@ -17,12 +17,22 @@
|
||||
#ifndef KOKKOS_TEST_PERFTEST_CAT_HPP
|
||||
#define KOKKOS_TEST_PERFTEST_CAT_HPP
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
namespace Test {
|
||||
|
||||
extern int command_line_num_args(int n = 0);
|
||||
extern const char* command_line_arg(int k, char** input_args = nullptr);
|
||||
/// Records and reports the number of command line arguments.
///
/// A positive \p n replaces the recorded count; any other value leaves it
/// untouched. The recorded count (after a possible update) is returned, which
/// is 0 until a positive value has been passed in.
inline int command_line_num_args(int n = 0) {
  static int n_args = 0;
  if (n > 0) n_args = n;
  return n_args;
}

/// Records the argument vector and returns its k-th entry.
///
/// A non-null \p input_args replaces the recorded vector. The entry at index
/// \p k is returned when a vector has been recorded and \p k lies in
/// [0, command_line_num_args()); otherwise nullptr is returned.
inline const char* command_line_arg(int k, char** input_args = nullptr) {
  static char** args = nullptr;
  if (input_args != nullptr) args = input_args;

  // Never index a vector that has not been recorded yet, and never index
  // with a negative k: a bare "count > k" check lets both cases through.
  if (args == nullptr || k < 0 || k >= command_line_num_args()) {
    return nullptr;
  }
  return args[k];
}
|
||||
|
||||
} // namespace Test
|
||||
|
||||
|
||||
@ -15,14 +15,16 @@
|
||||
//@HEADER
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
#include <PerfTest_Category.hpp>
|
||||
#include <benchmark/benchmark.h>
|
||||
#include "Benchmark_Context.hpp"
|
||||
#include "PerfTest_Category.hpp"
|
||||
#include <Kokkos_Random.hpp>
|
||||
#include <utility>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
|
||||
namespace Test {
|
||||
template <class Scalar>
|
||||
void custom_reduction_test(int N, int R, int num_trials) {
|
||||
std::pair<double, Scalar> custom_reduction_test(int N, int R) {
|
||||
Kokkos::Random_XorShift64_Pool<> rand_pool(183291);
|
||||
Kokkos::View<Scalar*> a("A", N);
|
||||
Kokkos::fill_random(a, rand_pool, 1.0);
|
||||
@ -62,49 +64,70 @@ void custom_reduction_test(int N, int R, int num_trials) {
|
||||
|
||||
// Timing
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < num_trials; r++) {
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::TeamPolicy<>(N / 1024, team_size),
|
||||
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team,
|
||||
Scalar& lmax) {
|
||||
Scalar team_max = Scalar(0);
|
||||
for (int rr = 0; rr < R; rr++) {
|
||||
int i = team.league_rank();
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::TeamThreadRange(team, 32),
|
||||
[&](const int& j, Scalar& thread_max) {
|
||||
Scalar t_max = Scalar(0);
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::ThreadVectorRange(team, 32),
|
||||
[&](const int& k, Scalar& max_) {
|
||||
const Scalar val = a((i * 32 + j) * 32 + k);
|
||||
if (val > max_) max_ = val;
|
||||
if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5;
|
||||
},
|
||||
Kokkos::Max<Scalar>(t_max));
|
||||
if (t_max > thread_max) thread_max = t_max;
|
||||
},
|
||||
Kokkos::Max<Scalar>(team_max));
|
||||
}
|
||||
if (team_max > lmax) lmax = team_max;
|
||||
},
|
||||
Kokkos::Max<Scalar>(max));
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::TeamPolicy<>(N / 1024, team_size),
|
||||
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team,
|
||||
Scalar& lmax) {
|
||||
Scalar team_max = Scalar(0);
|
||||
for (int rr = 0; rr < R; rr++) {
|
||||
int i = team.league_rank();
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::TeamThreadRange(team, 32),
|
||||
[&](const int& j, Scalar& thread_max) {
|
||||
Scalar t_max = Scalar(0);
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::ThreadVectorRange(team, 32),
|
||||
[&](const int& k, Scalar& max_) {
|
||||
const Scalar val = a((i * 32 + j) * 32 + k);
|
||||
if (val > max_) max_ = val;
|
||||
if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5;
|
||||
},
|
||||
Kokkos::Max<Scalar>(t_max));
|
||||
if (t_max > thread_max) thread_max = t_max;
|
||||
},
|
||||
Kokkos::Max<Scalar>(team_max));
|
||||
}
|
||||
if (team_max > lmax) lmax = team_max;
|
||||
},
|
||||
Kokkos::Max<Scalar>(max));
|
||||
|
||||
return std::make_pair(timer.seconds(), max);
|
||||
}
|
||||
|
||||
// Problem size N for the custom-reduction benchmark.
// When more than one command-line argument was recorded, the argument at
// index 1 is parsed with std::stoi and takes precedence; otherwise the value
// registered as benchmark argument 0 is used.
int get_N(benchmark::State& state) {
  if (Test::command_line_num_args() > 1) {
    return std::stoi(Test::command_line_arg(1));
  }
  return state.range(0);
}
|
||||
|
||||
// Repeat count R for the custom-reduction benchmark.
// When more than two command-line arguments were recorded, the argument at
// index 2 is parsed with std::stoi and takes precedence; otherwise the value
// registered as benchmark argument 1 is used.
int get_R(benchmark::State& state) {
  if (Test::command_line_num_args() > 2) {
    return std::stoi(Test::command_line_arg(2));
  }
  return state.range(1);
}
|
||||
|
||||
template <class Scalar>
|
||||
static void CustomReduction(benchmark::State& state) {
|
||||
int N = get_N(state);
|
||||
int R = get_R(state);
|
||||
|
||||
for (auto _ : state) {
|
||||
auto results = custom_reduction_test<double>(N, R);
|
||||
// data processed in gigabytes
|
||||
const double data_processed =
|
||||
N * R * sizeof(Scalar) / results.first / 1'000'000'000;
|
||||
|
||||
state.SetIterationTime(results.first);
|
||||
state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(
|
||||
data_processed, benchmark::Counter::kIsIterationInvariantRate);
|
||||
state.counters["Max"] = benchmark::Counter(results.second);
|
||||
}
|
||||
double time = timer.seconds();
|
||||
printf("%e %e %e\n", time,
|
||||
1.0 * N * R * num_trials * sizeof(Scalar) / time / 1024 / 1024 / 1024,
|
||||
max);
|
||||
}
|
||||
|
||||
TEST(default_exec, custom_reduction) {
|
||||
int N = 100000;
|
||||
int R = 1000;
|
||||
int num_trials = 1;
|
||||
BENCHMARK(CustomReduction<double>)
|
||||
->ArgNames({"N", "R"})
|
||||
->Args({100'000, 1'000})
|
||||
->UseManualTime();
|
||||
|
||||
if (command_line_num_args() > 1) N = std::stoi(command_line_arg(1));
|
||||
if (command_line_num_args() > 2) R = std::stoi(command_line_arg(2));
|
||||
if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
|
||||
custom_reduction_test<double>(N, R, num_trials);
|
||||
}
|
||||
} // namespace Test
|
||||
#endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -15,119 +15,218 @@
|
||||
//@HEADER
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstdio>
|
||||
#include <PerfTest_Category.hpp>
|
||||
#include <benchmark/benchmark.h>
|
||||
#include "Benchmark_Context.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
static constexpr int N = 10;
|
||||
|
||||
template <class Layout>
|
||||
void run_allocateview_tests(int N, int R) {
|
||||
const int N1 = N;
|
||||
const int N2 = N * N;
|
||||
const int N3 = N2 * N;
|
||||
const int N4 = N2 * N2;
|
||||
const int N8 = N4 * N4;
|
||||
static void ViewAllocate_Rank1(benchmark::State& state) {
|
||||
const int N8 = std::pow(state.range(0), 8);
|
||||
|
||||
double time1, time2, time3, time4, time5, time6, time7, time8,
|
||||
time_raw = 100000.0;
|
||||
{
|
||||
for (auto _ : state) {
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
}
|
||||
time1 = timer.seconds() / R;
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
|
||||
}
|
||||
{
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewAllocate_Rank2(benchmark::State& state) {
|
||||
const int N4 = std::pow(state.range(0), 4);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double**, Layout> a("A2", N4, N4);
|
||||
}
|
||||
time2 = timer.seconds() / R;
|
||||
Kokkos::View<double**, Layout> a("A2", N4, N4);
|
||||
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
|
||||
}
|
||||
{
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewAllocate_Rank3(benchmark::State& state) {
|
||||
const int N2 = std::pow(state.range(0), 2);
|
||||
const int N3 = std::pow(state.range(0), 3);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
|
||||
}
|
||||
time3 = timer.seconds() / R;
|
||||
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
|
||||
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
|
||||
}
|
||||
{
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewAllocate_Rank4(benchmark::State& state) {
|
||||
const int N2 = std::pow(state.range(0), 2);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
|
||||
}
|
||||
time4 = timer.seconds() / R;
|
||||
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
|
||||
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
|
||||
}
|
||||
{
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewAllocate_Rank5(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
|
||||
}
|
||||
time5 = timer.seconds() / R;
|
||||
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
|
||||
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
|
||||
}
|
||||
{
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewAllocate_Rank6(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
|
||||
}
|
||||
time6 = timer.seconds() / R;
|
||||
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
|
||||
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
|
||||
}
|
||||
{
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewAllocate_Rank7(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
|
||||
}
|
||||
time7 = timer.seconds() / R;
|
||||
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
|
||||
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
|
||||
}
|
||||
{
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewAllocate_Rank8(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
|
||||
N1);
|
||||
}
|
||||
time8 = timer.seconds() / R;
|
||||
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
|
||||
N1);
|
||||
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewAllocate_Raw(benchmark::State& state) {
|
||||
const int N8 = std::pow(state.range(0), 8);
|
||||
for (auto _ : state) {
|
||||
Kokkos::Timer timer;
|
||||
double* a_ptr =
|
||||
static_cast<double*>(Kokkos::kokkos_malloc("A", sizeof(double) * N8));
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; });
|
||||
Kokkos::fence();
|
||||
const auto time = timer.seconds();
|
||||
Kokkos::kokkos_free(a_ptr);
|
||||
|
||||
state.SetIterationTime(time);
|
||||
// data processed in megabytes
|
||||
const double data_processed = 1 * N8 * sizeof(double) / 1'000'000;
|
||||
state.counters["MB"] = benchmark::Counter(data_processed);
|
||||
state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(
|
||||
data_processed / 1'000, benchmark::Counter::kIsIterationInvariantRate);
|
||||
}
|
||||
}
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank1<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank1<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank2<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank2<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank3<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank3<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank4<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank4<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank5<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank5<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank6<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank6<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank7<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank7<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank8<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewAllocate_Rank8<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
{
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
double* a_ptr =
|
||||
static_cast<double*>(Kokkos::kokkos_malloc("A", sizeof(double) * N8));
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; });
|
||||
Kokkos::fence();
|
||||
Kokkos::kokkos_free(a_ptr);
|
||||
}
|
||||
time_raw = timer.seconds() / R;
|
||||
}
|
||||
#endif
|
||||
double size = 1.0 * N8 * 8 / 1024 / 1024;
|
||||
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
|
||||
size / 1024 / time_raw);
|
||||
printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size,
|
||||
size / 1024 / time1);
|
||||
printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size,
|
||||
size / 1024 / time2);
|
||||
printf(" Rank3: %lf s %lf MB %lf GB/s\n", time3, size,
|
||||
size / 1024 / time3);
|
||||
printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size,
|
||||
size / 1024 / time4);
|
||||
printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size,
|
||||
size / 1024 / time5);
|
||||
printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size,
|
||||
size / 1024 / time6);
|
||||
printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size,
|
||||
size / 1024 / time7);
|
||||
printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size,
|
||||
size / 1024 / time8);
|
||||
}
|
||||
BENCHMARK(ViewAllocate_Raw<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
TEST(default_exec, ViewCreate) {
|
||||
printf("Create View Performance for LayoutLeft:\n");
|
||||
run_allocateview_tests<Kokkos::LayoutLeft>(10, 1);
|
||||
printf("Create View Performance for LayoutRight:\n");
|
||||
run_allocateview_tests<Kokkos::LayoutRight>(10, 1);
|
||||
}
|
||||
BENCHMARK(ViewAllocate_Raw<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
#endif
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -18,32 +18,15 @@
|
||||
#define KOKKOS_CORE_PERFTEST_BENCHMARK_VIEW_COPY_HPP
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
#include "Benchmark_Context.hpp"
|
||||
#include <cmath>
|
||||
|
||||
namespace Test {
|
||||
|
||||
/**
|
||||
* \brief Mark the label as a figure of merit.
*
* \param label Counter name to tag.
* \return \p label prefixed with the literal "FOM: ".
|
||||
*/
|
||||
inline std::string benchmark_fom(const std::string& label) {
|
||||
return "FOM: " + label;
|
||||
}
|
||||
|
||||
/**
* \brief Record one iteration's manual time together with size and throughput
* counters.
*
* \param state Benchmark state that receives the iteration time and the "MB"
* and figure-of-merit "GB/s" counters.
* \param num_elems Element count; every element is counted as sizeof(double)
* bytes regardless of the actual value type.
* \param time Elapsed time forwarded unchanged to SetIterationTime.
*/
inline void report_results(benchmark::State& state, std::size_t num_elems,
|
||||
double time) {
|
||||
state.SetIterationTime(time);
|
||||
|
||||
// data size in megabytes
|
||||
const auto size = 1.0 * num_elems * sizeof(double) / 1000 / 1000;
|
||||
// data processed in gigabytes
// NOTE(review): the factor 2 presumably counts one read plus one write per
// element of a copy - confirm against the callers.
|
||||
const auto data_processed = 2 * size / 1000;
|
||||
|
||||
state.counters["MB"] =
|
||||
benchmark::Counter(size, benchmark::Counter::kDefaults);
|
||||
// Tagged via benchmark_fom so the throughput shows up as a figure of merit.
state.counters[benchmark_fom("GB/s")] = benchmark::Counter(
|
||||
data_processed, benchmark::Counter::kIsIterationInvariantRate);
|
||||
}
|
||||
static constexpr int DATA_RATIO = 2;
|
||||
|
||||
template <class ViewTypeA, class ViewTypeB>
|
||||
void deepcopy_view(ViewTypeA& a, ViewTypeB& b, benchmark::State& state) {
|
||||
@ -51,7 +34,7 @@ void deepcopy_view(ViewTypeA& a, ViewTypeB& b, benchmark::State& state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::deep_copy(a, b);
|
||||
report_results(state, a.size(), timer.seconds());
|
||||
KokkosBenchmark::report_results(state, a, DATA_RATIO, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
@ -158,8 +141,7 @@ static void ViewDeepCopy_Raw(benchmark::State& state) {
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = b_ptr[i]; });
|
||||
Kokkos::fence();
|
||||
|
||||
report_results(state, a.size(), timer.seconds());
|
||||
KokkosBenchmark::report_results(state, a, DATA_RATIO, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewCopy.hpp>
|
||||
#include "PerfTest_ViewCopy.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
||||
@ -14,202 +14,115 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstdio>
|
||||
#include <PerfTest_Category.hpp>
|
||||
#include "Benchmark_Context.hpp"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
namespace Test {
|
||||
|
||||
static constexpr int N = 10;
|
||||
|
||||
template <class ViewType>
|
||||
double fill_view(ViewType& a, typename ViewType::const_value_type& val,
|
||||
int repeat) {
|
||||
Kokkos::Timer timer;
|
||||
for (int i = 0; i < repeat; i++) {
|
||||
void fill_view(ViewType& a, typename ViewType::const_value_type& val,
|
||||
benchmark::State& state) {
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::deep_copy(a, val);
|
||||
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
|
||||
}
|
||||
Kokkos::fence();
|
||||
return timer.seconds();
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
void run_fillview_tests123(int N, int R) {
|
||||
const int N1 = N;
|
||||
static void ViewFill_Rank1(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
const int N4 = N2 * N2;
|
||||
const int N8 = N4 * N4;
|
||||
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
fill_view(a, 1.1, state);
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewFill_Rank2(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
const int N4 = N2 * N2;
|
||||
|
||||
Kokkos::View<double**, Layout> a("A2", N4, N4);
|
||||
fill_view(a, 1.1, state);
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewFill_Rank3(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
const int N3 = N2 * N1;
|
||||
const int N4 = N2 * N2;
|
||||
const int N8 = N4 * N4;
|
||||
|
||||
double time1, time2, time3, time_raw = 100000.0;
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
time1 = fill_view(a, 1.1, R) / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double**, Layout> a("A2", N4, N4);
|
||||
time2 = fill_view(a, 1.1, R) / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
|
||||
time3 = fill_view(a, 1.1, R) / R;
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
double* a_ptr = a.data();
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
|
||||
}
|
||||
Kokkos::fence();
|
||||
time_raw = timer.seconds() / R;
|
||||
}
|
||||
#endif
|
||||
double size = 1.0 * N8 * 8 / 1024 / 1024;
|
||||
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
|
||||
size / 1024 / time_raw);
|
||||
printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size,
|
||||
size / 1024 / time1);
|
||||
printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size,
|
||||
size / 1024 / time2);
|
||||
printf(" Rank3: %lf s %lf MB %lf GB/s\n", time3, size,
|
||||
size / 1024 / time3);
|
||||
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
|
||||
fill_view(a, 1.1, state);
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
void run_fillview_tests45(int N, int R) {
|
||||
const int N1 = N;
|
||||
static void ViewFill_Rank4(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
const int N4 = N2 * N2;
|
||||
const int N8 = N4 * N4;
|
||||
|
||||
double time4, time5, time_raw = 100000.0;
|
||||
{
|
||||
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
|
||||
time4 = fill_view(a, 1.1, R) / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
|
||||
time5 = fill_view(a, 1.1, R) / R;
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
double* a_ptr = a.data();
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
|
||||
}
|
||||
Kokkos::fence();
|
||||
time_raw = timer.seconds() / R;
|
||||
}
|
||||
#endif
|
||||
double size = 1.0 * N8 * 8 / 1024 / 1024;
|
||||
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
|
||||
size / 1024 / time_raw);
|
||||
printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size,
|
||||
size / 1024 / time4);
|
||||
printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size,
|
||||
size / 1024 / time5);
|
||||
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
|
||||
fill_view(a, 1.1, state);
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
void run_fillview_tests6(int N, int R) {
|
||||
const int N1 = N;
|
||||
static void ViewFill_Rank5(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
const int N4 = N2 * N2;
|
||||
const int N8 = N4 * N4;
|
||||
|
||||
double time6, time_raw = 100000.0;
|
||||
{
|
||||
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
|
||||
time6 = fill_view(a, 1.1, R) / R;
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
double* a_ptr = a.data();
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
|
||||
}
|
||||
Kokkos::fence();
|
||||
time_raw = timer.seconds() / R;
|
||||
}
|
||||
#endif
|
||||
double size = 1.0 * N8 * 8 / 1024 / 1024;
|
||||
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
|
||||
size / 1024 / time_raw);
|
||||
printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size,
|
||||
size / 1024 / time6);
|
||||
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
|
||||
fill_view(a, 1.1, state);
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
void run_fillview_tests7(int N, int R) {
|
||||
const int N1 = N;
|
||||
static void ViewFill_Rank6(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
const int N4 = N2 * N2;
|
||||
const int N8 = N4 * N4;
|
||||
|
||||
double time7, time_raw = 100000.0;
|
||||
{
|
||||
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
|
||||
time7 = fill_view(a, 1.1, R) / R;
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
double* a_ptr = a.data();
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
|
||||
}
|
||||
Kokkos::fence();
|
||||
time_raw = timer.seconds() / R;
|
||||
}
|
||||
#endif
|
||||
double size = 1.0 * N8 * 8 / 1024 / 1024;
|
||||
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
|
||||
size / 1024 / time_raw);
|
||||
printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size,
|
||||
size / 1024 / time7);
|
||||
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
|
||||
fill_view(a, 1.1, state);
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
void run_fillview_tests8(int N, int R) {
|
||||
const int N1 = N;
|
||||
static void ViewFill_Rank7(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
const int N4 = N2 * N2;
|
||||
const int N8 = N4 * N4;
|
||||
|
||||
double time8, time_raw = 100000.0;
|
||||
{
|
||||
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
|
||||
N1);
|
||||
time8 = fill_view(a, 1.1, R) / R;
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
double* a_ptr = a.data();
|
||||
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
|
||||
fill_view(a, 1.1, state);
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewFill_Rank8(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
|
||||
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1, N1);
|
||||
fill_view(a, 1.1, state);
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewFill_Raw(benchmark::State& state) {
|
||||
const int N8 = std::pow(state.range(0), 8);
|
||||
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
double* a_ptr = a.data();
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
|
||||
}
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
|
||||
Kokkos::fence();
|
||||
time_raw = timer.seconds() / R;
|
||||
|
||||
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
|
||||
}
|
||||
#endif
|
||||
double size = 1.0 * N8 * 8 / 1024 / 1024;
|
||||
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
|
||||
size / 1024 / time_raw);
|
||||
printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size,
|
||||
size / 1024 / time8);
|
||||
}
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -14,13 +14,38 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewFill.hpp>
|
||||
#include "PerfTest_ViewFill.hpp"
|
||||
|
||||
namespace Test {
|
||||
TEST(default_exec, ViewFill_Rank123) {
|
||||
printf("ViewFill Performance for LayoutLeft:\n");
|
||||
run_fillview_tests123<Kokkos::LayoutLeft>(10, 1);
|
||||
printf("ViewFill Performance for LayoutRight:\n");
|
||||
run_fillview_tests123<Kokkos::LayoutRight>(10, 1);
|
||||
}
|
||||
|
||||
BENCHMARK(ViewFill_Rank1<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewFill_Rank1<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewFill_Rank2<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewFill_Rank2<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewFill_Rank3<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewFill_Rank3<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -14,13 +14,28 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewFill.hpp>
|
||||
#include "PerfTest_ViewFill.hpp"
|
||||
|
||||
namespace Test {
|
||||
TEST(default_exec, ViewFill_Rank45) {
|
||||
printf("ViewFill Performance for LayoutLeft:\n");
|
||||
run_fillview_tests45<Kokkos::LayoutLeft>(10, 1);
|
||||
printf("ViewFill Performance for LayoutRight:\n");
|
||||
run_fillview_tests45<Kokkos::LayoutRight>(10, 1);
|
||||
}
|
||||
|
||||
BENCHMARK(ViewFill_Rank4<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewFill_Rank4<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewFill_Rank5<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewFill_Rank5<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -14,13 +14,18 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewFill.hpp>
|
||||
#include "PerfTest_ViewFill.hpp"
|
||||
|
||||
namespace Test {
|
||||
TEST(default_exec, ViewFill_Rank6) {
|
||||
printf("ViewFill Performance for LayoutLeft:\n");
|
||||
run_fillview_tests6<Kokkos::LayoutLeft>(10, 1);
|
||||
printf("ViewFill Performance for LayoutRight:\n");
|
||||
run_fillview_tests6<Kokkos::LayoutRight>(10, 1);
|
||||
}
|
||||
|
||||
BENCHMARK(ViewFill_Rank6<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewFill_Rank6<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -14,13 +14,18 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewFill.hpp>
|
||||
#include "PerfTest_ViewFill.hpp"
|
||||
|
||||
namespace Test {
|
||||
TEST(default_exec, ViewFill_Rank7) {
|
||||
printf("ViewFill Performance for LayoutLeft:\n");
|
||||
run_fillview_tests7<Kokkos::LayoutLeft>(10, 1);
|
||||
printf("ViewFill Performance for LayoutRight:\n");
|
||||
run_fillview_tests7<Kokkos::LayoutRight>(10, 1);
|
||||
}
|
||||
|
||||
BENCHMARK(ViewFill_Rank7<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewFill_Rank7<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -14,13 +14,18 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewFill.hpp>
|
||||
#include "PerfTest_ViewFill.hpp"
|
||||
|
||||
namespace Test {
|
||||
TEST(default_exec, ViewFill_Rank8) {
|
||||
printf("ViewFill Performance for LayoutLeft:\n");
|
||||
run_fillview_tests8<Kokkos::LayoutLeft>(10, 1);
|
||||
printf("ViewFill Performance for LayoutRight:\n");
|
||||
run_fillview_tests8<Kokkos::LayoutRight>(10, 1);
|
||||
}
|
||||
|
||||
BENCHMARK(ViewFill_Rank8<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewFill_Rank8<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
} // namespace Test
|
||||
|
||||
33
lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp
Normal file
33
lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp
Normal file
@ -0,0 +1,33 @@
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 4.0
|
||||
// Copyright (2022) National Technology & Engineering
|
||||
// Solutions of Sandia, LLC (NTESS).
|
||||
//
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://kokkos.org/LICENSE for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include "PerfTest_ViewFill.hpp"
|
||||
|
||||
// Registers the "raw" ViewFill benchmarks with Google Benchmark, once for
// Kokkos::LayoutLeft and once for Kokkos::LayoutRight.
namespace Test {
|
||||
|
||||
// The registrations are compiled only when CUDA lambdas are enabled or the
// CUDA backend is not enabled at all.
// NOTE(review): presumably because ViewFill_Raw is built on KOKKOS_LAMBDA;
// confirm against PerfTest_ViewFill.hpp.
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
// Each registration labels its single argument "N", passes the constant N
// (declared outside this file) and opts into manual timing.
BENCHMARK(ViewFill_Raw<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
|
||||
BENCHMARK(ViewFill_Raw<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime();
|
||||
#endif
|
||||
|
||||
} // namespace Test
|
||||
@ -15,346 +15,291 @@
|
||||
//@HEADER
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstdio>
|
||||
#include <PerfTest_Category.hpp>
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <cmath>
|
||||
#include "Benchmark_Context.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
template <class Layout>
|
||||
void run_resizeview_tests123(int N, int R) {
|
||||
const int N1 = N;
|
||||
const int N2 = N1 * N1;
|
||||
const int N3 = N2 * N1;
|
||||
const int N4 = N2 * N2;
|
||||
const int N8 = N4 * N4;
|
||||
static constexpr int R = 10;
|
||||
static constexpr int N = 10;
|
||||
|
||||
double time1, time2, time3, time_raw = 100000.0;
|
||||
double time1_noinit, time2_noinit, time3_noinit;
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*, Layout> a_(a);
|
||||
Kokkos::resize(a_, int(N8 * 1.1));
|
||||
}
|
||||
time1 = timer.seconds() / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double**, Layout> a("A2", N4, N4);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double**, Layout> a_(a);
|
||||
Kokkos::resize(a_, int(N4 * 1.1), N4);
|
||||
}
|
||||
time2 = timer.seconds() / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double***, Layout> a_(a);
|
||||
Kokkos::resize(a_, int(N3 * 1.1), N3, N2);
|
||||
}
|
||||
time3 = timer.seconds() / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*, Layout> a_(a);
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N8 * 1.1));
|
||||
}
|
||||
time1_noinit = timer.seconds() / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double**, Layout> a("A2", N4, N4);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double**, Layout> a_(a);
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N4 * 1.1), N4);
|
||||
}
|
||||
time2_noinit = timer.seconds() / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double***, Layout> a_(a);
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N3 * 1.1), N3, N2);
|
||||
}
|
||||
time3_noinit = timer.seconds() / R;
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
double* a_ptr = a.data();
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*, Layout> a1(
|
||||
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
|
||||
double* a1_ptr = a1.data();
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
|
||||
Kokkos::fence();
|
||||
}
|
||||
template <class Layout>
|
||||
static void ViewResize_Rank1(benchmark::State& state) {
|
||||
const int N8 = std::pow(state.range(0), 8);
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
Kokkos::View<double*, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
time_raw = timer.seconds() / R;
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(a_, int(N8 * 1.1));
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
#endif
|
||||
double size = 1.0 * N8 * 8 / 1024 / 1024;
|
||||
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
|
||||
2.0 * size / 1024 / time_raw);
|
||||
printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size,
|
||||
2.0 * size / 1024 / time1);
|
||||
printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size,
|
||||
2.0 * size / 1024 / time2);
|
||||
printf(" Rank3: %lf s %lf MB %lf GB/s\n", time3, size,
|
||||
2.0 * size / 1024 / time3);
|
||||
printf(" Rank1 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
|
||||
time1_noinit, size, 2.0 * size / 1024 / time1_noinit);
|
||||
printf(" Rank2 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
|
||||
time2_noinit, size, 2.0 * size / 1024 / time2_noinit);
|
||||
printf(" Rank3 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
|
||||
time3_noinit, size, 2.0 * size / 1024 / time3_noinit);
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
void run_resizeview_tests45(int N, int R) {
|
||||
const int N1 = N;
|
||||
const int N2 = N1 * N1;
|
||||
const int N4 = N2 * N2;
|
||||
const int N8 = N4 * N4;
|
||||
static void ViewResize_Rank2(benchmark::State& state) {
|
||||
const int N4 = std::pow(state.range(0), 4);
|
||||
Kokkos::View<double**, Layout> a("A2", N4, N4);
|
||||
Kokkos::View<double**, Layout> a_(a);
|
||||
|
||||
double time4, time5, time_raw = 100000.0;
|
||||
double time4_noinit, time5_noinit;
|
||||
{
|
||||
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double****, Layout> a_(a);
|
||||
Kokkos::resize(a_, int(N2 * 1.1), N2, N2, N2);
|
||||
}
|
||||
time4 = timer.seconds() / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*****, Layout> a_(a);
|
||||
Kokkos::resize(a_, int(N2 * 1.1), N2, N1, N1, N2);
|
||||
}
|
||||
time5 = timer.seconds() / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double****, Layout> a_(a);
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N2,
|
||||
N2);
|
||||
}
|
||||
time4_noinit = timer.seconds() / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*****, Layout> a_(a);
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N1, N1,
|
||||
N2);
|
||||
}
|
||||
time5_noinit = timer.seconds() / R;
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
double* a_ptr = a.data();
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*, Layout> a1(
|
||||
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
|
||||
double* a1_ptr = a1.data();
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
|
||||
Kokkos::fence();
|
||||
}
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
time_raw = timer.seconds() / R;
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(a_, int(N4 * 1.1), N4);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
#endif
|
||||
double size = 1.0 * N8 * 8 / 1024 / 1024;
|
||||
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
|
||||
2.0 * size / 1024 / time_raw);
|
||||
printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size,
|
||||
2.0 * size / 1024 / time4);
|
||||
printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size,
|
||||
2.0 * size / 1024 / time5);
|
||||
printf(" Rank4 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
|
||||
time4_noinit, size, 2.0 * size / 1024 / time4_noinit);
|
||||
printf(" Rank5 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
|
||||
time5_noinit, size, 2.0 * size / 1024 / time5_noinit);
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
void run_resizeview_tests6(int N, int R) {
|
||||
const int N1 = N;
|
||||
const int N2 = N1 * N1;
|
||||
const int N4 = N2 * N2;
|
||||
const int N8 = N4 * N4;
|
||||
static void ViewResize_Rank3(benchmark::State& state) {
|
||||
const int N2 = std::pow(state.range(0), 2);
|
||||
const int N3 = std::pow(state.range(0), 3);
|
||||
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
|
||||
Kokkos::View<double***, Layout> a_(a);
|
||||
|
||||
double time6, time6_noinit, time_raw = 100000.0;
|
||||
{
|
||||
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double******, Layout> a_(a);
|
||||
Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N2);
|
||||
}
|
||||
time6 = timer.seconds() / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double******, Layout> a_(a);
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1,
|
||||
N1, N2);
|
||||
}
|
||||
time6_noinit = timer.seconds() / R;
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
double* a_ptr = a.data();
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*, Layout> a1(
|
||||
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
|
||||
double* a1_ptr = a1.data();
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
|
||||
Kokkos::fence();
|
||||
}
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
time_raw = timer.seconds() / R;
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(a_, int(N3 * 1.1), N3, N2);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
#endif
|
||||
double size = 1.0 * N8 * 8 / 1024 / 1024;
|
||||
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
|
||||
2.0 * size / 1024 / time_raw);
|
||||
printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size,
|
||||
2.0 * size / 1024 / time6);
|
||||
printf(" Rank6 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
|
||||
time6_noinit, size, 2.0 * size / 1024 / time6_noinit);
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
void run_resizeview_tests7(int N, int R) {
|
||||
const int N1 = N;
|
||||
const int N2 = N1 * N1;
|
||||
const int N4 = N2 * N2;
|
||||
const int N8 = N4 * N4;
|
||||
static void ViewResize_Rank4(benchmark::State& state) {
|
||||
const int N2 = std::pow(state.range(0), 2);
|
||||
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
|
||||
Kokkos::View<double****, Layout> a_(a);
|
||||
|
||||
double time7, time7_noinit, time_raw = 100000.0;
|
||||
{
|
||||
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*******, Layout> a_(a);
|
||||
Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N1, N1);
|
||||
}
|
||||
time7 = timer.seconds() / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*******, Layout> a_(a);
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1,
|
||||
N1, N1, N1);
|
||||
}
|
||||
time7_noinit = timer.seconds() / R;
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
double* a_ptr = a.data();
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*, Layout> a1(
|
||||
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
|
||||
double* a1_ptr = a1.data();
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
|
||||
Kokkos::fence();
|
||||
}
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
time_raw = timer.seconds() / R;
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(a_, int(N2 * 1.1), N2, N2, N2);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
#endif
|
||||
double size = 1.0 * N8 * 8 / 1024 / 1024;
|
||||
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
|
||||
2.0 * size / 1024 / time_raw);
|
||||
printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size,
|
||||
2.0 * size / 1024 / time7);
|
||||
printf(" Rank7 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
|
||||
time7_noinit, size, 2.0 * size / 1024 / time7_noinit);
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
void run_resizeview_tests8(int N, int R) {
|
||||
const int N1 = N;
|
||||
static void ViewResize_Rank5(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
const int N4 = N2 * N2;
|
||||
const int N8 = N4 * N4;
|
||||
|
||||
double time8, time8_noinit, time_raw = 100000.0;
|
||||
{
|
||||
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
|
||||
N1);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double********, Layout> a_(a);
|
||||
Kokkos::resize(a_, int(N1 * 1.1), N1, N1, N1, N1, N1, N1, N1);
|
||||
}
|
||||
time8 = timer.seconds() / R;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
|
||||
N1);
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double********, Layout> a_(a);
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N1 * 1.1), N1, N1, N1,
|
||||
N1, N1, N1, N1);
|
||||
}
|
||||
time8_noinit = timer.seconds() / R;
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
{
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
double* a_ptr = a.data();
|
||||
Kokkos::Timer timer;
|
||||
for (int r = 0; r < R; r++) {
|
||||
Kokkos::View<double*, Layout> a1(
|
||||
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
|
||||
double* a1_ptr = a1.data();
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
|
||||
Kokkos::fence();
|
||||
}
|
||||
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
|
||||
Kokkos::View<double*****, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
time_raw = timer.seconds() / R;
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(a_, int(N2 * 1.1), N2, N1, N1, N2);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewResize_Rank6(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
|
||||
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
|
||||
Kokkos::View<double******, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N2);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewResize_Rank7(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
|
||||
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
|
||||
Kokkos::View<double*******, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N1, N1);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewResize_Rank8(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
|
||||
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1, N1);
|
||||
Kokkos::View<double********, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(a_, int(N1 * 1.1), N1, N1, N1, N1, N1, N1, N1);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewResize_NoInit_Rank1(benchmark::State& state) {
|
||||
const int N8 = std::pow(state.range(0), 8);
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
Kokkos::View<double*, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N8 * 1.1));
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewResize_NoInit_Rank2(benchmark::State& state) {
|
||||
const int N4 = std::pow(state.range(0), 4);
|
||||
Kokkos::View<double**, Layout> a("A2", N4, N4);
|
||||
Kokkos::View<double**, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N4 * 1.1), N4);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewResize_NoInit_Rank3(benchmark::State& state) {
|
||||
const int N2 = std::pow(state.range(0), 2);
|
||||
const int N3 = std::pow(state.range(0), 3);
|
||||
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
|
||||
Kokkos::View<double***, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N3 * 1.1), N3, N2);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewResize_NoInit_Rank4(benchmark::State& state) {
|
||||
const int N2 = std::pow(state.range(0), 2);
|
||||
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
|
||||
Kokkos::View<double****, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N2, N2);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewResize_NoInit_Rank5(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
|
||||
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
|
||||
Kokkos::View<double*****, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N1, N1,
|
||||
N2);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewResize_NoInit_Rank6(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
|
||||
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
|
||||
Kokkos::View<double******, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1,
|
||||
N1, N2);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewResize_NoInit_Rank7(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
const int N2 = N1 * N1;
|
||||
|
||||
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
|
||||
Kokkos::View<double*******, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1,
|
||||
N1, N1, N1);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewResize_NoInit_Rank8(benchmark::State& state) {
|
||||
const int N1 = state.range(0);
|
||||
|
||||
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1, N1);
|
||||
Kokkos::View<double********, Layout> a_(a);
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::fence();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N1 * 1.1), N1, N1, N1,
|
||||
N1, N1, N1, N1);
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
}
|
||||
|
||||
template <class Layout>
|
||||
static void ViewResize_NoInit_Raw(benchmark::State& state) {
|
||||
const int N8 = std::pow(state.range(0), 8);
|
||||
Kokkos::View<double*, Layout> a("A1", N8);
|
||||
double* a_ptr = a.data();
|
||||
|
||||
for (auto _ : state) {
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::View<double*, Layout> a1(
|
||||
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
|
||||
double* a1_ptr = a1.data();
|
||||
Kokkos::parallel_for(
|
||||
N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
|
||||
Kokkos::fence();
|
||||
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
|
||||
}
|
||||
#endif
|
||||
double size = 1.0 * N8 * 8 / 1024 / 1024;
|
||||
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
|
||||
2.0 * size / 1024 / time_raw);
|
||||
printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size,
|
||||
2.0 * size / 1024 / time8);
|
||||
printf(" Rank8 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
|
||||
time8_noinit, size, 2.0 * size / 1024 / time8_noinit);
|
||||
}
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -14,15 +14,80 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewResize.hpp>
|
||||
#include "PerfTest_ViewResize.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
TEST(default_exec, ViewResize_Rank123) {
|
||||
printf("Resize View Performance for LayoutLeft:\n");
|
||||
run_resizeview_tests123<Kokkos::LayoutLeft>(10, 1);
|
||||
printf("Resize View Performance for LayoutRight:\n");
|
||||
run_resizeview_tests123<Kokkos::LayoutRight>(10, 1);
|
||||
}
|
||||
BENCHMARK(ViewResize_Rank1<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_Rank1<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_Rank2<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_Rank2<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_Rank3<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_Rank3<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank1<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank1<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank2<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank2<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank3<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank3<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -14,15 +14,56 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewResize.hpp>
|
||||
#include "PerfTest_ViewResize.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
TEST(default_exec, ViewResize_Rank_45) {
|
||||
printf("Resize View Performance for LayoutLeft:\n");
|
||||
run_resizeview_tests45<Kokkos::LayoutLeft>(10, 1);
|
||||
printf("Resize View Performance for LayoutRight:\n");
|
||||
run_resizeview_tests45<Kokkos::LayoutRight>(10, 1);
|
||||
}
|
||||
BENCHMARK(ViewResize_Rank4<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_Rank4<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_Rank5<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_Rank5<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank4<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank4<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank5<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank5<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -14,15 +14,32 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewResize.hpp>
|
||||
#include "PerfTest_ViewResize.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
TEST(default_exec, ViewResize_Rank6) {
|
||||
printf("Resize View Performance for LayoutLeft:\n");
|
||||
run_resizeview_tests6<Kokkos::LayoutLeft>(10, 1);
|
||||
printf("Resize View Performance for LayoutRight:\n");
|
||||
run_resizeview_tests6<Kokkos::LayoutRight>(10, 1);
|
||||
}
|
||||
BENCHMARK(ViewResize_Rank6<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_Rank6<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank6<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank6<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -14,15 +14,32 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewResize.hpp>
|
||||
#include "PerfTest_ViewResize.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
TEST(default_exec, ViewResize_Rank7) {
|
||||
printf("Resize View Performance for LayoutLeft:\n");
|
||||
run_resizeview_tests7<Kokkos::LayoutLeft>(10, 1);
|
||||
printf("Resize View Performance for LayoutRight:\n");
|
||||
run_resizeview_tests7<Kokkos::LayoutRight>(10, 1);
|
||||
}
|
||||
BENCHMARK(ViewResize_Rank7<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_Rank7<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank7<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank7<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
} // namespace Test
|
||||
|
||||
@ -14,23 +14,39 @@
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include <PerfTest_ViewResize.hpp>
|
||||
#include "PerfTest_ViewResize.hpp"
|
||||
|
||||
namespace Test {
|
||||
|
||||
TEST(default_exec, ViewResize_Rank8) {
|
||||
// FIXME_SYCL Avoid running out of resources on the CUDA GPU used in the CI
|
||||
#ifdef KOKKOS_ENABLE_SYCL
|
||||
printf("Resize View Performance for LayoutLeft:\n");
|
||||
run_resizeview_tests8<Kokkos::LayoutLeft>(9, 1);
|
||||
printf("Resize View Performance for LayoutRight:\n");
|
||||
run_resizeview_tests8<Kokkos::LayoutRight>(9, 1);
|
||||
static constexpr int N_8 = N - 1;
|
||||
#else
|
||||
printf("Resize View Performance for LayoutLeft:\n");
|
||||
run_resizeview_tests8<Kokkos::LayoutLeft>(10, 1);
|
||||
printf("Resize View Performance for LayoutRight:\n");
|
||||
run_resizeview_tests8<Kokkos::LayoutRight>(10, 1);
|
||||
static constexpr int N_8 = N;
|
||||
#endif
|
||||
}
|
||||
|
||||
BENCHMARK(ViewResize_Rank8<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N_8)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_Rank8<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N_8)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank8<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N_8)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Rank8<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N_8)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
} // namespace Test
|
||||
|
||||
35
lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp
Normal file
35
lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp
Normal file
@ -0,0 +1,35 @@
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 4.0
|
||||
// Copyright (2022) National Technology & Engineering
|
||||
// Solutions of Sandia, LLC (NTESS).
|
||||
//
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://kokkos.org/LICENSE for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//@HEADER
|
||||
|
||||
#include "PerfTest_ViewResize.hpp"
|
||||
|
||||
// Registers the "raw" (hand-written allocate + copy) resize benchmarks with
// Google Benchmark, once for Kokkos::LayoutLeft and once for
// Kokkos::LayoutRight.
namespace Test {
|
||||
|
||||
// The registrations are compiled only when CUDA lambdas are enabled or the
// CUDA backend is not enabled at all.
// NOTE(review): presumably because ViewResize_NoInit_Raw is built on
// KOKKOS_LAMBDA; confirm against PerfTest_ViewResize.hpp.
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
|
||||
// Each registration labels its single argument "N", passes the constant N,
// opts into manual timing and fixes the iteration count to R. N and R are
// declared outside this file (presumably in PerfTest_ViewResize.hpp).
BENCHMARK(ViewResize_NoInit_Raw<Kokkos::LayoutLeft>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
|
||||
BENCHMARK(ViewResize_NoInit_Raw<Kokkos::LayoutRight>)
|
||||
->ArgName("N")
|
||||
->Arg(N)
|
||||
->UseManualTime()
|
||||
->Iterations(R);
|
||||
#endif
|
||||
|
||||
} // namespace Test
|
||||
@ -18,38 +18,14 @@
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
#include "Benchmark_Context.hpp"
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_Timer.hpp>
|
||||
|
||||
using exec_space = Kokkos::DefaultExecutionSpace;
|
||||
|
||||
#define RESET 0
|
||||
#define BRIGHT 1
|
||||
#define DIM 2
|
||||
#define UNDERLINE 3
|
||||
#define BLINK 4
|
||||
#define REVERSE 7
|
||||
#define HIDDEN 8
|
||||
|
||||
#define BLACK 0
|
||||
#define RED 1
|
||||
#define GREEN 2
|
||||
#define YELLOW 3
|
||||
#define BLUE 4
|
||||
#define MAGENTA 5
|
||||
#define CYAN 6
|
||||
#define GREY 7
|
||||
#define WHITE 8
|
||||
|
||||
void textcolor(int attr, int fg, int bg) {
|
||||
char command[40];
|
||||
|
||||
/* Command is the control command to the terminal */
|
||||
snprintf(command, 40, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);
|
||||
printf("%s", command);
|
||||
}
|
||||
void textcolor_standard() { textcolor(RESET, BLACK, WHITE); }
|
||||
|
||||
template <class T, class DEVICE_TYPE>
|
||||
struct ZeroFunctor {
|
||||
using execution_space = DEVICE_TYPE;
|
||||
@ -370,7 +346,9 @@ T LoopVariantNonAtomic(int loop, int test) {
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void Loop(int loop, int test, const char* type_name) {
|
||||
void Loop(benchmark::State& state, int test) {
|
||||
int loop = state.range(0);
|
||||
|
||||
LoopVariant<T>(loop, test);
|
||||
|
||||
Kokkos::Timer timer;
|
||||
@ -388,86 +366,36 @@ void Loop(int loop, int test, const char* type_name) {
|
||||
time *= 1e6 / loop;
|
||||
timeNonAtomic *= 1e6 / loop;
|
||||
timeSerial *= 1e6 / loop;
|
||||
// textcolor_standard();
|
||||
bool passed = true;
|
||||
if (resSerial != res) passed = false;
|
||||
// if(!passed) textcolor(RESET,BLACK,YELLOW);
|
||||
printf(
|
||||
"%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e "
|
||||
"%7.4e Size of Type %i)",
|
||||
type_name, test, passed ? "PASSED" : "FAILED", loop, 1.0 * resSerial,
|
||||
1.0 * res, 1.0 * resNonAtomic, timeSerial, time, timeNonAtomic,
|
||||
(int)sizeof(T));
|
||||
// if(!passed) textcolor_standard();
|
||||
printf("\n");
|
||||
|
||||
bool passed = (resSerial == res);
|
||||
|
||||
state.counters["Passed"] = benchmark::Counter(passed);
|
||||
state.counters["Value serial"] = benchmark::Counter(resSerial);
|
||||
state.counters["Value atomic"] = benchmark::Counter(res);
|
||||
state.counters["Value non-atomic"] = benchmark::Counter(resNonAtomic);
|
||||
state.counters["Time serial"] = benchmark::Counter(timeSerial);
|
||||
state.counters["Time atomic"] = benchmark::Counter(time);
|
||||
state.counters["Time non-atomic"] = benchmark::Counter(timeNonAtomic);
|
||||
state.counters["Size of type"] = benchmark::Counter(sizeof(T));
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void Test(int loop, int test, const char* type_name) {
|
||||
if (test == -1) {
|
||||
Loop<T>(loop, 1, type_name);
|
||||
Loop<T>(loop, 2, type_name);
|
||||
Loop<T>(loop, 3, type_name);
|
||||
|
||||
} else
|
||||
Loop<T>(loop, test, type_name);
|
||||
static void Test_Atomic(benchmark::State& state) {
|
||||
for (auto _ : state) {
|
||||
Loop<T>(state, 1);
|
||||
Loop<T>(state, 2);
|
||||
Loop<T>(state, 3);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
int type = -1;
|
||||
int loop = 100000;
|
||||
int test = -1;
|
||||
static constexpr int LOOP = 100'000;
|
||||
|
||||
for (int i = 0; i < argc; i++) {
|
||||
if ((strcmp(argv[i], "--test") == 0)) {
|
||||
test = std::stoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "--type") == 0)) {
|
||||
type = std::stoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-l") == 0) || (strcmp(argv[i], "--loop") == 0)) {
|
||||
loop = std::stoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
Kokkos::initialize(argc, argv);
|
||||
|
||||
printf("Using %s\n", Kokkos::atomic_query_version());
|
||||
bool all_tests = false;
|
||||
if (type == -1) all_tests = true;
|
||||
while (type < 100) {
|
||||
if (type == 1) {
|
||||
Test<int>(loop, test, "int ");
|
||||
}
|
||||
if (type == 2) {
|
||||
Test<long int>(loop, test, "long int ");
|
||||
}
|
||||
if (type == 3) {
|
||||
Test<long long int>(loop, test, "long long int ");
|
||||
}
|
||||
if (type == 4) {
|
||||
Test<unsigned int>(loop, test, "unsigned int ");
|
||||
}
|
||||
if (type == 5) {
|
||||
Test<unsigned long int>(loop, test, "unsigned long int ");
|
||||
}
|
||||
if (type == 6) {
|
||||
Test<unsigned long long int>(loop, test, "unsigned long long int ");
|
||||
}
|
||||
if (type == 10) {
|
||||
// Test<float>(loop,test,"float ");
|
||||
}
|
||||
if (type == 11) {
|
||||
Test<double>(loop, test, "double ");
|
||||
}
|
||||
if (!all_tests)
|
||||
type = 100;
|
||||
else
|
||||
type++;
|
||||
}
|
||||
|
||||
Kokkos::finalize();
|
||||
}
|
||||
BENCHMARK(Test_Atomic<int>)->Arg(LOOP)->Iterations(10);
|
||||
BENCHMARK(Test_Atomic<long int>)->Arg(LOOP)->Iterations(10);
|
||||
BENCHMARK(Test_Atomic<long long int>)->Arg(LOOP)->Iterations(10);
|
||||
BENCHMARK(Test_Atomic<unsigned int>)->Arg(LOOP)->Iterations(10);
|
||||
BENCHMARK(Test_Atomic<unsigned long int>)->Arg(LOOP)->Iterations(10);
|
||||
BENCHMARK(Test_Atomic<unsigned long long int>)->Arg(LOOP)->Iterations(10);
|
||||
BENCHMARK(Test_Atomic<float>)->Arg(LOOP)->Iterations(10);
|
||||
BENCHMARK(Test_Atomic<double>)->Arg(LOOP)->Iterations(10);
|
||||
BENCHMARK(Test_Atomic<int>)->Arg(LOOP)->Iterations(10);
|
||||
|
||||
@ -21,240 +21,536 @@
|
||||
// core/src/libkokkoscore.a -ldl && OMP_NUM_THREADS=1
|
||||
// ./test_atomic_minmax_simple.x 10000000
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <iostream>
|
||||
#include <typeinfo>
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
#include "Benchmark_Context.hpp"
|
||||
#include "PerfTest_Category.hpp"
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_Timer.hpp>
|
||||
|
||||
using exec_space = Kokkos::DefaultExecutionSpace;
|
||||
|
||||
constexpr int LENGTH = 1'000'000;
|
||||
|
||||
template <typename T>
|
||||
void test(const int length) {
|
||||
Kokkos::View<T*, exec_space> prepare_input(const int length, const T value) {
|
||||
Kokkos::View<T*, exec_space> input("input", length);
|
||||
Kokkos::parallel_for(
|
||||
length, KOKKOS_LAMBDA(const int i) { input(i) = value; });
|
||||
Kokkos::fence();
|
||||
return input;
|
||||
}
|
||||
|
||||
/// Selects the problem length for a benchmark run.
///
/// When the recorded command-line argument count is exactly 2, argument 1 is
/// parsed with std::stoi and used as the length; otherwise the benchmark's
/// first registered range value is returned.
int get_length(benchmark::State& state) {
  const bool length_given_on_cli = (Test::command_line_num_args() == 2);
  if (length_given_on_cli) {
    return std::stoi(Test::command_line_arg(1));
  }
  return state.range(0);
}
|
||||
|
||||
template <typename T>
|
||||
int check_errors_replacement(Kokkos::View<T*, exec_space> view) {
|
||||
int errors = 0;
|
||||
Kokkos::parallel_reduce(
|
||||
view.size(),
|
||||
KOKKOS_LAMBDA(const int i, int& inner) { inner += (view(i) != (T)i); },
|
||||
errors);
|
||||
Kokkos::fence();
|
||||
return errors;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
double atomic_min_replacement(Kokkos::View<T*, exec_space> input) {
|
||||
const int length = input.size();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::parallel_for(
|
||||
length, KOKKOS_LAMBDA(const int i) {
|
||||
(void)Kokkos::atomic_fetch_min(&(input(i)), (T)i);
|
||||
});
|
||||
Kokkos::fence();
|
||||
return timer.seconds();
|
||||
}
|
||||
|
||||
using vector = Kokkos::View<T*, exec_space>;
|
||||
template <typename T>
|
||||
static void Atomic_MinReplacements(benchmark::State& state) {
|
||||
const int length = get_length(state);
|
||||
auto inp = prepare_input(length, std::numeric_limits<T>::max());
|
||||
|
||||
vector inp("input", length);
|
||||
T max = std::numeric_limits<T>::max();
|
||||
T min = std::numeric_limits<T>::lowest();
|
||||
for (auto _ : state) {
|
||||
const auto time = atomic_min_replacement(inp);
|
||||
const auto errors = check_errors_replacement(inp);
|
||||
|
||||
// input is max values - all min atomics will replace
|
||||
{
|
||||
Kokkos::parallel_for(
|
||||
length, KOKKOS_LAMBDA(const int i) { inp(i) = max; });
|
||||
Kokkos::fence();
|
||||
|
||||
timer.reset();
|
||||
Kokkos::parallel_for(
|
||||
length, KOKKOS_LAMBDA(const int i) {
|
||||
(void)Kokkos::atomic_fetch_min(&(inp(i)), (T)i);
|
||||
});
|
||||
Kokkos::fence();
|
||||
double time = timer.seconds();
|
||||
|
||||
int errors(0);
|
||||
Kokkos::parallel_reduce(
|
||||
length,
|
||||
KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); },
|
||||
errors);
|
||||
Kokkos::fence();
|
||||
|
||||
if (errors) {
|
||||
std::cerr << "Error in 100% min replacements: " << errors << std::endl;
|
||||
std::cerr << "inp(0)=" << inp(0) << std::endl;
|
||||
// report results
|
||||
state.SetIterationTime(time);
|
||||
if (errors > 0) {
|
||||
state.counters["Errors"] = benchmark::Counter(errors);
|
||||
}
|
||||
std::cout << "Time for 100% min replacements: " << time << std::endl;
|
||||
}
|
||||
|
||||
// input is min values - all max atomics will replace
|
||||
{
|
||||
Kokkos::parallel_for(
|
||||
length, KOKKOS_LAMBDA(const int i) { inp(i) = min; });
|
||||
Kokkos::fence();
|
||||
|
||||
timer.reset();
|
||||
Kokkos::parallel_for(
|
||||
length, KOKKOS_LAMBDA(const int i) {
|
||||
(void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i);
|
||||
});
|
||||
Kokkos::fence();
|
||||
double time = timer.seconds();
|
||||
|
||||
int errors(0);
|
||||
Kokkos::parallel_reduce(
|
||||
length,
|
||||
KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); },
|
||||
errors);
|
||||
Kokkos::fence();
|
||||
|
||||
if (errors) {
|
||||
std::cerr << "Error in 100% max replacements: " << errors << std::endl;
|
||||
std::cerr << "inp(0)=" << inp(0) << std::endl;
|
||||
}
|
||||
std::cout << "Time for 100% max replacements: " << time << std::endl;
|
||||
}
|
||||
|
||||
// input is max values - all max atomics will early exit
|
||||
{
|
||||
Kokkos::parallel_for(
|
||||
length, KOKKOS_LAMBDA(const int i) { inp(i) = max; });
|
||||
Kokkos::fence();
|
||||
|
||||
timer.reset();
|
||||
Kokkos::parallel_for(
|
||||
length, KOKKOS_LAMBDA(const int i) {
|
||||
(void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i);
|
||||
});
|
||||
Kokkos::fence();
|
||||
double time = timer.seconds();
|
||||
|
||||
int errors(0);
|
||||
Kokkos::parallel_reduce(
|
||||
length,
|
||||
KOKKOS_LAMBDA(const int i, int& inner) {
|
||||
T ref = max;
|
||||
inner += (inp(i) != ref);
|
||||
},
|
||||
errors);
|
||||
Kokkos::fence();
|
||||
|
||||
if (errors) {
|
||||
std::cerr << "Error in 100% max early exits: " << errors << std::endl;
|
||||
std::cerr << "inp(0)=" << inp(0) << std::endl;
|
||||
}
|
||||
std::cout << "Time for 100% max early exits: " << time << std::endl;
|
||||
}
|
||||
|
||||
// input is min values - all min atomics will early exit
|
||||
{
|
||||
Kokkos::parallel_for(
|
||||
length, KOKKOS_LAMBDA(const int i) { inp(i) = min; });
|
||||
Kokkos::fence();
|
||||
|
||||
timer.reset();
|
||||
Kokkos::parallel_for(
|
||||
length, KOKKOS_LAMBDA(const int i) {
|
||||
(void)Kokkos::atomic_min_fetch(&(inp(i)), (T)i);
|
||||
});
|
||||
Kokkos::fence();
|
||||
double time = timer.seconds();
|
||||
|
||||
int errors(0);
|
||||
Kokkos::parallel_reduce(
|
||||
length,
|
||||
KOKKOS_LAMBDA(const int i, int& inner) {
|
||||
T ref = min;
|
||||
inner += (inp(i) != ref);
|
||||
},
|
||||
errors);
|
||||
Kokkos::fence();
|
||||
|
||||
if (errors) {
|
||||
std::cerr << "Error in 100% min early exits: " << errors << std::endl;
|
||||
std::cerr << "inp(0)=" << inp(0) << std::endl;
|
||||
if (length > 9) std::cout << "inp(9)=" << inp(9) << std::endl;
|
||||
}
|
||||
std::cout << "Time for 100% min early exits: " << time << std::endl;
|
||||
}
|
||||
|
||||
// limit iterations for contentious test, takes ~50x longer for same length
|
||||
auto con_length = length / 5;
|
||||
// input is min values - some max atomics will replace
|
||||
{
|
||||
Kokkos::parallel_for(
|
||||
1, KOKKOS_LAMBDA(const int i) { inp(i) = min; });
|
||||
Kokkos::fence();
|
||||
|
||||
T current(0);
|
||||
timer.reset();
|
||||
Kokkos::parallel_reduce(
|
||||
con_length,
|
||||
KOKKOS_LAMBDA(const int i, T& inner) {
|
||||
inner = Kokkos::atomic_max_fetch(&(inp(0)), inner + 1);
|
||||
if (i == con_length - 1) {
|
||||
Kokkos::atomic_max_fetch(&(inp(0)), max);
|
||||
inner = max;
|
||||
}
|
||||
},
|
||||
Kokkos::Max<T>(current));
|
||||
Kokkos::fence();
|
||||
double time = timer.seconds();
|
||||
|
||||
if (current < max) {
|
||||
std::cerr << "Error in contentious max replacements: " << std::endl;
|
||||
std::cerr << "final=" << current << " inp(0)=" << inp(0) << " max=" << max
|
||||
<< std::endl;
|
||||
}
|
||||
std::cout << "Time for contentious max " << con_length
|
||||
<< " replacements: " << time << std::endl;
|
||||
}
|
||||
|
||||
// input is max values - some min atomics will replace
|
||||
{
|
||||
Kokkos::parallel_for(
|
||||
1, KOKKOS_LAMBDA(const int i) { inp(i) = max; });
|
||||
Kokkos::fence();
|
||||
|
||||
timer.reset();
|
||||
T current(100000000);
|
||||
Kokkos::parallel_reduce(
|
||||
con_length,
|
||||
KOKKOS_LAMBDA(const int i, T& inner) {
|
||||
inner = Kokkos::atomic_min_fetch(&(inp(0)), inner - 1);
|
||||
if (i == con_length - 1) {
|
||||
Kokkos::atomic_min_fetch(&(inp(0)), min);
|
||||
inner = min;
|
||||
}
|
||||
},
|
||||
Kokkos::Min<T>(current));
|
||||
Kokkos::fence();
|
||||
double time = timer.seconds();
|
||||
|
||||
if (current > min) {
|
||||
std::cerr << "Error in contentious min replacements: " << std::endl;
|
||||
std::cerr << "final=" << current << " inp(0)=" << inp(0) << " min=" << min
|
||||
<< std::endl;
|
||||
}
|
||||
std::cout << "Time for contentious min " << con_length
|
||||
<< " replacements: " << time << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
Kokkos::initialize(argc, argv);
|
||||
{
|
||||
int length = 1000000;
|
||||
if (argc == 2) {
|
||||
length = std::stoi(argv[1]);
|
||||
}
|
||||
|
||||
if (length < 1) {
|
||||
throw std::invalid_argument("");
|
||||
}
|
||||
|
||||
std::cout << "================ int" << std::endl;
|
||||
test<int>(length);
|
||||
std::cout << "================ long" << std::endl;
|
||||
test<long>(length);
|
||||
std::cout << "================ long long" << std::endl;
|
||||
test<long long>(length);
|
||||
|
||||
std::cout << "================ unsigned int" << std::endl;
|
||||
test<unsigned int>(length);
|
||||
std::cout << "================ unsigned long" << std::endl;
|
||||
test<unsigned long>(length);
|
||||
std::cout << "================ unsigned long long" << std::endl;
|
||||
test<unsigned long long>(length);
|
||||
|
||||
std::cout << "================ float" << std::endl;
|
||||
test<float>(length);
|
||||
std::cout << "================ double" << std::endl;
|
||||
test<double>(length);
|
||||
}
|
||||
Kokkos::finalize();
|
||||
return 0;
|
||||
template <typename T>
|
||||
double atomic_max_replacement(Kokkos::View<T*, exec_space> input) {
|
||||
const int length = input.size();
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::parallel_for(
|
||||
length, KOKKOS_LAMBDA(const int i) {
|
||||
(void)Kokkos::atomic_max_fetch(&(input(i)), (T)i);
|
||||
});
|
||||
Kokkos::fence();
|
||||
return timer.seconds();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void Atomic_MaxReplacements(benchmark::State& state) {
|
||||
const auto length = get_length(state);
|
||||
auto inp = prepare_input(length, std::numeric_limits<T>::lowest());
|
||||
|
||||
for (auto _ : state) {
|
||||
const auto time = atomic_max_replacement(inp);
|
||||
const auto errors = check_errors_replacement(inp);
|
||||
|
||||
// report results
|
||||
state.SetIterationTime(time);
|
||||
if (errors > 0) {
|
||||
state.counters["Errors"] = benchmark::Counter(errors);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
int check_errors_early_exit(Kokkos::View<T*, exec_space> view, const T ref) {
|
||||
int errors = 0;
|
||||
Kokkos::parallel_reduce(
|
||||
view.size(),
|
||||
KOKKOS_LAMBDA(const int i, int& inner) { inner += (view(i) != ref); },
|
||||
errors);
|
||||
Kokkos::fence();
|
||||
return errors;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void Atomic_MaxEarlyExits(benchmark::State& state) {
|
||||
const auto length = get_length(state);
|
||||
auto inp = prepare_input(length, std::numeric_limits<T>::max());
|
||||
|
||||
for (auto _ : state) {
|
||||
const auto time = atomic_max_replacement(inp);
|
||||
const auto errors =
|
||||
check_errors_early_exit(inp, std::numeric_limits<T>::max());
|
||||
|
||||
// report results
|
||||
state.SetIterationTime(time);
|
||||
if (errors > 0) {
|
||||
state.counters["Errors"] = benchmark::Counter(errors);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void Atomic_MinEarlyExits(benchmark::State& state) {
|
||||
const auto length = get_length(state);
|
||||
auto inp = prepare_input(length, std::numeric_limits<T>::lowest());
|
||||
|
||||
for (auto _ : state) {
|
||||
const auto time = atomic_min_replacement(inp);
|
||||
const auto errors =
|
||||
check_errors_early_exit(inp, std::numeric_limits<T>::lowest());
|
||||
|
||||
// report results
|
||||
state.SetIterationTime(time);
|
||||
if (errors > 0) {
|
||||
state.counters["Errors"] = benchmark::Counter(errors);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void report_errors_contentious_replacement(benchmark::State& state,
|
||||
const T final, const T first,
|
||||
const T expected) {
|
||||
state.counters["Errors"] = benchmark::Counter(1);
|
||||
state.counters["Final"] = benchmark::Counter(final);
|
||||
state.counters["First"] = benchmark::Counter(first);
|
||||
state.counters["Expected"] = benchmark::Counter(expected);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
double atomic_contentious_max_replacement(benchmark::State& state,
|
||||
Kokkos::View<T*, exec_space> input,
|
||||
const int con_length) {
|
||||
const auto max = std::numeric_limits<T>::max();
|
||||
T current = 0;
|
||||
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::parallel_reduce(
|
||||
con_length,
|
||||
KOKKOS_LAMBDA(const int i, T& inner) {
|
||||
inner = Kokkos::atomic_max_fetch(&(input(0)), inner + 1);
|
||||
if (i == con_length - 1) {
|
||||
Kokkos::atomic_max_fetch(&(input(0)), max);
|
||||
inner = max;
|
||||
}
|
||||
},
|
||||
Kokkos::Max<T>(current));
|
||||
Kokkos::fence();
|
||||
const auto time = timer.seconds();
|
||||
|
||||
if (current < max) {
|
||||
report_errors_contentious_replacement(state, current, input(0), max);
|
||||
}
|
||||
|
||||
return time;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void Atomic_ContentiousMaxReplacements(benchmark::State& state) {
|
||||
const auto length = get_length(state);
|
||||
auto inp = prepare_input(1, std::numeric_limits<T>::lowest());
|
||||
|
||||
for (auto _ : state) {
|
||||
const auto time = atomic_contentious_max_replacement(state, inp, length);
|
||||
|
||||
state.SetIterationTime(time);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
double atomic_contentious_min_replacement(benchmark::State& state,
|
||||
Kokkos::View<T*, exec_space> input,
|
||||
const int con_length) {
|
||||
const auto min = std::numeric_limits<T>::lowest();
|
||||
T current = 0;
|
||||
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::parallel_reduce(
|
||||
con_length,
|
||||
KOKKOS_LAMBDA(const int i, T& inner) {
|
||||
inner = Kokkos::atomic_min_fetch(&(input(0)), inner - 1);
|
||||
if (i == con_length - 1) {
|
||||
Kokkos::atomic_min_fetch(&(input(0)), min);
|
||||
inner = min;
|
||||
}
|
||||
},
|
||||
Kokkos::Min<T>(current));
|
||||
Kokkos::fence();
|
||||
const auto time = timer.seconds();
|
||||
|
||||
if (current > min) {
|
||||
report_errors_contentious_replacement(state, current, input(0), min);
|
||||
}
|
||||
|
||||
return time;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void Atomic_ContentiousMinReplacements(benchmark::State& state) {
|
||||
const auto length = get_length(state);
|
||||
auto inp = prepare_input(1, std::numeric_limits<T>::max());
|
||||
|
||||
for (auto _ : state) {
|
||||
const auto time = atomic_contentious_max_replacement(state, inp, length);
|
||||
|
||||
state.SetIterationTime(time);
|
||||
}
|
||||
}
|
||||
|
||||
// int
|
||||
BENCHMARK(Atomic_MinReplacements<int>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxReplacements<int>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxEarlyExits<int>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MinEarlyExits<int>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMaxReplacements<int>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMinReplacements<int>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
// long
|
||||
BENCHMARK(Atomic_MinReplacements<long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxReplacements<long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxEarlyExits<long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MinEarlyExits<long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMaxReplacements<long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMinReplacements<long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
// long long
|
||||
BENCHMARK(Atomic_MinReplacements<long long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxReplacements<long long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxEarlyExits<long long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MinEarlyExits<long long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMaxReplacements<long long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMinReplacements<long long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
// unsigned int
|
||||
BENCHMARK(Atomic_MinReplacements<unsigned int>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxReplacements<unsigned int>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxEarlyExits<unsigned int>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MinEarlyExits<unsigned int>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMaxReplacements<unsigned int>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMinReplacements<unsigned int>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
// unsigned long
|
||||
BENCHMARK(Atomic_MinReplacements<unsigned long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxReplacements<unsigned long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxEarlyExits<unsigned long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MinEarlyExits<unsigned long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMaxReplacements<unsigned long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMinReplacements<unsigned long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
// unsigned long long
|
||||
BENCHMARK(Atomic_MinReplacements<unsigned long long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxReplacements<unsigned long long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxEarlyExits<unsigned long long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MinEarlyExits<unsigned long long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMaxReplacements<unsigned long long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMinReplacements<unsigned long long>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
// float
|
||||
BENCHMARK(Atomic_MinReplacements<float>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxReplacements<float>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxEarlyExits<float>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MinEarlyExits<float>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMaxReplacements<float>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMinReplacements<float>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
// double
|
||||
BENCHMARK(Atomic_MinReplacements<double>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxReplacements<double>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MaxEarlyExits<double>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_MinEarlyExits<double>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMaxReplacements<double>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
BENCHMARK(Atomic_ContentiousMinReplacements<double>)
|
||||
->ArgName("Length")
|
||||
->Arg(LENGTH / 5)
|
||||
->UseManualTime()
|
||||
->Iterations(10);
|
||||
|
||||
@ -19,9 +19,13 @@
|
||||
#include <cstdlib>
|
||||
#include <limits>
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_Timer.hpp>
|
||||
|
||||
#include "Benchmark_Context.hpp"
|
||||
#include "PerfTest_Category.hpp"
|
||||
|
||||
using ExecSpace = Kokkos::DefaultExecutionSpace;
|
||||
using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
|
||||
|
||||
@ -146,53 +150,8 @@ struct TestFunctor {
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
static const char help_flag[] = "--help";
|
||||
static const char alloc_size_flag[] = "--alloc_size=";
|
||||
static const char super_size_flag[] = "--super_size=";
|
||||
static const char chunk_span_flag[] = "--chunk_span=";
|
||||
static const char fill_stride_flag[] = "--fill_stride=";
|
||||
static const char fill_level_flag[] = "--fill_level=";
|
||||
static const char repeat_outer_flag[] = "--repeat_outer=";
|
||||
static const char repeat_inner_flag[] = "--repeat_inner=";
|
||||
|
||||
long total_alloc_size = 1000000;
|
||||
int min_superblock_size = 10000;
|
||||
int chunk_span = 5;
|
||||
int fill_stride = 1;
|
||||
int fill_level = 70;
|
||||
int repeat_outer = 1;
|
||||
int repeat_inner = 1;
|
||||
|
||||
int ask_help = 0;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
const char* const a = argv[i];
|
||||
|
||||
if (!strncmp(a, help_flag, strlen(help_flag))) ask_help = 1;
|
||||
|
||||
if (!strncmp(a, alloc_size_flag, strlen(alloc_size_flag)))
|
||||
total_alloc_size = atol(a + strlen(alloc_size_flag));
|
||||
|
||||
if (!strncmp(a, super_size_flag, strlen(super_size_flag)))
|
||||
min_superblock_size = std::stoi(a + strlen(super_size_flag));
|
||||
|
||||
if (!strncmp(a, fill_stride_flag, strlen(fill_stride_flag)))
|
||||
fill_stride = std::stoi(a + strlen(fill_stride_flag));
|
||||
|
||||
if (!strncmp(a, fill_level_flag, strlen(fill_level_flag)))
|
||||
fill_level = std::stoi(a + strlen(fill_level_flag));
|
||||
|
||||
if (!strncmp(a, chunk_span_flag, strlen(chunk_span_flag)))
|
||||
chunk_span = std::stoi(a + strlen(chunk_span_flag));
|
||||
|
||||
if (!strncmp(a, repeat_outer_flag, strlen(repeat_outer_flag)))
|
||||
repeat_outer = std::stoi(a + strlen(repeat_outer_flag));
|
||||
|
||||
if (!strncmp(a, repeat_inner_flag, strlen(repeat_inner_flag)))
|
||||
repeat_inner = std::stoi(a + strlen(repeat_inner_flag));
|
||||
}
|
||||
|
||||
int get_number_alloc(int chunk_span, int min_superblock_size,
|
||||
long total_alloc_size, int fill_level) {
|
||||
int chunk_span_bytes = 0;
|
||||
for (int i = 0; i < chunk_span; ++i) {
|
||||
auto chunk_bytes = TestFunctor::chunk * (1 + i);
|
||||
@ -212,81 +171,85 @@ int main(int argc, char* argv[]) {
|
||||
auto bytes_wanted = (actual_total_bytes * fill_level) / 100;
|
||||
auto chunk_spans = bytes_wanted / chunk_span_bytes;
|
||||
auto number_alloc = int(chunk_spans * chunk_span);
|
||||
return number_alloc;
|
||||
}
|
||||
|
||||
if (ask_help) {
|
||||
std::cout << "command line options:"
|
||||
<< " " << help_flag << " " << alloc_size_flag << "##"
|
||||
<< " " << super_size_flag << "##"
|
||||
<< " " << fill_stride_flag << "##"
|
||||
<< " " << fill_level_flag << "##"
|
||||
<< " " << chunk_span_flag << "##"
|
||||
<< " " << repeat_outer_flag << "##"
|
||||
<< " " << repeat_inner_flag << "##" << std::endl;
|
||||
return 0;
|
||||
template <class T>
|
||||
T get_parameter(const char flag[], T default_value) {
|
||||
auto argc = Test::command_line_num_args();
|
||||
auto value = default_value;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
const char* const a = Test::command_line_arg(i);
|
||||
|
||||
if (!strncmp(a, flag, strlen(flag))) value = std::stoi(a + strlen(flag));
|
||||
}
|
||||
|
||||
Kokkos::initialize(argc, argv);
|
||||
return value;
|
||||
}
|
||||
|
||||
double sum_fill_time = 0;
|
||||
double sum_cycle_time = 0;
|
||||
double sum_both_time = 0;
|
||||
double min_fill_time = std::numeric_limits<double>::max();
|
||||
double min_cycle_time = std::numeric_limits<double>::max();
|
||||
double min_both_time = std::numeric_limits<double>::max();
|
||||
// one alloc in fill, alloc/dealloc pair in repeat_inner
|
||||
for (int i = 0; i < repeat_outer; ++i) {
|
||||
/**
 * \brief Benchmark body that times TestFunctor::test_fill().
 *
 * The six benchmark arguments (state.range(0) .. state.range(5)) provide the
 * defaults for total allocation size, minimum superblock size, chunk span,
 * fill stride, fill level and inner repeat count. Each of them can be
 * overridden from the command line through get_parameter().
 *
 * Every iteration builds a fresh TestFunctor, starts a Kokkos::Timer, runs
 * test_fill() and hands the elapsed seconds to state.SetIterationTime().
 * The run is aborted through Kokkos::abort() if test_fill() reports failure.
 */
static void Mempool_Fill(benchmark::State& state) {
  long total_alloc_size =
      get_parameter("--alloc_size=", static_cast<long>(state.range(0)));

  // get_parameter() returns the type of its default (the benchmark argument);
  // the narrowing to int that the declarations imply is spelled out here.
  int min_superblock_size =
      static_cast<int>(get_parameter("--super_size=", state.range(1)));
  int chunk_span =
      static_cast<int>(get_parameter("--chunk_span=", state.range(2)));
  int fill_stride =
      static_cast<int>(get_parameter("--fill_stride=", state.range(3)));
  int fill_level =
      static_cast<int>(get_parameter("--fill_level=", state.range(4)));
  int repeat_inner =
      static_cast<int>(get_parameter("--repeat_inner=", state.range(5)));

  int number_alloc = get_number_alloc(chunk_span, min_superblock_size,
                                      total_alloc_size, fill_level);

  for (auto _ : state) {
    TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc,
                        fill_stride, chunk_span, repeat_inner);

    // The timer starts after the functor is constructed, so construction is
    // not part of the reported iteration time.
    Kokkos::Timer timer;

    if (!functor.test_fill()) {
      Kokkos::abort("fill ");
    }

    state.SetIterationTime(timer.seconds());
    state.counters[KokkosBenchmark::benchmark_fom("fill ops per second")] =
        benchmark::Counter(number_alloc,
                           benchmark::Counter::kIsIterationInvariantRate);
  }
}
|
||||
|
||||
/**
 * \brief Benchmark body that times TestFunctor::test_alloc_dealloc().
 *
 * The six benchmark arguments (state.range(0) .. state.range(5)) provide the
 * defaults for total allocation size, minimum superblock size, chunk span,
 * fill stride, fill level and inner repeat count. Each of them can be
 * overridden from the command line through get_parameter().
 *
 * Every iteration builds a fresh TestFunctor, starts a Kokkos::Timer, runs
 * test_alloc_dealloc() and hands the elapsed seconds to
 * state.SetIterationTime(). The run is aborted through Kokkos::abort() if
 * test_alloc_dealloc() reports failure.
 */
static void Mempool_Alloc_Dealloc(benchmark::State& state) {
  long total_alloc_size =
      get_parameter("--alloc_size=", static_cast<long>(state.range(0)));

  // get_parameter() returns the type of its default (the benchmark argument);
  // the narrowing to int that the declarations imply is spelled out here.
  int min_superblock_size =
      static_cast<int>(get_parameter("--super_size=", state.range(1)));
  int chunk_span =
      static_cast<int>(get_parameter("--chunk_span=", state.range(2)));
  int fill_stride =
      static_cast<int>(get_parameter("--fill_stride=", state.range(3)));
  int fill_level =
      static_cast<int>(get_parameter("--fill_level=", state.range(4)));
  int repeat_inner =
      static_cast<int>(get_parameter("--repeat_inner=", state.range(5)));

  int number_alloc = get_number_alloc(chunk_span, min_superblock_size,
                                      total_alloc_size, fill_level);

  // Counted as 2 * number_alloc * repeat_inner operations. Evaluated in
  // double so a large allocation count times a large repeat count cannot
  // overflow int before it reaches the counter.
  const double cycle_ops = 2.0 * number_alloc * repeat_inner;

  for (auto _ : state) {
    TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc,
                        fill_stride, chunk_span, repeat_inner);

    // The timer starts after the functor is constructed, so construction is
    // not part of the reported iteration time.
    Kokkos::Timer timer;

    if (!functor.test_alloc_dealloc()) {
      Kokkos::abort("alloc/dealloc ");
    }

    state.SetIterationTime(timer.seconds());
    state.counters[KokkosBenchmark::benchmark_fom("cycle ops per second")] =
        benchmark::Counter(cycle_ops,
                           benchmark::Counter::kIsIterationInvariantRate);
  }
}
|
||||
|
||||
const std::vector<std::string> ARG_NAMES = {
|
||||
"total_alloc_size", "min_superblock_size", "chunk_span",
|
||||
"fill_stride", "fill_level", "repeat_inner"};
|
||||
const std::vector<int64_t> ARGS = {1'000'000, 10'000, 5, 1, 70, 1};
|
||||
|
||||
BENCHMARK(Mempool_Fill)->ArgNames(ARG_NAMES)->Args(ARGS)->UseManualTime();
|
||||
|
||||
BENCHMARK(Mempool_Alloc_Dealloc)
|
||||
->ArgNames(ARG_NAMES)
|
||||
->Args(ARGS)
|
||||
->UseManualTime();
|
||||
|
||||
Reference in New Issue
Block a user