Update Kokkos library in LAMMPS to v4.1.0

This commit is contained in:
Stan Gerald Moore
2023-06-29 10:42:42 -06:00
parent 170173a213
commit 330107b77b
480 changed files with 24051 additions and 23393 deletions

View File

@ -16,15 +16,20 @@
#include <benchmark/benchmark.h>
#include <Benchmark_Context.hpp>
#include "Benchmark_Context.hpp"
#include <Kokkos_Core.hpp>
#include "PerfTest_Category.hpp"
int main(int argc, char** argv) {
Kokkos::initialize(argc, argv);
benchmark::Initialize(&argc, argv);
benchmark::SetDefaultTimeUnit(benchmark::kSecond);
KokkosBenchmark::add_benchmark_context(true);
(void)Test::command_line_num_args(argc);
(void)Test::command_line_arg(0, argv);
benchmark::RunSpecifiedBenchmarks();
benchmark::Shutdown();

View File

@ -0,0 +1,81 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER
#include "Benchmark_Context.hpp"
namespace KokkosBenchmark {
/**
 * \brief Trim leading and trailing spaces and colon signs from the input
 * string.
 *
 * \param str text to clean up; spaces and colons in the interior are kept.
 * \return the trimmed text, or an empty string when \p str is empty or
 * consists solely of spaces and colons.
 */
std::string remove_unwanted_characters(const std::string& str) {
  const auto from = str.find_first_not_of(" :");
  if (from == std::string::npos) {
    // empty input, or nothing but unwanted characters
    return "";
  }
  // a first wanted character exists, so a last one exists as well
  const auto to = str.find_last_not_of(" :");
  // substr() takes (position, count), not (first, last): the count has to be
  // relative to `from`, otherwise trailing unwanted characters survive
  // whenever leading ones were skipped
  return str.substr(from, to - from + 1);
}
/**
 * \brief Copy the Kokkos configuration into the benchmark context.
 *
 * The text written by Kokkos::print_configuration() is split into lines, and
 * every line containing a ':' whose right-hand side is non-empty after
 * trimming is forwarded to benchmark::AddCustomContext() as a key/value pair.
 *
 * \param verbose passed through to Kokkos::print_configuration()
 */
void add_kokkos_configuration(bool verbose) {
  std::ostringstream configuration;
  Kokkos::print_configuration(configuration, verbose);

  std::stringstream lines{configuration.str()};
  std::string entry;
  while (std::getline(lines, entry, '\n')) {
    const auto separator = entry.find_first_of(':');
    if (separator == std::string::npos) {
      continue;  // no key:value separator on this line
    }
    const auto value = remove_unwanted_characters(entry.substr(separator + 1));
    if (value.empty()) {
      continue;  // a key without a value, for example a category name
    }
    const auto key = remove_unwanted_characters(entry.substr(0, separator));
    benchmark::AddCustomContext(key, value);
  }
}
/**
 * \brief Add the git metadata exposed in Kokkos::Impl to the benchmark
 * context.
 *
 * Nothing is added when Kokkos::Impl::GIT_BRANCH is empty.
 */
void add_git_info() {
  if (Kokkos::Impl::GIT_BRANCH.empty()) {
    return;
  }

  benchmark::AddCustomContext("GIT_BRANCH", Kokkos::Impl::GIT_BRANCH);
  benchmark::AddCustomContext("GIT_COMMIT_HASH",
                              Kokkos::Impl::GIT_COMMIT_HASH);
  benchmark::AddCustomContext("GIT_CLEAN_STATUS",
                              Kokkos::Impl::GIT_CLEAN_STATUS);
  benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION",
                              Kokkos::Impl::GIT_COMMIT_DESCRIPTION);
  benchmark::AddCustomContext("GIT_COMMIT_DATE",
                              Kokkos::Impl::GIT_COMMIT_DATE);
}
/**
 * \brief Fill the benchmark context: first the Kokkos configuration, then the
 * git information.
 *
 * \param verbose forwarded to add_kokkos_configuration()
 */
void add_benchmark_context(bool verbose) {
  add_kokkos_configuration(verbose);  // key/value pairs of the Kokkos setup
  add_git_info();                     // branch, commit and clean status
}
} // namespace KokkosBenchmark

View File

@ -26,62 +26,34 @@
namespace KokkosBenchmark {
/// \brief Remove unwanted spaces and colon signs from input string. In case of
/// invalid input it will return an empty string.
std::string remove_unwanted_characters(std::string str) {
auto from = str.find_first_not_of(" :");
auto to = str.find_last_not_of(" :");
/**
* \brief Gather all context information and add it to benchmark context data
*/
void add_benchmark_context(bool verbose = false);
if (from == std::string::npos || to == std::string::npos) {
return "";
}
// return extracted part of string without unwanted spaces and colon signs
return str.substr(from, to + 1);
/**
 * \brief Build the name used for a figure of merit: the given label prefixed
 * with "FOM: ".
 *
 * \param label name of the reported quantity
 * \return "FOM: " followed by \p label
 */
inline std::string benchmark_fom(const std::string& label) {
  std::string tagged{"FOM: "};
  tagged += label;
  return tagged;
}
/// \brief Extract all key:value pairs from kokkos configuration and add it to
/// the benchmark context
void add_kokkos_configuration(bool verbose) {
std::ostringstream msg;
Kokkos::print_configuration(msg, verbose);
/**
* \brief Report throughput and amount of data processed for simple View
* operations
*/
template <class ViewType>
void report_results(benchmark::State& state, ViewType view, int data_ratio,
double time) {
// data processed in megabytes
const double data_processed = data_ratio * view.size() *
sizeof(typename ViewType::value_type) /
1'000'000;
// Iterate over lines returned from kokkos and extract key:value pairs
std::stringstream ss{msg.str()};
for (std::string line; std::getline(ss, line, '\n');) {
auto found = line.find_first_of(':');
if (found != std::string::npos) {
auto val = remove_unwanted_characters(line.substr(found + 1));
// Ignore line without value, for example a category name
if (!val.empty()) {
benchmark::AddCustomContext(
remove_unwanted_characters(line.substr(0, found)), val);
}
}
}
}
/// \brief Add all data related to git to benchmark context
void add_git_info() {
if (!Kokkos::Impl::GIT_BRANCH.empty()) {
benchmark::AddCustomContext("GIT_BRANCH", Kokkos::Impl::GIT_BRANCH);
benchmark::AddCustomContext("GIT_COMMIT_HASH",
Kokkos::Impl::GIT_COMMIT_HASH);
benchmark::AddCustomContext("GIT_CLEAN_STATUS",
Kokkos::Impl::GIT_CLEAN_STATUS);
benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION",
Kokkos::Impl::GIT_COMMIT_DESCRIPTION);
benchmark::AddCustomContext("GIT_COMMIT_DATE",
Kokkos::Impl::GIT_COMMIT_DATE);
}
}
/// \brief Gather all context information and add it to benchmark context data
void add_benchmark_context(bool verbose = false) {
// Add Kokkos configuration to benchmark context data
add_kokkos_configuration(verbose);
// Add git information to benchmark context data
add_git_info();
state.SetIterationTime(time);
state.counters["MB"] = benchmark::Counter(data_processed);
state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(
data_processed / 1'000, benchmark::Counter::kIsIterationInvariantRate);
}
} // namespace KokkosBenchmark

View File

@ -1,108 +1,31 @@
#INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
#INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
# warning: PerfTest_CustomReduction.cpp uses
# ../../algorithms/src/Kokkos_Random.hpp
# we'll just allow it to be included, but note
# that in TriBITS KokkosAlgorithms can be disabled...
#INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")
# FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests.
# FIXME_OPENACC - temporarily disabled due to unimplemented features
IF ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
RETURN()
ENDIF()
# all PerformanceTest_* executables are part of regular tests
# TODO: finish converting these into benchmarks (in progress)
IF(KOKKOS_ENABLE_TESTS)
IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL)
KOKKOS_ADD_EXECUTABLE (
PerformanceTest_SharedSpace
SOURCES test_sharedSpace.cpp
)
ENDIF()
SET(SOURCES
PerfTestMain.cpp
PerfTestGramSchmidt.cpp
PerfTestHexGrad.cpp
PerfTest_CustomReduction.cpp
PerfTest_ExecSpacePartitioning.cpp
PerfTest_ViewAllocate.cpp
PerfTest_ViewFill_123.cpp
PerfTest_ViewFill_45.cpp
PerfTest_ViewFill_6.cpp
PerfTest_ViewFill_7.cpp
PerfTest_ViewFill_8.cpp
PerfTest_ViewResize_123.cpp
PerfTest_ViewResize_45.cpp
PerfTest_ViewResize_6.cpp
PerfTest_ViewResize_7.cpp
PerfTest_ViewResize_8.cpp
)
KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
IF(Kokkos_ENABLE_OPENMPTARGET)
# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction
LIST(REMOVE_ITEM SOURCES
PerfTestGramSchmidt.cpp
PerfTest_CustomReduction.cpp
PerfTest_ExecSpacePartitioning.cpp
)
IF(NOT Kokkos_ENABLE_OPENMPTARGET)
# FIXME OPENMPTARGET needs tasking
KOKKOS_ADD_EXECUTABLE_AND_TEST(
PerformanceTest_TaskDag
SOURCES test_taskdag.cpp
CATEGORIES PERFORMANCE
)
ENDIF()
ENDIF()
IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL)
KOKKOS_ADD_EXECUTABLE (
PerformanceTest_SharedSpace
SOURCES test_sharedSpace.cpp
)
ENDIF()
# Per #374, we always want to build this test, but we only want to run
# it as a PERFORMANCE test. That's why we separate building the test
# from running the test.
#leave these as basic includes for now
#I don't need anything transitive
KOKKOS_INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
# This test currently times out for MSVC
IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC")
KOKKOS_ADD_EXECUTABLE_AND_TEST(
PerfTestExec
SOURCES ${SOURCES}
CATEGORIES PERFORMANCE
)
ENDIF()
KOKKOS_ADD_EXECUTABLE_AND_TEST(
PerformanceTest_Atomic
SOURCES test_atomic.cpp
CATEGORIES PERFORMANCE
)
IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA)
KOKKOS_ADD_EXECUTABLE_AND_TEST(
PerformanceTest_Atomic_MinMax
SOURCES test_atomic_minmax_simple.cpp
CATEGORIES PERFORMANCE
)
ENDIF()
# FIXME_NVHPC
IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
KOKKOS_ADD_EXECUTABLE_AND_TEST(
PerformanceTest_Mempool
SOURCES test_mempool.cpp
CATEGORIES PERFORMANCE
)
ENDIF()
IF(NOT Kokkos_ENABLE_OPENMPTARGET)
# FIXME OPENMPTARGET needs tasking
KOKKOS_ADD_EXECUTABLE_AND_TEST(
PerformanceTest_TaskDag
SOURCES test_taskdag.cpp
CATEGORIES PERFORMANCE
)
ENDIF()
IF(NOT Kokkos_ENABLE_BENCHMARKS)
RETURN()
ENDIF()
@ -111,6 +34,7 @@ IF (KOKKOS_HAS_TRILINOS)
message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos")
ENDIF()
# Find or download google/benchmark library
find_package(benchmark QUIET)
IF(benchmark_FOUND)
MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}")
@ -119,7 +43,7 @@ ELSE()
include(FetchContent)
SET(BENCHMARK_ENABLE_TESTING OFF)
list(APPEND CMAKE_MESSAGE_INDENT " ")
list(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ")
FetchContent_Declare(
googlebenchmark
URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz
@ -128,8 +52,6 @@ ELSE()
FetchContent_MakeAvailable(googlebenchmark)
list(POP_BACK CMAKE_MESSAGE_INDENT)
include_directories(${benchmark_SOURCE_DIR}/include)
# Suppress clang-tidy diagnostics on code that we do not have control over
IF(CMAKE_CXX_CLANG_TIDY)
SET_TARGET_PROPERTIES(benchmark PROPERTIES CXX_CLANG_TIDY "")
@ -157,6 +79,10 @@ FUNCTION(KOKKOS_ADD_BENCHMARK NAME)
ENDIF()
SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME})
LIST(APPEND BENCHMARK_SOURCES
BenchmarkMain.cpp
Benchmark_Context.cpp
)
ADD_EXECUTABLE(
${BENCHMARK_NAME}
@ -166,6 +92,11 @@ FUNCTION(KOKKOS_ADD_BENCHMARK NAME)
${BENCHMARK_NAME}
PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version
)
TARGET_INCLUDE_DIRECTORIES(
${BENCHMARK_NAME}
SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include
)
FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES})
SET_SOURCE_FILES_PROPERTIES(
${SOURCE_FILE}
@ -188,7 +119,11 @@ ENDFUNCTION()
SET(
BENCHMARK_SOURCES
BenchmarkMain.cpp
PerfTestGramSchmidt.cpp
PerfTest_CustomReduction.cpp
PerfTest_ExecSpacePartitioning.cpp
PerfTestHexGrad.cpp
PerfTest_ViewAllocate.cpp
PerfTest_ViewCopy_a123.cpp
PerfTest_ViewCopy_b123.cpp
PerfTest_ViewCopy_c123.cpp
@ -210,9 +145,50 @@ SET(
PerfTest_ViewCopy_c8.cpp
PerfTest_ViewCopy_d8.cpp
PerfTest_ViewCopy_Raw.cpp
PerfTest_ViewFill_123.cpp
PerfTest_ViewFill_45.cpp
PerfTest_ViewFill_6.cpp
PerfTest_ViewFill_7.cpp
PerfTest_ViewFill_8.cpp
PerfTest_ViewFill_Raw.cpp
PerfTest_ViewResize_123.cpp
PerfTest_ViewResize_45.cpp
PerfTest_ViewResize_6.cpp
PerfTest_ViewResize_7.cpp
PerfTest_ViewResize_8.cpp
PerfTest_ViewResize_Raw.cpp
)
IF(Kokkos_ENABLE_OPENMPTARGET)
# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction
LIST(REMOVE_ITEM BENCHMARK_SOURCES
PerfTestGramSchmidt.cpp
PerfTest_CustomReduction.cpp
PerfTest_ExecSpacePartitioning.cpp
)
ENDIF()
KOKKOS_ADD_BENCHMARK(
PerformanceTest_Benchmark
SOURCES ${BENCHMARK_SOURCES}
)
IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA)
KOKKOS_ADD_BENCHMARK(
Benchmark_Atomic_MinMax
SOURCES test_atomic_minmax_simple.cpp
)
ENDIF()
# FIXME_NVHPC
IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
KOKKOS_ADD_BENCHMARK(
PerformanceTest_Mempool
SOURCES test_mempool.cpp
)
ENDIF()
KOKKOS_ADD_BENCHMARK(
PerformanceTest_Atomic
SOURCES test_atomic.cpp
)

View File

@ -14,7 +14,7 @@ else
CXX = g++
endif
CXXFLAGS = -O3
CXXFLAGS = -O3
#CXXFLAGS += -DGENERIC_REDUCER
LINK ?= $(CXX)
LDFLAGS ?=
@ -29,43 +29,12 @@ TARGETS =
#
OBJ_PERF = PerfTestMain.o gtest-all.o
OBJ_PERF += PerfTest_ExecSpacePartitioning.o
OBJ_PERF += PerfTestGramSchmidt.o
OBJ_PERF += PerfTestHexGrad.o
OBJ_PERF += PerfTest_CustomReduction.o
OBJ_PERF += PerfTest_ViewAllocate.o
OBJ_PERF += PerfTest_ViewFill_123.o PerfTest_ViewFill_45.o PerfTest_ViewFill_6.o PerfTest_ViewFill_7.o PerfTest_ViewFill_8.o
OBJ_PERF += PerfTest_ViewResize_123.o PerfTest_ViewResize_45.o PerfTest_ViewResize_6.o PerfTest_ViewResize_7.o PerfTest_ViewResize_8.o
TARGETS += KokkosCore_PerformanceTest
TEST_TARGETS += test-performance
#
OBJ_ATOMICS = test_atomic.o
TARGETS += KokkosCore_PerformanceTest_Atomics
TEST_TARGETS += test-atomic
#
OBJ_MEMPOOL = test_mempool.o
TARGETS += KokkosCore_PerformanceTest_Mempool
TEST_TARGETS += test-mempool
#
OBJ_TASKDAG = test_taskdag.o
OBJ_TASKDAG = test_taskdag.o
TARGETS += KokkosCore_PerformanceTest_TaskDAG
TEST_TARGETS += test-taskdag
#
OBJ_ATOMICS_MINMAX = test_atomic_minmax_simple.o
TARGETS += KokkosCore_PerformanceTest_Atomics_MinMax
TEST_TARGETS += test-atomic-minmax
#
KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest

View File

@ -25,8 +25,8 @@ template <class Type>
struct Dot {
using execution_space = typename Type::execution_space;
static_assert(static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
"Dot static_assert Fail: Rank != 1");
static_assert(static_cast<unsigned>(Type::rank) == static_cast<unsigned>(1),
"Dot static_assert Fail: rank != 1");
using value_type = double;
@ -56,8 +56,8 @@ template <class Type>
struct DotSingle {
using execution_space = typename Type::execution_space;
static_assert(static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
"DotSingle static_assert Fail: Rank != 1");
static_assert(static_cast<unsigned>(Type::rank) == static_cast<unsigned>(1),
"DotSingle static_assert Fail: rank != 1");
using value_type = double;
@ -88,13 +88,13 @@ template <class ScalarType, class VectorType>
struct Scale {
using execution_space = typename VectorType::execution_space;
static_assert(static_cast<unsigned>(ScalarType::Rank) ==
static_assert(static_cast<unsigned>(ScalarType::rank) ==
static_cast<unsigned>(0),
"Scale static_assert Fail: ScalarType::Rank != 0");
"Scale static_assert Fail: ScalarType::rank != 0");
static_assert(static_cast<unsigned>(VectorType::Rank) ==
static_assert(static_cast<unsigned>(VectorType::rank) ==
static_cast<unsigned>(1),
"Scale static_assert Fail: VectorType::Rank != 1");
"Scale static_assert Fail: VectorType::rank != 1");
#if 1
typename ScalarType::const_type alpha;
@ -115,17 +115,17 @@ template <class ScalarType, class ConstVectorType, class VectorType>
struct AXPBY {
using execution_space = typename VectorType::execution_space;
static_assert(static_cast<unsigned>(ScalarType::Rank) ==
static_assert(static_cast<unsigned>(ScalarType::rank) ==
static_cast<unsigned>(0),
"AXPBY static_assert Fail: ScalarType::Rank != 0");
"AXPBY static_assert Fail: ScalarType::rank != 0");
static_assert(static_cast<unsigned>(ConstVectorType::Rank) ==
static_assert(static_cast<unsigned>(ConstVectorType::rank) ==
static_cast<unsigned>(1),
"AXPBY static_assert Fail: ConstVectorType::Rank != 1");
"AXPBY static_assert Fail: ConstVectorType::rank != 1");
static_assert(static_cast<unsigned>(VectorType::Rank) ==
static_assert(static_cast<unsigned>(VectorType::rank) ==
static_cast<unsigned>(1),
"AXPBY static_assert Fail: VectorType::Rank != 1");
"AXPBY static_assert Fail: VectorType::rank != 1");
#if 1
typename ScalarType::const_type alpha, beta;

View File

@ -15,11 +15,11 @@
//@HEADER
#include <Kokkos_Core.hpp>
#include <gtest/gtest.h>
#include <PerfTest_Category.hpp>
#include <benchmark/benchmark.h>
#include "PerfTest_Category.hpp"
#include <cmath>
#include <PerfTestBlasKernels.hpp>
#include "PerfTestBlasKernels.hpp"
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -137,87 +137,61 @@ struct ModifiedGramSchmidt {
//--------------------------------------------------------------------------
static double test(const size_type length, const size_type count,
const size_t iter = 1) {
static double test(const size_type length, const size_type count) {
multivector_type Q_("Q", length, count);
multivector_type R_("R", count, count);
typename multivector_type::HostMirror A = Kokkos::create_mirror(Q_);
// Create and fill A on the host
for (size_type j = 0; j < count; ++j) {
for (size_type i = 0; i < length; ++i) {
A(i, j) = (i + 1) * (j + 1);
}
}
double dt_min = 0;
Kokkos::deep_copy(Q_, A);
for (size_t i = 0; i < iter; ++i) {
Kokkos::deep_copy(Q_, A);
// A = Q * R
const double dt = factorization(Q_, R_);
// A = Q * R
const double dt = factorization(Q_, R_);
if (0 == i)
dt_min = dt;
else
dt_min = dt < dt_min ? dt : dt_min;
}
return dt_min;
return dt;
}
};
template <class DeviceType>
void run_test_gramschmidt(int exp_beg, int exp_end, int num_trials,
const char deviceTypeName[]) {
std::string label_gramschmidt;
label_gramschmidt.append("\"GramSchmidt< double , ");
label_gramschmidt.append(deviceTypeName);
label_gramschmidt.append(" >\"");
template <class Scalar>
static void GramSchmidt(benchmark::State& state) {
const int parallel_work_length = state.range(0);
for (int i = exp_beg; i < exp_end; ++i) {
double min_seconds = 0.0;
double max_seconds = 0.0;
double avg_seconds = 0.0;
for (auto _ : state) {
const double seconds =
ModifiedGramSchmidt<Scalar, Kokkos::DefaultExecutionSpace>::test(
parallel_work_length, 32);
const int parallel_work_length = 1 << i;
for (int j = 0; j < num_trials; ++j) {
const double seconds = ModifiedGramSchmidt<double, DeviceType>::test(
parallel_work_length, 32);
if (0 == j) {
min_seconds = seconds;
max_seconds = seconds;
} else {
if (seconds < min_seconds) min_seconds = seconds;
if (seconds > max_seconds) max_seconds = seconds;
}
avg_seconds += seconds;
}
avg_seconds /= num_trials;
std::cout << label_gramschmidt << " , " << parallel_work_length << " , "
<< min_seconds << " , " << (min_seconds / parallel_work_length)
<< ", " << avg_seconds << std::endl;
state.SetIterationTime(seconds);
state.counters["Count"] = benchmark::Counter(parallel_work_length);
state.counters["Time normalized"] =
benchmark::Counter(seconds / parallel_work_length);
}
}
TEST(default_exec, gramschmidt) {
int exp_beg = 10;
int exp_end = 20;
int num_trials = 5;
if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1));
if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2));
if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
EXPECT_NO_THROW(run_test_gramschmidt<Kokkos::DefaultExecutionSpace>(
exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name()));
}
// Register GramSchmidt<double>. The single argument "Count" is taken from
// benchmark::CreateRange starting at 1 << 10 with multiplier 2; every
// configuration runs exactly 5 iterations and uses manually reported
// iteration times (UseManualTime).
// The two branches differ only in the upper bound of the range: 1 << 18 for
// SYCL on NVIDIA GPUs, 1 << 19 otherwise.
// FIXME_SYCL SYCL+Cuda reports "an illegal memory access was encountered"
#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
BENCHMARK(GramSchmidt<double>)
->ArgName("Count")
->ArgsProduct({
benchmark::CreateRange(1 << 10, 1 << 18, 2),
})
->UseManualTime()
->Iterations(5);
#else
BENCHMARK(GramSchmidt<double>)
->ArgName("Count")
->ArgsProduct({
benchmark::CreateRange(1 << 10, 1 << 19, 2),
})
->UseManualTime()
->Iterations(5);
#endif
} // namespace Test

View File

@ -15,8 +15,9 @@
//@HEADER
#include <Kokkos_Core.hpp>
#include <gtest/gtest.h>
#include <PerfTest_Category.hpp>
#include <benchmark/benchmark.h>
#include "Benchmark_Context.hpp"
#include "PerfTest_Category.hpp"
namespace Test {
@ -195,78 +196,43 @@ struct HexGrad {
//--------------------------------------------------------------------------
static double test(const int count, const int iter = 1) {
static double test(const int count) {
elem_coord_type coord("coord", count);
elem_grad_type grad("grad", count);
// Execute the parallel kernels on the arrays:
double dt_min = 0;
Kokkos::parallel_for(count, Init(coord));
execution_space().fence();
for (int i = 0; i < iter; ++i) {
Kokkos::Timer timer;
Kokkos::parallel_for(count, HexGrad<execution_space>(coord, grad));
execution_space().fence();
const double dt = timer.seconds();
if (0 == i)
dt_min = dt;
else
dt_min = dt < dt_min ? dt : dt_min;
}
return dt_min;
Kokkos::Timer timer;
Kokkos::parallel_for(count, HexGrad<execution_space>(coord, grad));
execution_space().fence();
return timer.seconds();
}
};
template <class DeviceType>
void run_test_hexgrad(int exp_beg, int exp_end, int num_trials,
const char deviceTypeName[]) {
std::string label_hexgrad;
label_hexgrad.append("\"HexGrad< double , ");
label_hexgrad.append(deviceTypeName);
label_hexgrad.append(" >\"");
template <class CoordScalarType>
static void HexGrad_Benchmark(benchmark::State& state) {
const auto parallel_work_length = state.range(0);
for (int i = exp_beg; i < exp_end; ++i) {
double min_seconds = 0.0;
double max_seconds = 0.0;
double avg_seconds = 0.0;
for (auto _ : state) {
const auto time =
HexGrad<Kokkos::DefaultExecutionSpace, CoordScalarType>::test(
parallel_work_length);
const int parallel_work_length = 1 << i;
for (int j = 0; j < num_trials; ++j) {
const double seconds = HexGrad<DeviceType>::test(parallel_work_length);
if (0 == j) {
min_seconds = seconds;
max_seconds = seconds;
} else {
if (seconds < min_seconds) min_seconds = seconds;
if (seconds > max_seconds) max_seconds = seconds;
}
avg_seconds += seconds;
}
avg_seconds /= num_trials;
std::cout << label_hexgrad << " , " << parallel_work_length << " , "
<< min_seconds << " , " << (min_seconds / parallel_work_length)
<< avg_seconds << std::endl;
state.SetIterationTime(time);
state.counters["Count"] = benchmark::Counter(parallel_work_length);
state.counters["Time normalized"] =
benchmark::Counter(time / parallel_work_length);
}
}
TEST(default_exec, hexgrad) {
int exp_beg = 10;
int exp_end = 20;
int num_trials = 5;
if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1));
if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2));
if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
EXPECT_NO_THROW(run_test_hexgrad<Kokkos::DefaultExecutionSpace>(
exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name()));
}
// Register HexGrad_Benchmark<double>. The single argument "count" is taken
// from benchmark::CreateRange(1 << 10, 1 << 19, 2); every configuration runs
// exactly 5 iterations and uses manually reported iteration times
// (UseManualTime).
BENCHMARK(HexGrad_Benchmark<double>)
->ArgName("count")
->ArgsProduct({
benchmark::CreateRange(1 << 10, 1 << 19, 2),
})
->UseManualTime()
->Iterations(5);
} // namespace Test

View File

@ -18,24 +18,7 @@
#include <cstdlib>
#include <Kokkos_Core.hpp>
namespace Test {
// Stores and reports the number of command line arguments.
// A positive n replaces the stored count; n <= 0 (including the default)
// leaves it untouched. The stored count is returned in either case and is 0
// until a positive value has been supplied.
int command_line_num_args(int n = 0) {
  static int stored_count = 0;
  stored_count = (n > 0) ? n : stored_count;
  return stored_count;
}
// Stores the argument vector and returns its k-th entry.
// A non-null input_args replaces the stored vector. The result is nullptr
// when no vector has been stored yet, when k is negative, or when k is not
// below the count kept by command_line_num_args().
const char* command_line_arg(int k, char** input_args = nullptr) {
  static char** args = nullptr;
  if (input_args != nullptr) args = input_args;
  // Never index a missing vector or step outside [0, count): the count and
  // the vector are registered by two independent calls.
  if (args == nullptr || k < 0 || k >= command_line_num_args()) {
    return nullptr;
  }
  return args[k];
}
} // namespace Test
#include <PerfTest_Category.hpp>
int main(int argc, char* argv[]) {
::testing::InitGoogleTest(&argc, argv);

View File

@ -17,12 +17,22 @@
#ifndef KOKKOS_TEST_PERFTEST_CAT_HPP
#define KOKKOS_TEST_PERFTEST_CAT_HPP
#include <gtest/gtest.h>
namespace Test {
extern int command_line_num_args(int n = 0);
extern const char* command_line_arg(int k, char** input_args = nullptr);
// Stores and reports the number of command line arguments.
// A positive n replaces the stored count; n <= 0 (including the default)
// leaves it untouched. The stored count is returned in either case and is 0
// until a positive value has been supplied.
inline int command_line_num_args(int n = 0) {
  static int stored_count = 0;
  stored_count = (n > 0) ? n : stored_count;
  return stored_count;
}
// Stores the argument vector and returns its k-th entry.
// A non-null input_args replaces the stored vector. The result is nullptr
// when no vector has been stored yet, when k is negative, or when k is not
// below the count kept by command_line_num_args().
inline const char* command_line_arg(int k, char** input_args = nullptr) {
  static char** args = nullptr;
  if (input_args != nullptr) args = input_args;
  // Never index a missing vector or step outside [0, count): the count and
  // the vector are registered by two independent calls.
  if (args == nullptr || k < 0 || k >= command_line_num_args()) {
    return nullptr;
  }
  return args[k];
}
} // namespace Test

View File

@ -15,14 +15,16 @@
//@HEADER
#include <Kokkos_Core.hpp>
#include <gtest/gtest.h>
#include <PerfTest_Category.hpp>
#include <benchmark/benchmark.h>
#include "Benchmark_Context.hpp"
#include "PerfTest_Category.hpp"
#include <Kokkos_Random.hpp>
#include <utility>
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
namespace Test {
template <class Scalar>
void custom_reduction_test(int N, int R, int num_trials) {
std::pair<double, Scalar> custom_reduction_test(int N, int R) {
Kokkos::Random_XorShift64_Pool<> rand_pool(183291);
Kokkos::View<Scalar*> a("A", N);
Kokkos::fill_random(a, rand_pool, 1.0);
@ -62,49 +64,70 @@ void custom_reduction_test(int N, int R, int num_trials) {
// Timing
Kokkos::Timer timer;
for (int r = 0; r < num_trials; r++) {
Kokkos::parallel_reduce(
Kokkos::TeamPolicy<>(N / 1024, team_size),
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team,
Scalar& lmax) {
Scalar team_max = Scalar(0);
for (int rr = 0; rr < R; rr++) {
int i = team.league_rank();
Kokkos::parallel_reduce(
Kokkos::TeamThreadRange(team, 32),
[&](const int& j, Scalar& thread_max) {
Scalar t_max = Scalar(0);
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(team, 32),
[&](const int& k, Scalar& max_) {
const Scalar val = a((i * 32 + j) * 32 + k);
if (val > max_) max_ = val;
if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5;
},
Kokkos::Max<Scalar>(t_max));
if (t_max > thread_max) thread_max = t_max;
},
Kokkos::Max<Scalar>(team_max));
}
if (team_max > lmax) lmax = team_max;
},
Kokkos::Max<Scalar>(max));
Kokkos::parallel_reduce(
Kokkos::TeamPolicy<>(N / 1024, team_size),
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team,
Scalar& lmax) {
Scalar team_max = Scalar(0);
for (int rr = 0; rr < R; rr++) {
int i = team.league_rank();
Kokkos::parallel_reduce(
Kokkos::TeamThreadRange(team, 32),
[&](const int& j, Scalar& thread_max) {
Scalar t_max = Scalar(0);
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(team, 32),
[&](const int& k, Scalar& max_) {
const Scalar val = a((i * 32 + j) * 32 + k);
if (val > max_) max_ = val;
if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5;
},
Kokkos::Max<Scalar>(t_max));
if (t_max > thread_max) thread_max = t_max;
},
Kokkos::Max<Scalar>(team_max));
}
if (team_max > lmax) lmax = team_max;
},
Kokkos::Max<Scalar>(max));
return std::make_pair(timer.seconds(), max);
}
// Value of N for the benchmark: command line argument 1 parsed with
// std::stoi when more than one argument was recorded, otherwise the
// benchmark's range(0).
int get_N(benchmark::State& state) {
  if (Test::command_line_num_args() > 1) {
    return std::stoi(Test::command_line_arg(1));
  }
  return static_cast<int>(state.range(0));
}
// Value of R for the benchmark: command line argument 2 parsed with
// std::stoi when more than two arguments were recorded, otherwise the
// benchmark's range(1).
int get_R(benchmark::State& state) {
  if (Test::command_line_num_args() > 2) {
    return std::stoi(Test::command_line_arg(2));
  }
  return static_cast<int>(state.range(1));
}
template <class Scalar>
static void CustomReduction(benchmark::State& state) {
int N = get_N(state);
int R = get_R(state);
for (auto _ : state) {
auto results = custom_reduction_test<double>(N, R);
// data processed in gigabytes
const double data_processed =
N * R * sizeof(Scalar) / results.first / 1'000'000'000;
state.SetIterationTime(results.first);
state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(
data_processed, benchmark::Counter::kIsIterationInvariantRate);
state.counters["Max"] = benchmark::Counter(results.second);
}
double time = timer.seconds();
printf("%e %e %e\n", time,
1.0 * N * R * num_trials * sizeof(Scalar) / time / 1024 / 1024 / 1024,
max);
}
TEST(default_exec, custom_reduction) {
int N = 100000;
int R = 1000;
int num_trials = 1;
// Register CustomReduction<double> with the two named arguments
// N = 100'000 and R = 1'000, using manually reported iteration times
// (UseManualTime).
BENCHMARK(CustomReduction<double>)
->ArgNames({"N", "R"})
->Args({100'000, 1'000})
->UseManualTime();
if (command_line_num_args() > 1) N = std::stoi(command_line_arg(1));
if (command_line_num_args() > 2) R = std::stoi(command_line_arg(2));
if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
custom_reduction_test<double>(N, R, num_trials);
}
} // namespace Test
#endif

File diff suppressed because it is too large Load Diff

View File

@ -15,119 +15,218 @@
//@HEADER
#include <Kokkos_Core.hpp>
#include <gtest/gtest.h>
#include <cstdio>
#include <PerfTest_Category.hpp>
#include <benchmark/benchmark.h>
#include "Benchmark_Context.hpp"
namespace Test {
static constexpr int N = 10;
template <class Layout>
void run_allocateview_tests(int N, int R) {
const int N1 = N;
const int N2 = N * N;
const int N3 = N2 * N;
const int N4 = N2 * N2;
const int N8 = N4 * N4;
static void ViewAllocate_Rank1(benchmark::State& state) {
const int N8 = std::pow(state.range(0), 8);
double time1, time2, time3, time4, time5, time6, time7, time8,
time_raw = 100000.0;
{
for (auto _ : state) {
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*, Layout> a("A1", N8);
}
time1 = timer.seconds() / R;
Kokkos::View<double*, Layout> a("A1", N8);
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
}
{
}
template <class Layout>
static void ViewAllocate_Rank2(benchmark::State& state) {
const int N4 = std::pow(state.range(0), 4);
for (auto _ : state) {
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double**, Layout> a("A2", N4, N4);
}
time2 = timer.seconds() / R;
Kokkos::View<double**, Layout> a("A2", N4, N4);
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
}
{
}
template <class Layout>
static void ViewAllocate_Rank3(benchmark::State& state) {
const int N2 = std::pow(state.range(0), 2);
const int N3 = std::pow(state.range(0), 3);
for (auto _ : state) {
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
}
time3 = timer.seconds() / R;
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
}
{
}
template <class Layout>
static void ViewAllocate_Rank4(benchmark::State& state) {
const int N2 = std::pow(state.range(0), 2);
for (auto _ : state) {
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
}
time4 = timer.seconds() / R;
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
}
{
}
template <class Layout>
static void ViewAllocate_Rank5(benchmark::State& state) {
const int N1 = state.range(0);
const int N2 = N1 * N1;
for (auto _ : state) {
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
}
time5 = timer.seconds() / R;
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
}
{
}
template <class Layout>
static void ViewAllocate_Rank6(benchmark::State& state) {
const int N1 = state.range(0);
const int N2 = N1 * N1;
for (auto _ : state) {
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
}
time6 = timer.seconds() / R;
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
}
{
}
template <class Layout>
static void ViewAllocate_Rank7(benchmark::State& state) {
const int N1 = state.range(0);
const int N2 = N1 * N1;
for (auto _ : state) {
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
}
time7 = timer.seconds() / R;
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
}
{
}
template <class Layout>
static void ViewAllocate_Rank8(benchmark::State& state) {
const int N1 = state.range(0);
for (auto _ : state) {
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
N1);
}
time8 = timer.seconds() / R;
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
N1);
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
}
}
// Times a raw Kokkos::kokkos_malloc of range(0)^8 doubles followed by a
// parallel zero-fill and a fence. Releasing the buffer happens after the
// timer has been read. The Layout parameter is not used by the body.
template <class Layout>
static void ViewAllocate_Raw(benchmark::State& state) {
  const int num_elements = std::pow(state.range(0), 8);

  for (auto _ : state) {
    Kokkos::Timer timer;
    double* raw = static_cast<double*>(
        Kokkos::kokkos_malloc("A", sizeof(double) * num_elements));
    Kokkos::parallel_for(
        num_elements, KOKKOS_LAMBDA(const int& idx) { raw[idx] = 0.0; });
    Kokkos::fence();
    const auto elapsed = timer.seconds();
    Kokkos::kokkos_free(raw);

    state.SetIterationTime(elapsed);

    // data processed in megabytes; the division is carried out on integers,
    // so the value is truncated to whole megabytes before becoming a double
    const auto bytes = 1 * num_elements * sizeof(double);
    const double megabytes = bytes / 1'000'000;

    state.counters["MB"] = benchmark::Counter(megabytes);
    state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(
        megabytes / 1'000, benchmark::Counter::kIsIterationInvariantRate);
  }
}
// Register the ViewAllocate_Rank1 .. ViewAllocate_Rank8 benchmarks, each one
// for Kokkos::LayoutLeft and for Kokkos::LayoutRight. Every registration
// passes N as its single argument (labelled "N" in the output) and selects
// manual timing, so the reported time is the one the benchmark body sets.
BENCHMARK(ViewAllocate_Rank1<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank1<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank2<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank2<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank3<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank3<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank4<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank4<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank5<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank5<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank6<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank6<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank7<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank7<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank8<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewAllocate_Rank8<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
double* a_ptr =
static_cast<double*>(Kokkos::kokkos_malloc("A", sizeof(double) * N8));
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; });
Kokkos::fence();
Kokkos::kokkos_free(a_ptr);
}
time_raw = timer.seconds() / R;
}
#endif
double size = 1.0 * N8 * 8 / 1024 / 1024;
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
size / 1024 / time_raw);
printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size,
size / 1024 / time1);
printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size,
size / 1024 / time2);
printf(" Rank3: %lf s %lf MB %lf GB/s\n", time3, size,
size / 1024 / time3);
printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size,
size / 1024 / time4);
printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size,
size / 1024 / time5);
printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size,
size / 1024 / time6);
printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size,
size / 1024 / time7);
printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size,
size / 1024 / time8);
}
BENCHMARK(ViewAllocate_Raw<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
TEST(default_exec, ViewCreate) {
printf("Create View Performance for LayoutLeft:\n");
run_allocateview_tests<Kokkos::LayoutLeft>(10, 1);
printf("Create View Performance for LayoutRight:\n");
run_allocateview_tests<Kokkos::LayoutRight>(10, 1);
}
BENCHMARK(ViewAllocate_Raw<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
#endif
} // namespace Test

View File

@ -18,32 +18,15 @@
#define KOKKOS_CORE_PERFTEST_BENCHMARK_VIEW_COPY_HPP
#include <Kokkos_Core.hpp>
#include <benchmark/benchmark.h>
#include "Benchmark_Context.hpp"
#include <cmath>
namespace Test {
/**
 * \brief Tag a counter label as a figure of merit.
 *
 * \param label counter name to tag.
 * \return the label prefixed with "FOM: ".
 */
inline std::string benchmark_fom(const std::string& label) {
  std::string tagged{"FOM: "};
  tagged += label;
  return tagged;
}
/**
 * \brief Publish the manual iteration time and the size/throughput counters
 * for one benchmark iteration.
 *
 * \param state benchmark state that receives the time and the counters.
 * \param num_elems number of double elements in the measured view.
 * \param time measured duration of the iteration, passed to SetIterationTime.
 */
inline void report_results(benchmark::State& state, std::size_t num_elems,
                           double time) {
  using Counter = benchmark::Counter;

  // View footprint in megabytes (decimal: 1000 * 1000 bytes).
  const double megabytes = 1.0 * num_elems * sizeof(double) / 1000 / 1000;
  // Data volume in gigabytes, counted as twice the view footprint.
  const double gigabytes_moved = 2 * megabytes / 1000;

  state.SetIterationTime(time);
  state.counters["MB"] = Counter(megabytes, Counter::kDefaults);
  state.counters[benchmark_fom("GB/s")] =
      Counter(gigabytes_moved, Counter::kIsIterationInvariantRate);
}
static constexpr int DATA_RATIO = 2;
template <class ViewTypeA, class ViewTypeB>
void deepcopy_view(ViewTypeA& a, ViewTypeB& b, benchmark::State& state) {
@ -51,7 +34,7 @@ void deepcopy_view(ViewTypeA& a, ViewTypeB& b, benchmark::State& state) {
Kokkos::fence();
Kokkos::Timer timer;
Kokkos::deep_copy(a, b);
report_results(state, a.size(), timer.seconds());
KokkosBenchmark::report_results(state, a, DATA_RATIO, timer.seconds());
}
}
@ -158,8 +141,7 @@ static void ViewDeepCopy_Raw(benchmark::State& state) {
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = b_ptr[i]; });
Kokkos::fence();
report_results(state, a.size(), timer.seconds());
KokkosBenchmark::report_results(state, a, DATA_RATIO, timer.seconds());
}
}

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,7 +14,7 @@
//
//@HEADER
#include <PerfTest_ViewCopy.hpp>
#include "PerfTest_ViewCopy.hpp"
namespace Test {

View File

@ -14,202 +14,115 @@
//
//@HEADER
#include <Kokkos_Core.hpp>
#include <gtest/gtest.h>
#include <cstdio>
#include <PerfTest_Category.hpp>
#include "Benchmark_Context.hpp"
#include <cmath>
namespace Test {
static constexpr int N = 10;
template <class ViewType>
double fill_view(ViewType& a, typename ViewType::const_value_type& val,
int repeat) {
Kokkos::Timer timer;
for (int i = 0; i < repeat; i++) {
void fill_view(ViewType& a, typename ViewType::const_value_type& val,
benchmark::State& state) {
for (auto _ : state) {
Kokkos::fence();
Kokkos::Timer timer;
Kokkos::deep_copy(a, val);
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
}
Kokkos::fence();
return timer.seconds();
}
template <class Layout>
void run_fillview_tests123(int N, int R) {
const int N1 = N;
static void ViewFill_Rank1(benchmark::State& state) {
const int N1 = state.range(0);
const int N2 = N1 * N1;
const int N4 = N2 * N2;
const int N8 = N4 * N4;
Kokkos::View<double*, Layout> a("A1", N8);
fill_view(a, 1.1, state);
}
/**
 * \brief Benchmark filling a rank-2 view of N^4 x N^4 doubles with 1.1,
 * where N is the first benchmark argument.
 *
 * \tparam Layout Kokkos layout of the view being filled.
 * \param state benchmark state; state.range(0) supplies N.
 */
template <class Layout>
static void ViewFill_Rank2(benchmark::State& state) {
  using view_type = Kokkos::View<double**, Layout>;

  const int base    = state.range(0);
  const int squared = base * base;
  const int extent  = squared * squared;  // N^4 per dimension

  view_type a("A2", extent, extent);
  fill_view(a, 1.1, state);
}
template <class Layout>
static void ViewFill_Rank3(benchmark::State& state) {
const int N1 = state.range(0);
const int N2 = N1 * N1;
const int N3 = N2 * N1;
const int N4 = N2 * N2;
const int N8 = N4 * N4;
double time1, time2, time3, time_raw = 100000.0;
{
Kokkos::View<double*, Layout> a("A1", N8);
time1 = fill_view(a, 1.1, R) / R;
}
{
Kokkos::View<double**, Layout> a("A2", N4, N4);
time2 = fill_view(a, 1.1, R) / R;
}
{
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
time3 = fill_view(a, 1.1, R) / R;
}
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::View<double*, Layout> a("A1", N8);
double* a_ptr = a.data();
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
}
Kokkos::fence();
time_raw = timer.seconds() / R;
}
#endif
double size = 1.0 * N8 * 8 / 1024 / 1024;
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
size / 1024 / time_raw);
printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size,
size / 1024 / time1);
printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size,
size / 1024 / time2);
printf(" Rank3: %lf s %lf MB %lf GB/s\n", time3, size,
size / 1024 / time3);
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
fill_view(a, 1.1, state);
}
template <class Layout>
void run_fillview_tests45(int N, int R) {
const int N1 = N;
static void ViewFill_Rank4(benchmark::State& state) {
const int N1 = state.range(0);
const int N2 = N1 * N1;
const int N4 = N2 * N2;
const int N8 = N4 * N4;
double time4, time5, time_raw = 100000.0;
{
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
time4 = fill_view(a, 1.1, R) / R;
}
{
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
time5 = fill_view(a, 1.1, R) / R;
}
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::View<double*, Layout> a("A1", N8);
double* a_ptr = a.data();
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
}
Kokkos::fence();
time_raw = timer.seconds() / R;
}
#endif
double size = 1.0 * N8 * 8 / 1024 / 1024;
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
size / 1024 / time_raw);
printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size,
size / 1024 / time4);
printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size,
size / 1024 / time5);
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
fill_view(a, 1.1, state);
}
template <class Layout>
void run_fillview_tests6(int N, int R) {
const int N1 = N;
static void ViewFill_Rank5(benchmark::State& state) {
const int N1 = state.range(0);
const int N2 = N1 * N1;
const int N4 = N2 * N2;
const int N8 = N4 * N4;
double time6, time_raw = 100000.0;
{
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
time6 = fill_view(a, 1.1, R) / R;
}
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::View<double*, Layout> a("A1", N8);
double* a_ptr = a.data();
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
}
Kokkos::fence();
time_raw = timer.seconds() / R;
}
#endif
double size = 1.0 * N8 * 8 / 1024 / 1024;
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
size / 1024 / time_raw);
printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size,
size / 1024 / time6);
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
fill_view(a, 1.1, state);
}
template <class Layout>
void run_fillview_tests7(int N, int R) {
const int N1 = N;
static void ViewFill_Rank6(benchmark::State& state) {
const int N1 = state.range(0);
const int N2 = N1 * N1;
const int N4 = N2 * N2;
const int N8 = N4 * N4;
double time7, time_raw = 100000.0;
{
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
time7 = fill_view(a, 1.1, R) / R;
}
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::View<double*, Layout> a("A1", N8);
double* a_ptr = a.data();
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
}
Kokkos::fence();
time_raw = timer.seconds() / R;
}
#endif
double size = 1.0 * N8 * 8 / 1024 / 1024;
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
size / 1024 / time_raw);
printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size,
size / 1024 / time7);
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
fill_view(a, 1.1, state);
}
template <class Layout>
void run_fillview_tests8(int N, int R) {
const int N1 = N;
static void ViewFill_Rank7(benchmark::State& state) {
const int N1 = state.range(0);
const int N2 = N1 * N1;
const int N4 = N2 * N2;
const int N8 = N4 * N4;
double time8, time_raw = 100000.0;
{
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
N1);
time8 = fill_view(a, 1.1, R) / R;
}
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::View<double*, Layout> a("A1", N8);
double* a_ptr = a.data();
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
fill_view(a, 1.1, state);
}
/**
 * \brief Benchmark filling a rank-8 view with extent N in every dimension
 * with 1.1, where N is the first benchmark argument.
 *
 * \tparam Layout Kokkos layout of the view being filled.
 * \param state benchmark state; state.range(0) supplies N.
 */
template <class Layout>
static void ViewFill_Rank8(benchmark::State& state) {
  using view_type = Kokkos::View<double********, Layout>;

  const int extent = state.range(0);

  view_type a("A8", extent, extent, extent, extent, extent, extent, extent,
              extent);
  fill_view(a, 1.1, state);
}
template <class Layout>
static void ViewFill_Raw(benchmark::State& state) {
const int N8 = std::pow(state.range(0), 8);
Kokkos::View<double*, Layout> a("A1", N8);
double* a_ptr = a.data();
for (auto _ : state) {
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
}
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
Kokkos::fence();
time_raw = timer.seconds() / R;
KokkosBenchmark::report_results(state, a, 1, timer.seconds());
}
#endif
double size = 1.0 * N8 * 8 / 1024 / 1024;
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
size / 1024 / time_raw);
printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size,
size / 1024 / time8);
}
} // namespace Test

View File

@ -14,13 +14,38 @@
//
//@HEADER
#include <PerfTest_ViewFill.hpp>
#include "PerfTest_ViewFill.hpp"
namespace Test {
TEST(default_exec, ViewFill_Rank123) {
printf("ViewFill Performance for LayoutLeft:\n");
run_fillview_tests123<Kokkos::LayoutLeft>(10, 1);
printf("ViewFill Performance for LayoutRight:\n");
run_fillview_tests123<Kokkos::LayoutRight>(10, 1);
}
BENCHMARK(ViewFill_Rank1<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewFill_Rank1<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewFill_Rank2<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewFill_Rank2<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewFill_Rank3<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewFill_Rank3<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
} // namespace Test

View File

@ -14,13 +14,28 @@
//
//@HEADER
#include <PerfTest_ViewFill.hpp>
#include "PerfTest_ViewFill.hpp"
namespace Test {
TEST(default_exec, ViewFill_Rank45) {
printf("ViewFill Performance for LayoutLeft:\n");
run_fillview_tests45<Kokkos::LayoutLeft>(10, 1);
printf("ViewFill Performance for LayoutRight:\n");
run_fillview_tests45<Kokkos::LayoutRight>(10, 1);
}
BENCHMARK(ViewFill_Rank4<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewFill_Rank4<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewFill_Rank5<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewFill_Rank5<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
} // namespace Test

View File

@ -14,13 +14,18 @@
//
//@HEADER
#include <PerfTest_ViewFill.hpp>
#include "PerfTest_ViewFill.hpp"
namespace Test {
TEST(default_exec, ViewFill_Rank6) {
printf("ViewFill Performance for LayoutLeft:\n");
run_fillview_tests6<Kokkos::LayoutLeft>(10, 1);
printf("ViewFill Performance for LayoutRight:\n");
run_fillview_tests6<Kokkos::LayoutRight>(10, 1);
}
BENCHMARK(ViewFill_Rank6<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewFill_Rank6<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
} // namespace Test

View File

@ -14,13 +14,18 @@
//
//@HEADER
#include <PerfTest_ViewFill.hpp>
#include "PerfTest_ViewFill.hpp"
namespace Test {
TEST(default_exec, ViewFill_Rank7) {
printf("ViewFill Performance for LayoutLeft:\n");
run_fillview_tests7<Kokkos::LayoutLeft>(10, 1);
printf("ViewFill Performance for LayoutRight:\n");
run_fillview_tests7<Kokkos::LayoutRight>(10, 1);
}
BENCHMARK(ViewFill_Rank7<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewFill_Rank7<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
} // namespace Test

View File

@ -14,13 +14,18 @@
//
//@HEADER
#include <PerfTest_ViewFill.hpp>
#include "PerfTest_ViewFill.hpp"
namespace Test {
TEST(default_exec, ViewFill_Rank8) {
printf("ViewFill Performance for LayoutLeft:\n");
run_fillview_tests8<Kokkos::LayoutLeft>(10, 1);
printf("ViewFill Performance for LayoutRight:\n");
run_fillview_tests8<Kokkos::LayoutRight>(10, 1);
}
BENCHMARK(ViewFill_Rank8<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewFill_Rank8<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
} // namespace Test

View File

@ -0,0 +1,33 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER
#include "PerfTest_ViewFill.hpp"
namespace Test {
// Register ViewFill_Raw for Kokkos::LayoutLeft and Kokkos::LayoutRight, each
// with the single argument N (labelled "N") and manual timing. The
// registrations are compiled only when CUDA is disabled or CUDA lambda
// support is enabled.
// NOTE(review): presumably because the ViewFill_Raw kernel is written as a
// KOKKOS_LAMBDA - confirm against PerfTest_ViewFill.hpp.
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
BENCHMARK(ViewFill_Raw<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();
BENCHMARK(ViewFill_Raw<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
#endif
} // namespace Test

View File

@ -15,346 +15,291 @@
//@HEADER
#include <Kokkos_Core.hpp>
#include <gtest/gtest.h>
#include <cstdio>
#include <PerfTest_Category.hpp>
#include <benchmark/benchmark.h>
#include <cmath>
#include "Benchmark_Context.hpp"
namespace Test {
template <class Layout>
void run_resizeview_tests123(int N, int R) {
const int N1 = N;
const int N2 = N1 * N1;
const int N3 = N2 * N1;
const int N4 = N2 * N2;
const int N8 = N4 * N4;
static constexpr int R = 10;
static constexpr int N = 10;
double time1, time2, time3, time_raw = 100000.0;
double time1_noinit, time2_noinit, time3_noinit;
{
Kokkos::View<double*, Layout> a("A1", N8);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*, Layout> a_(a);
Kokkos::resize(a_, int(N8 * 1.1));
}
time1 = timer.seconds() / R;
}
{
Kokkos::View<double**, Layout> a("A2", N4, N4);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double**, Layout> a_(a);
Kokkos::resize(a_, int(N4 * 1.1), N4);
}
time2 = timer.seconds() / R;
}
{
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double***, Layout> a_(a);
Kokkos::resize(a_, int(N3 * 1.1), N3, N2);
}
time3 = timer.seconds() / R;
}
{
Kokkos::View<double*, Layout> a("A1", N8);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*, Layout> a_(a);
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N8 * 1.1));
}
time1_noinit = timer.seconds() / R;
}
{
Kokkos::View<double**, Layout> a("A2", N4, N4);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double**, Layout> a_(a);
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N4 * 1.1), N4);
}
time2_noinit = timer.seconds() / R;
}
{
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double***, Layout> a_(a);
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N3 * 1.1), N3, N2);
}
time3_noinit = timer.seconds() / R;
}
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::View<double*, Layout> a("A1", N8);
double* a_ptr = a.data();
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*, Layout> a1(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
double* a1_ptr = a1.data();
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
Kokkos::fence();
}
template <class Layout>
static void ViewResize_Rank1(benchmark::State& state) {
const int N8 = std::pow(state.range(0), 8);
Kokkos::View<double*, Layout> a("A1", N8);
Kokkos::View<double*, Layout> a_(a);
for (auto _ : state) {
Kokkos::fence();
time_raw = timer.seconds() / R;
Kokkos::Timer timer;
Kokkos::resize(a_, int(N8 * 1.1));
Kokkos::fence();
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
}
#endif
double size = 1.0 * N8 * 8 / 1024 / 1024;
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
2.0 * size / 1024 / time_raw);
printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size,
2.0 * size / 1024 / time1);
printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size,
2.0 * size / 1024 / time2);
printf(" Rank3: %lf s %lf MB %lf GB/s\n", time3, size,
2.0 * size / 1024 / time3);
printf(" Rank1 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
time1_noinit, size, 2.0 * size / 1024 / time1_noinit);
printf(" Rank2 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
time2_noinit, size, 2.0 * size / 1024 / time2_noinit);
printf(" Rank3 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
time3_noinit, size, 2.0 * size / 1024 / time3_noinit);
}
template <class Layout>
void run_resizeview_tests45(int N, int R) {
const int N1 = N;
const int N2 = N1 * N1;
const int N4 = N2 * N2;
const int N8 = N4 * N4;
static void ViewResize_Rank2(benchmark::State& state) {
const int N4 = std::pow(state.range(0), 4);
Kokkos::View<double**, Layout> a("A2", N4, N4);
Kokkos::View<double**, Layout> a_(a);
double time4, time5, time_raw = 100000.0;
double time4_noinit, time5_noinit;
{
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double****, Layout> a_(a);
Kokkos::resize(a_, int(N2 * 1.1), N2, N2, N2);
}
time4 = timer.seconds() / R;
}
{
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*****, Layout> a_(a);
Kokkos::resize(a_, int(N2 * 1.1), N2, N1, N1, N2);
}
time5 = timer.seconds() / R;
}
{
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double****, Layout> a_(a);
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N2,
N2);
}
time4_noinit = timer.seconds() / R;
}
{
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*****, Layout> a_(a);
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N1, N1,
N2);
}
time5_noinit = timer.seconds() / R;
}
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::View<double*, Layout> a("A1", N8);
double* a_ptr = a.data();
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*, Layout> a1(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
double* a1_ptr = a1.data();
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
Kokkos::fence();
}
for (auto _ : state) {
Kokkos::fence();
time_raw = timer.seconds() / R;
Kokkos::Timer timer;
Kokkos::resize(a_, int(N4 * 1.1), N4);
Kokkos::fence();
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
}
#endif
double size = 1.0 * N8 * 8 / 1024 / 1024;
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
2.0 * size / 1024 / time_raw);
printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size,
2.0 * size / 1024 / time4);
printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size,
2.0 * size / 1024 / time5);
printf(" Rank4 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
time4_noinit, size, 2.0 * size / 1024 / time4_noinit);
printf(" Rank5 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
time5_noinit, size, 2.0 * size / 1024 / time5_noinit);
}
template <class Layout>
void run_resizeview_tests6(int N, int R) {
const int N1 = N;
const int N2 = N1 * N1;
const int N4 = N2 * N2;
const int N8 = N4 * N4;
static void ViewResize_Rank3(benchmark::State& state) {
const int N2 = std::pow(state.range(0), 2);
const int N3 = std::pow(state.range(0), 3);
Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
Kokkos::View<double***, Layout> a_(a);
double time6, time6_noinit, time_raw = 100000.0;
{
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double******, Layout> a_(a);
Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N2);
}
time6 = timer.seconds() / R;
}
{
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double******, Layout> a_(a);
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1,
N1, N2);
}
time6_noinit = timer.seconds() / R;
}
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::View<double*, Layout> a("A1", N8);
double* a_ptr = a.data();
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*, Layout> a1(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
double* a1_ptr = a1.data();
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
Kokkos::fence();
}
for (auto _ : state) {
Kokkos::fence();
time_raw = timer.seconds() / R;
Kokkos::Timer timer;
Kokkos::resize(a_, int(N3 * 1.1), N3, N2);
Kokkos::fence();
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
}
#endif
double size = 1.0 * N8 * 8 / 1024 / 1024;
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
2.0 * size / 1024 / time_raw);
printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size,
2.0 * size / 1024 / time6);
printf(" Rank6 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
time6_noinit, size, 2.0 * size / 1024 / time6_noinit);
}
template <class Layout>
void run_resizeview_tests7(int N, int R) {
const int N1 = N;
const int N2 = N1 * N1;
const int N4 = N2 * N2;
const int N8 = N4 * N4;
static void ViewResize_Rank4(benchmark::State& state) {
const int N2 = std::pow(state.range(0), 2);
Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
Kokkos::View<double****, Layout> a_(a);
double time7, time7_noinit, time_raw = 100000.0;
{
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*******, Layout> a_(a);
Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N1, N1);
}
time7 = timer.seconds() / R;
}
{
Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*******, Layout> a_(a);
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1,
N1, N1, N1);
}
time7_noinit = timer.seconds() / R;
}
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::View<double*, Layout> a("A1", N8);
double* a_ptr = a.data();
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*, Layout> a1(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
double* a1_ptr = a1.data();
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
Kokkos::fence();
}
for (auto _ : state) {
Kokkos::fence();
time_raw = timer.seconds() / R;
Kokkos::Timer timer;
Kokkos::resize(a_, int(N2 * 1.1), N2, N2, N2);
Kokkos::fence();
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
}
#endif
double size = 1.0 * N8 * 8 / 1024 / 1024;
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
2.0 * size / 1024 / time_raw);
printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size,
2.0 * size / 1024 / time7);
printf(" Rank7 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
time7_noinit, size, 2.0 * size / 1024 / time7_noinit);
}
template <class Layout>
void run_resizeview_tests8(int N, int R) {
const int N1 = N;
static void ViewResize_Rank5(benchmark::State& state) {
const int N1 = state.range(0);
const int N2 = N1 * N1;
const int N4 = N2 * N2;
const int N8 = N4 * N4;
double time8, time8_noinit, time_raw = 100000.0;
{
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
N1);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double********, Layout> a_(a);
Kokkos::resize(a_, int(N1 * 1.1), N1, N1, N1, N1, N1, N1, N1);
}
time8 = timer.seconds() / R;
}
{
Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
N1);
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double********, Layout> a_(a);
Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N1 * 1.1), N1, N1, N1,
N1, N1, N1, N1);
}
time8_noinit = timer.seconds() / R;
}
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::View<double*, Layout> a("A1", N8);
double* a_ptr = a.data();
Kokkos::Timer timer;
for (int r = 0; r < R; r++) {
Kokkos::View<double*, Layout> a1(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
double* a1_ptr = a1.data();
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
Kokkos::fence();
}
Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
Kokkos::View<double*****, Layout> a_(a);
for (auto _ : state) {
Kokkos::fence();
time_raw = timer.seconds() / R;
Kokkos::Timer timer;
Kokkos::resize(a_, int(N2 * 1.1), N2, N1, N1, N2);
Kokkos::fence();
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
}
}
/**
 * \brief Benchmark Kokkos::resize on a rank-6 view of N^8 doubles, growing
 * the leading extent by about 10%. N is the first benchmark argument.
 *
 * \tparam Layout Kokkos layout of the view being resized.
 * \param state benchmark state; state.range(0) supplies N.
 */
template <class Layout>
static void ViewResize_Rank6(benchmark::State& state) {
  const int N1 = state.range(0);
  const int N2 = N1 * N1;

  Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);

  for (auto _ : state) {
    // Start each iteration from a fresh handle to the original allocation.
    // With one handle shared by all iterations, only the first resize would
    // change extents; the later ones would request the size it already has.
    // NOTE(review): relies on Kokkos::resize skipping the reallocation when
    // the extents already match - confirm against the Kokkos version in use.
    Kokkos::View<double******, Layout> a_(a);

    Kokkos::fence();
    Kokkos::Timer timer;
    Kokkos::resize(a_, static_cast<int>(N2 * 1.1), N1, N1, N1, N1, N2);
    Kokkos::fence();
    // The resized allocation is released when a_ leaves scope, after the
    // time has been read.
    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
  }
}
/**
 * \brief Benchmark Kokkos::resize on a rank-7 view of N^8 doubles, growing
 * the leading extent by about 10%. N is the first benchmark argument.
 *
 * \tparam Layout Kokkos layout of the view being resized.
 * \param state benchmark state; state.range(0) supplies N.
 */
template <class Layout>
static void ViewResize_Rank7(benchmark::State& state) {
  const int N1 = state.range(0);
  const int N2 = N1 * N1;

  Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);

  for (auto _ : state) {
    // Start each iteration from a fresh handle to the original allocation.
    // With one handle shared by all iterations, only the first resize would
    // change extents; the later ones would request the size it already has.
    // NOTE(review): relies on Kokkos::resize skipping the reallocation when
    // the extents already match - confirm against the Kokkos version in use.
    Kokkos::View<double*******, Layout> a_(a);

    Kokkos::fence();
    Kokkos::Timer timer;
    Kokkos::resize(a_, static_cast<int>(N2 * 1.1), N1, N1, N1, N1, N1, N1);
    Kokkos::fence();
    // The resized allocation is released when a_ leaves scope, after the
    // time has been read.
    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
  }
}
// Times Kokkos::resize on a rank-8 View of double. The leading extent grows
// from N1 to int(N1 * 1.1); the seven trailing extents stay at N1.
// Each iteration reports its elapsed time through
// KokkosBenchmark::report_results, passing the pre-resize view and a data
// ratio of 2.
template <class Layout>
static void ViewResize_Rank8(benchmark::State& state) {
  using view_type = Kokkos::View<double********, Layout>;

  const int N1 = state.range(0);
  // Leading extent requested from resize (10% growth, truncated to int).
  const int grown = int(N1 * 1.1);

  view_type original("A8", N1, N1, N1, N1, N1, N1, N1, N1);
  view_type resized(original);

  // NOTE(review): `resized` is created once, before the loop, so every
  // iteration after the first requests the extents it already has — confirm
  // that this is the intended measurement.
  for (auto _ : state) {
    Kokkos::fence();
    Kokkos::Timer stopwatch;
    Kokkos::resize(resized, grown, N1, N1, N1, N1, N1, N1, N1);
    Kokkos::fence();
    const auto elapsed = stopwatch.seconds();
    KokkosBenchmark::report_results(state, original, 2, elapsed);
  }
}
// Times Kokkos::resize with Kokkos::WithoutInitializing on a rank-1 View of
// double. The single extent is range(0)^8 and grows to int(N8 * 1.1).
// Each iteration reports its elapsed time through
// KokkosBenchmark::report_results, passing the pre-resize view and a data
// ratio of 2.
template <class Layout>
static void ViewResize_NoInit_Rank1(benchmark::State& state) {
  using view_type = Kokkos::View<double*, Layout>;

  const int N8 = std::pow(state.range(0), 8);
  // Extent requested from resize (10% growth, truncated to int).
  const int grown = int(N8 * 1.1);

  view_type original("A1", N8);
  view_type resized(original);

  // NOTE(review): `resized` is created once, before the loop, so every
  // iteration after the first requests the extent it already has — confirm
  // that this is the intended measurement.
  for (auto _ : state) {
    Kokkos::fence();
    Kokkos::Timer stopwatch;
    Kokkos::resize(Kokkos::WithoutInitializing, resized, grown);
    Kokkos::fence();
    const auto elapsed = stopwatch.seconds();
    KokkosBenchmark::report_results(state, original, 2, elapsed);
  }
}
// Times Kokkos::resize with Kokkos::WithoutInitializing on a rank-2 View of
// double. Both extents start at range(0)^4; the leading one grows to
// int(N4 * 1.1).
// Each iteration reports its elapsed time through
// KokkosBenchmark::report_results, passing the pre-resize view and a data
// ratio of 2.
template <class Layout>
static void ViewResize_NoInit_Rank2(benchmark::State& state) {
  using view_type = Kokkos::View<double**, Layout>;

  const int N4 = std::pow(state.range(0), 4);
  // Leading extent requested from resize (10% growth, truncated to int).
  const int grown = int(N4 * 1.1);

  view_type original("A2", N4, N4);
  view_type resized(original);

  // NOTE(review): `resized` is created once, before the loop, so every
  // iteration after the first requests the extents it already has — confirm
  // that this is the intended measurement.
  for (auto _ : state) {
    Kokkos::fence();
    Kokkos::Timer stopwatch;
    Kokkos::resize(Kokkos::WithoutInitializing, resized, grown, N4);
    Kokkos::fence();
    const auto elapsed = stopwatch.seconds();
    KokkosBenchmark::report_results(state, original, 2, elapsed);
  }
}
// Times Kokkos::resize with Kokkos::WithoutInitializing on a rank-3 View of
// double with extents (N3, N3, N2), where N2 = range(0)^2 and
// N3 = range(0)^3. The leading extent grows to int(N3 * 1.1).
// Each iteration reports its elapsed time through
// KokkosBenchmark::report_results, passing the pre-resize view and a data
// ratio of 2.
template <class Layout>
static void ViewResize_NoInit_Rank3(benchmark::State& state) {
  using view_type = Kokkos::View<double***, Layout>;

  const int N2 = std::pow(state.range(0), 2);
  const int N3 = std::pow(state.range(0), 3);
  // Leading extent requested from resize (10% growth, truncated to int).
  const int grown = int(N3 * 1.1);

  view_type original("A3", N3, N3, N2);
  view_type resized(original);

  // NOTE(review): `resized` is created once, before the loop, so every
  // iteration after the first requests the extents it already has — confirm
  // that this is the intended measurement.
  for (auto _ : state) {
    Kokkos::fence();
    Kokkos::Timer stopwatch;
    Kokkos::resize(Kokkos::WithoutInitializing, resized, grown, N3, N2);
    Kokkos::fence();
    const auto elapsed = stopwatch.seconds();
    KokkosBenchmark::report_results(state, original, 2, elapsed);
  }
}
// Times Kokkos::resize with Kokkos::WithoutInitializing on a rank-4 View of
// double whose four extents all start at range(0)^2. The leading extent
// grows to int(N2 * 1.1).
// Each iteration reports its elapsed time through
// KokkosBenchmark::report_results, passing the pre-resize view and a data
// ratio of 2.
template <class Layout>
static void ViewResize_NoInit_Rank4(benchmark::State& state) {
  using view_type = Kokkos::View<double****, Layout>;

  const int N2 = std::pow(state.range(0), 2);
  // Leading extent requested from resize (10% growth, truncated to int).
  const int grown = int(N2 * 1.1);

  view_type original("A4", N2, N2, N2, N2);
  view_type resized(original);

  // NOTE(review): `resized` is created once, before the loop, so every
  // iteration after the first requests the extents it already has — confirm
  // that this is the intended measurement.
  for (auto _ : state) {
    Kokkos::fence();
    Kokkos::Timer stopwatch;
    Kokkos::resize(Kokkos::WithoutInitializing, resized, grown, N2, N2, N2);
    Kokkos::fence();
    const auto elapsed = stopwatch.seconds();
    KokkosBenchmark::report_results(state, original, 2, elapsed);
  }
}
// Times Kokkos::resize with Kokkos::WithoutInitializing on a rank-5 View of
// double with extents (N2, N2, N1, N1, N2), where N1 = range(0) and
// N2 = N1 * N1. The leading extent grows to int(N2 * 1.1).
// Each iteration reports its elapsed time through
// KokkosBenchmark::report_results, passing the pre-resize view and a data
// ratio of 2.
template <class Layout>
static void ViewResize_NoInit_Rank5(benchmark::State& state) {
  using view_type = Kokkos::View<double*****, Layout>;

  const int N1 = state.range(0);
  const int N2 = N1 * N1;
  // Leading extent requested from resize (10% growth, truncated to int).
  const int grown = int(N2 * 1.1);

  view_type original("A5", N2, N2, N1, N1, N2);
  view_type resized(original);

  // NOTE(review): `resized` is created once, before the loop, so every
  // iteration after the first requests the extents it already has — confirm
  // that this is the intended measurement.
  for (auto _ : state) {
    Kokkos::fence();
    Kokkos::Timer stopwatch;
    Kokkos::resize(Kokkos::WithoutInitializing, resized, grown, N2, N1, N1,
                   N2);
    Kokkos::fence();
    const auto elapsed = stopwatch.seconds();
    KokkosBenchmark::report_results(state, original, 2, elapsed);
  }
}
// Times Kokkos::resize with Kokkos::WithoutInitializing on a rank-6 View of
// double with extents (N2, N1, N1, N1, N1, N2), where N1 = range(0) and
// N2 = N1 * N1. The leading extent grows to int(N2 * 1.1).
// Each iteration reports its elapsed time through
// KokkosBenchmark::report_results, passing the pre-resize view and a data
// ratio of 2.
template <class Layout>
static void ViewResize_NoInit_Rank6(benchmark::State& state) {
  using view_type = Kokkos::View<double******, Layout>;

  const int N1 = state.range(0);
  const int N2 = N1 * N1;
  // Leading extent requested from resize (10% growth, truncated to int).
  const int grown = int(N2 * 1.1);

  view_type original("A6", N2, N1, N1, N1, N1, N2);
  view_type resized(original);

  // NOTE(review): `resized` is created once, before the loop, so every
  // iteration after the first requests the extents it already has — confirm
  // that this is the intended measurement.
  for (auto _ : state) {
    Kokkos::fence();
    Kokkos::Timer stopwatch;
    Kokkos::resize(Kokkos::WithoutInitializing, resized, grown, N1, N1, N1,
                   N1, N2);
    Kokkos::fence();
    const auto elapsed = stopwatch.seconds();
    KokkosBenchmark::report_results(state, original, 2, elapsed);
  }
}
// Times Kokkos::resize with Kokkos::WithoutInitializing on a rank-7 View of
// double with extents (N2, N1, N1, N1, N1, N1, N1), where N1 = range(0) and
// N2 = N1 * N1. The leading extent grows to int(N2 * 1.1).
// Each iteration reports its elapsed time through
// KokkosBenchmark::report_results, passing the pre-resize view and a data
// ratio of 2.
template <class Layout>
static void ViewResize_NoInit_Rank7(benchmark::State& state) {
  using view_type = Kokkos::View<double*******, Layout>;

  const int N1 = state.range(0);
  const int N2 = N1 * N1;
  // Leading extent requested from resize (10% growth, truncated to int).
  const int grown = int(N2 * 1.1);

  view_type original("A7", N2, N1, N1, N1, N1, N1, N1);
  view_type resized(original);

  // NOTE(review): `resized` is created once, before the loop, so every
  // iteration after the first requests the extents it already has — confirm
  // that this is the intended measurement.
  for (auto _ : state) {
    Kokkos::fence();
    Kokkos::Timer stopwatch;
    Kokkos::resize(Kokkos::WithoutInitializing, resized, grown, N1, N1, N1,
                   N1, N1, N1);
    Kokkos::fence();
    const auto elapsed = stopwatch.seconds();
    KokkosBenchmark::report_results(state, original, 2, elapsed);
  }
}
// Times Kokkos::resize with Kokkos::WithoutInitializing on a rank-8 View of
// double whose eight extents all start at N1 = range(0). The leading extent
// grows to int(N1 * 1.1).
// Each iteration reports its elapsed time through
// KokkosBenchmark::report_results, passing the pre-resize view and a data
// ratio of 2.
template <class Layout>
static void ViewResize_NoInit_Rank8(benchmark::State& state) {
  using view_type = Kokkos::View<double********, Layout>;

  const int N1 = state.range(0);
  // Leading extent requested from resize (10% growth, truncated to int).
  const int grown = int(N1 * 1.1);

  view_type original("A8", N1, N1, N1, N1, N1, N1, N1, N1);
  view_type resized(original);

  // NOTE(review): `resized` is created once, before the loop, so every
  // iteration after the first requests the extents it already has — confirm
  // that this is the intended measurement.
  for (auto _ : state) {
    Kokkos::fence();
    Kokkos::Timer stopwatch;
    Kokkos::resize(Kokkos::WithoutInitializing, resized, grown, N1, N1, N1,
                   N1, N1, N1, N1);
    Kokkos::fence();
    const auto elapsed = stopwatch.seconds();
    KokkosBenchmark::report_results(state, original, 2, elapsed);
  }
}
template <class Layout>
static void ViewResize_NoInit_Raw(benchmark::State& state) {
const int N8 = std::pow(state.range(0), 8);
Kokkos::View<double*, Layout> a("A1", N8);
double* a_ptr = a.data();
for (auto _ : state) {
Kokkos::Timer timer;
Kokkos::View<double*, Layout> a1(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
double* a1_ptr = a1.data();
Kokkos::parallel_for(
N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
Kokkos::fence();
KokkosBenchmark::report_results(state, a, 2, timer.seconds());
}
#endif
double size = 1.0 * N8 * 8 / 1024 / 1024;
printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size,
2.0 * size / 1024 / time_raw);
printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size,
2.0 * size / 1024 / time8);
printf(" Rank8 (WithoutInitializing): %lf s %lf MB %lf GB/s\n",
time8_noinit, size, 2.0 * size / 1024 / time8_noinit);
}
} // namespace Test

View File

@ -14,15 +14,80 @@
//
//@HEADER
#include <PerfTest_ViewResize.hpp>
#include "PerfTest_ViewResize.hpp"
namespace Test {
TEST(default_exec, ViewResize_Rank123) {
printf("Resize View Performance for LayoutLeft:\n");
run_resizeview_tests123<Kokkos::LayoutLeft>(10, 1);
printf("Resize View Performance for LayoutRight:\n");
run_resizeview_tests123<Kokkos::LayoutRight>(10, 1);
}
// Register the rank-1, rank-2 and rank-3 resize benchmarks, first the
// initializing variants and then the WithoutInitializing ones, each for
// LayoutLeft and LayoutRight. Every benchmark takes a single argument named
// "N", uses manually reported iteration times, and runs a fixed R iterations.
// NOTE(review): N and R are not defined in this file — presumably they come
// from PerfTest_ViewResize.hpp; confirm.
BENCHMARK(ViewResize_Rank1<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_Rank1<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_Rank2<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_Rank2<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_Rank3<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_Rank3<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank1<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank1<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank2<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank2<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank3<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank3<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
} // namespace Test

View File

@ -14,15 +14,56 @@
//
//@HEADER
#include <PerfTest_ViewResize.hpp>
#include "PerfTest_ViewResize.hpp"
namespace Test {
TEST(default_exec, ViewResize_Rank_45) {
printf("Resize View Performance for LayoutLeft:\n");
run_resizeview_tests45<Kokkos::LayoutLeft>(10, 1);
printf("Resize View Performance for LayoutRight:\n");
run_resizeview_tests45<Kokkos::LayoutRight>(10, 1);
}
// Register the rank-4 and rank-5 resize benchmarks, first the initializing
// variants and then the WithoutInitializing ones, each for LayoutLeft and
// LayoutRight. Every benchmark takes a single argument named "N", uses
// manually reported iteration times, and runs a fixed R iterations.
// NOTE(review): N and R are not defined in this file — presumably they come
// from PerfTest_ViewResize.hpp; confirm.
BENCHMARK(ViewResize_Rank4<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_Rank4<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_Rank5<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_Rank5<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank4<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank4<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank5<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank5<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
} // namespace Test

View File

@ -14,15 +14,32 @@
//
//@HEADER
#include <PerfTest_ViewResize.hpp>
#include "PerfTest_ViewResize.hpp"
namespace Test {
TEST(default_exec, ViewResize_Rank6) {
printf("Resize View Performance for LayoutLeft:\n");
run_resizeview_tests6<Kokkos::LayoutLeft>(10, 1);
printf("Resize View Performance for LayoutRight:\n");
run_resizeview_tests6<Kokkos::LayoutRight>(10, 1);
}
// Register the rank-6 resize benchmarks (initializing and
// WithoutInitializing) for LayoutLeft and LayoutRight. Every benchmark takes
// a single argument named "N", uses manually reported iteration times, and
// runs a fixed R iterations.
// NOTE(review): N and R are not defined in this file — presumably they come
// from PerfTest_ViewResize.hpp; confirm.
BENCHMARK(ViewResize_Rank6<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_Rank6<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank6<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank6<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
} // namespace Test

View File

@ -14,15 +14,32 @@
//
//@HEADER
#include <PerfTest_ViewResize.hpp>
#include "PerfTest_ViewResize.hpp"
namespace Test {
TEST(default_exec, ViewResize_Rank7) {
printf("Resize View Performance for LayoutLeft:\n");
run_resizeview_tests7<Kokkos::LayoutLeft>(10, 1);
printf("Resize View Performance for LayoutRight:\n");
run_resizeview_tests7<Kokkos::LayoutRight>(10, 1);
}
// Register the rank-7 resize benchmarks (initializing and
// WithoutInitializing) for LayoutLeft and LayoutRight. Every benchmark takes
// a single argument named "N", uses manually reported iteration times, and
// runs a fixed R iterations.
// NOTE(review): N and R are not defined in this file — presumably they come
// from PerfTest_ViewResize.hpp; confirm.
BENCHMARK(ViewResize_Rank7<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_Rank7<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank7<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank7<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
} // namespace Test

View File

@ -14,23 +14,39 @@
//
//@HEADER
#include <PerfTest_ViewResize.hpp>
#include "PerfTest_ViewResize.hpp"
namespace Test {
TEST(default_exec, ViewResize_Rank8) {
// FIXME_SYCL Avoid running out of resources on the CUDA GPU used in the CI
#ifdef KOKKOS_ENABLE_SYCL
printf("Resize View Performance for LayoutLeft:\n");
run_resizeview_tests8<Kokkos::LayoutLeft>(9, 1);
printf("Resize View Performance for LayoutRight:\n");
run_resizeview_tests8<Kokkos::LayoutRight>(9, 1);
static constexpr int N_8 = N - 1;
#else
printf("Resize View Performance for LayoutLeft:\n");
run_resizeview_tests8<Kokkos::LayoutLeft>(10, 1);
printf("Resize View Performance for LayoutRight:\n");
run_resizeview_tests8<Kokkos::LayoutRight>(10, 1);
static constexpr int N_8 = N;
#endif
}
// Register the rank-8 resize benchmarks (initializing and
// WithoutInitializing) for LayoutLeft and LayoutRight. Unlike the lower-rank
// registrations these pass N_8 rather than N as the "N" argument; N_8 is the
// per-backend problem size selected by the preprocessor block above.
// Every benchmark uses manually reported iteration times and runs a fixed
// R iterations.
// NOTE(review): N and R are not defined in this file — presumably they come
// from PerfTest_ViewResize.hpp; confirm.
BENCHMARK(ViewResize_Rank8<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N_8)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_Rank8<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N_8)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank8<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N_8)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Rank8<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N_8)
->UseManualTime()
->Iterations(R);
} // namespace Test

View File

@ -0,0 +1,35 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER
#include "PerfTest_ViewResize.hpp"
namespace Test {
// Register the raw allocate-and-copy baseline (ViewResize_NoInit_Raw) for
// LayoutLeft and LayoutRight. The registrations are compiled only when CUDA
// is disabled or CUDA lambda support is enabled, because the benchmark body
// uses KOKKOS_LAMBDA.
// NOTE(review): N and R are not defined in this file — presumably they come
// from PerfTest_ViewResize.hpp; confirm.
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
BENCHMARK(ViewResize_NoInit_Raw<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
BENCHMARK(ViewResize_NoInit_Raw<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime()
->Iterations(R);
#endif
} // namespace Test

View File

@ -18,38 +18,14 @@
#include <cstring>
#include <cstdlib>
#include <benchmark/benchmark.h>
#include "Benchmark_Context.hpp"
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>
using exec_space = Kokkos::DefaultExecutionSpace;
#define RESET 0
#define BRIGHT 1
#define DIM 2
#define UNDERLINE 3
#define BLINK 4
#define REVERSE 7
#define HIDDEN 8
#define BLACK 0
#define RED 1
#define GREEN 2
#define YELLOW 3
#define BLUE 4
#define MAGENTA 5
#define CYAN 6
#define GREY 7
#define WHITE 8
void textcolor(int attr, int fg, int bg) {
char command[40];
/* Command is the control command to the terminal */
snprintf(command, 40, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);
printf("%s", command);
}
void textcolor_standard() { textcolor(RESET, BLACK, WHITE); }
template <class T, class DEVICE_TYPE>
struct ZeroFunctor {
using execution_space = DEVICE_TYPE;
@ -370,7 +346,9 @@ T LoopVariantNonAtomic(int loop, int test) {
}
template <class T>
void Loop(int loop, int test, const char* type_name) {
void Loop(benchmark::State& state, int test) {
int loop = state.range(0);
LoopVariant<T>(loop, test);
Kokkos::Timer timer;
@ -388,86 +366,36 @@ void Loop(int loop, int test, const char* type_name) {
time *= 1e6 / loop;
timeNonAtomic *= 1e6 / loop;
timeSerial *= 1e6 / loop;
// textcolor_standard();
bool passed = true;
if (resSerial != res) passed = false;
// if(!passed) textcolor(RESET,BLACK,YELLOW);
printf(
"%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e "
"%7.4e Size of Type %i)",
type_name, test, passed ? "PASSED" : "FAILED", loop, 1.0 * resSerial,
1.0 * res, 1.0 * resNonAtomic, timeSerial, time, timeNonAtomic,
(int)sizeof(T));
// if(!passed) textcolor_standard();
printf("\n");
bool passed = (resSerial == res);
state.counters["Passed"] = benchmark::Counter(passed);
state.counters["Value serial"] = benchmark::Counter(resSerial);
state.counters["Value atomic"] = benchmark::Counter(res);
state.counters["Value non-atomic"] = benchmark::Counter(resNonAtomic);
state.counters["Time serial"] = benchmark::Counter(timeSerial);
state.counters["Time atomic"] = benchmark::Counter(time);
state.counters["Time non-atomic"] = benchmark::Counter(timeNonAtomic);
state.counters["Size of type"] = benchmark::Counter(sizeof(T));
}
template <class T>
void Test(int loop, int test, const char* type_name) {
if (test == -1) {
Loop<T>(loop, 1, type_name);
Loop<T>(loop, 2, type_name);
Loop<T>(loop, 3, type_name);
} else
Loop<T>(loop, test, type_name);
static void Test_Atomic(benchmark::State& state) {
for (auto _ : state) {
Loop<T>(state, 1);
Loop<T>(state, 2);
Loop<T>(state, 3);
}
}
int main(int argc, char* argv[]) {
int type = -1;
int loop = 100000;
int test = -1;
static constexpr int LOOP = 100'000;
for (int i = 0; i < argc; i++) {
if ((strcmp(argv[i], "--test") == 0)) {
test = std::stoi(argv[++i]);
continue;
}
if ((strcmp(argv[i], "--type") == 0)) {
type = std::stoi(argv[++i]);
continue;
}
if ((strcmp(argv[i], "-l") == 0) || (strcmp(argv[i], "--loop") == 0)) {
loop = std::stoi(argv[++i]);
continue;
}
}
Kokkos::initialize(argc, argv);
printf("Using %s\n", Kokkos::atomic_query_version());
bool all_tests = false;
if (type == -1) all_tests = true;
while (type < 100) {
if (type == 1) {
Test<int>(loop, test, "int ");
}
if (type == 2) {
Test<long int>(loop, test, "long int ");
}
if (type == 3) {
Test<long long int>(loop, test, "long long int ");
}
if (type == 4) {
Test<unsigned int>(loop, test, "unsigned int ");
}
if (type == 5) {
Test<unsigned long int>(loop, test, "unsigned long int ");
}
if (type == 6) {
Test<unsigned long long int>(loop, test, "unsigned long long int ");
}
if (type == 10) {
// Test<float>(loop,test,"float ");
}
if (type == 11) {
Test<double>(loop, test, "double ");
}
if (!all_tests)
type = 100;
else
type++;
}
Kokkos::finalize();
}
// Register Test_Atomic once per element type. Each benchmark receives LOOP as
// its single argument and runs a fixed 10 iterations.
// The original list registered Test_Atomic<int> a second time at the end,
// which ran and reported the same benchmark twice under one name; the
// duplicate is dropped here.
BENCHMARK(Test_Atomic<int>)->Arg(LOOP)->Iterations(10);
BENCHMARK(Test_Atomic<long int>)->Arg(LOOP)->Iterations(10);
BENCHMARK(Test_Atomic<long long int>)->Arg(LOOP)->Iterations(10);
BENCHMARK(Test_Atomic<unsigned int>)->Arg(LOOP)->Iterations(10);
BENCHMARK(Test_Atomic<unsigned long int>)->Arg(LOOP)->Iterations(10);
BENCHMARK(Test_Atomic<unsigned long long int>)->Arg(LOOP)->Iterations(10);
BENCHMARK(Test_Atomic<float>)->Arg(LOOP)->Iterations(10);
BENCHMARK(Test_Atomic<double>)->Arg(LOOP)->Iterations(10);

View File

@ -21,240 +21,536 @@
// core/src/libkokkoscore.a -ldl && OMP_NUM_THREADS=1
// ./test_atomic_minmax_simple.x 10000000
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <typeinfo>
#include <benchmark/benchmark.h>
#include "Benchmark_Context.hpp"
#include "PerfTest_Category.hpp"
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>
using exec_space = Kokkos::DefaultExecutionSpace;
constexpr int LENGTH = 1'000'000;
template <typename T>
void test(const int length) {
// Allocates a view labelled "input" with `length` entries in exec_space,
// sets every entry to `value`, fences, and returns the view.
Kokkos::View<T*, exec_space> prepare_input(const int length, const T value) {
  using view_type = Kokkos::View<T*, exec_space>;

  view_type filled("input", length);
  Kokkos::parallel_for(
      length, KOKKOS_LAMBDA(const int idx) { filled(idx) = value; });
  // Make sure the fill has completed before handing the view to the caller.
  Kokkos::fence();
  return filled;
}
// Picks the problem length for a benchmark run: when exactly two command-line
// arguments were recorded, the second one (index 1) is parsed as an integer
// and used; otherwise the benchmark's registered range(0) is used.
int get_length(benchmark::State& state) {
  const bool length_on_command_line = (Test::command_line_num_args() == 2);
  if (length_on_command_line) {
    return std::stoi(Test::command_line_arg(1));
  }
  return state.range(0);
}
// Counts the entries of `view` that differ from their own index converted to
// T, i.e. entries i where view(i) != T(i). Returns that count after fencing.
template <typename T>
int check_errors_replacement(Kokkos::View<T*, exec_space> view) {
  int mismatches = 0;
  Kokkos::parallel_reduce(
      view.size(),
      KOKKOS_LAMBDA(const int idx, int& local_count) {
        local_count += (view(idx) != (T)idx);
      },
      mismatches);
  Kokkos::fence();
  return mismatches;
}
// Applies Kokkos::atomic_fetch_min(&input(i), T(i)) to every entry of `input`
// and returns the wall-clock seconds the kernel took, measured up to and
// including the closing fence.
template <typename T>
double atomic_min_replacement(Kokkos::View<T*, exec_space> input) {
  const int count = input.size();

  Kokkos::Timer stopwatch;
  Kokkos::parallel_for(
      count, KOKKOS_LAMBDA(const int idx) {
        // The previous value returned by the atomic is deliberately ignored.
        (void)Kokkos::atomic_fetch_min(&(input(idx)), (T)idx);
      });
  Kokkos::fence();
  const double elapsed = stopwatch.seconds();
  return elapsed;
}
using vector = Kokkos::View<T*, exec_space>;
template <typename T>
static void Atomic_MinReplacements(benchmark::State& state) {
const int length = get_length(state);
auto inp = prepare_input(length, std::numeric_limits<T>::max());
vector inp("input", length);
T max = std::numeric_limits<T>::max();
T min = std::numeric_limits<T>::lowest();
for (auto _ : state) {
const auto time = atomic_min_replacement(inp);
const auto errors = check_errors_replacement(inp);
// input is max values - all min atomics will replace
{
Kokkos::parallel_for(
length, KOKKOS_LAMBDA(const int i) { inp(i) = max; });
Kokkos::fence();
timer.reset();
Kokkos::parallel_for(
length, KOKKOS_LAMBDA(const int i) {
(void)Kokkos::atomic_fetch_min(&(inp(i)), (T)i);
});
Kokkos::fence();
double time = timer.seconds();
int errors(0);
Kokkos::parallel_reduce(
length,
KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); },
errors);
Kokkos::fence();
if (errors) {
std::cerr << "Error in 100% min replacements: " << errors << std::endl;
std::cerr << "inp(0)=" << inp(0) << std::endl;
// report results
state.SetIterationTime(time);
if (errors > 0) {
state.counters["Errors"] = benchmark::Counter(errors);
}
std::cout << "Time for 100% min replacements: " << time << std::endl;
}
// input is min values - all max atomics will replace
{
Kokkos::parallel_for(
length, KOKKOS_LAMBDA(const int i) { inp(i) = min; });
Kokkos::fence();
timer.reset();
Kokkos::parallel_for(
length, KOKKOS_LAMBDA(const int i) {
(void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i);
});
Kokkos::fence();
double time = timer.seconds();
int errors(0);
Kokkos::parallel_reduce(
length,
KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); },
errors);
Kokkos::fence();
if (errors) {
std::cerr << "Error in 100% max replacements: " << errors << std::endl;
std::cerr << "inp(0)=" << inp(0) << std::endl;
}
std::cout << "Time for 100% max replacements: " << time << std::endl;
}
// input is max values - all max atomics will early exit
{
Kokkos::parallel_for(
length, KOKKOS_LAMBDA(const int i) { inp(i) = max; });
Kokkos::fence();
timer.reset();
Kokkos::parallel_for(
length, KOKKOS_LAMBDA(const int i) {
(void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i);
});
Kokkos::fence();
double time = timer.seconds();
int errors(0);
Kokkos::parallel_reduce(
length,
KOKKOS_LAMBDA(const int i, int& inner) {
T ref = max;
inner += (inp(i) != ref);
},
errors);
Kokkos::fence();
if (errors) {
std::cerr << "Error in 100% max early exits: " << errors << std::endl;
std::cerr << "inp(0)=" << inp(0) << std::endl;
}
std::cout << "Time for 100% max early exits: " << time << std::endl;
}
// input is min values - all min atomics will early exit
{
Kokkos::parallel_for(
length, KOKKOS_LAMBDA(const int i) { inp(i) = min; });
Kokkos::fence();
timer.reset();
Kokkos::parallel_for(
length, KOKKOS_LAMBDA(const int i) {
(void)Kokkos::atomic_min_fetch(&(inp(i)), (T)i);
});
Kokkos::fence();
double time = timer.seconds();
int errors(0);
Kokkos::parallel_reduce(
length,
KOKKOS_LAMBDA(const int i, int& inner) {
T ref = min;
inner += (inp(i) != ref);
},
errors);
Kokkos::fence();
if (errors) {
std::cerr << "Error in 100% min early exits: " << errors << std::endl;
std::cerr << "inp(0)=" << inp(0) << std::endl;
if (length > 9) std::cout << "inp(9)=" << inp(9) << std::endl;
}
std::cout << "Time for 100% min early exits: " << time << std::endl;
}
// limit iterations for contentious test, takes ~50x longer for same length
auto con_length = length / 5;
// input is min values - some max atomics will replace
{
Kokkos::parallel_for(
1, KOKKOS_LAMBDA(const int i) { inp(i) = min; });
Kokkos::fence();
T current(0);
timer.reset();
Kokkos::parallel_reduce(
con_length,
KOKKOS_LAMBDA(const int i, T& inner) {
inner = Kokkos::atomic_max_fetch(&(inp(0)), inner + 1);
if (i == con_length - 1) {
Kokkos::atomic_max_fetch(&(inp(0)), max);
inner = max;
}
},
Kokkos::Max<T>(current));
Kokkos::fence();
double time = timer.seconds();
if (current < max) {
std::cerr << "Error in contentious max replacements: " << std::endl;
std::cerr << "final=" << current << " inp(0)=" << inp(0) << " max=" << max
<< std::endl;
}
std::cout << "Time for contentious max " << con_length
<< " replacements: " << time << std::endl;
}
// input is max values - some min atomics will replace
{
Kokkos::parallel_for(
1, KOKKOS_LAMBDA(const int i) { inp(i) = max; });
Kokkos::fence();
timer.reset();
T current(100000000);
Kokkos::parallel_reduce(
con_length,
KOKKOS_LAMBDA(const int i, T& inner) {
inner = Kokkos::atomic_min_fetch(&(inp(0)), inner - 1);
if (i == con_length - 1) {
Kokkos::atomic_min_fetch(&(inp(0)), min);
inner = min;
}
},
Kokkos::Min<T>(current));
Kokkos::fence();
double time = timer.seconds();
if (current > min) {
std::cerr << "Error in contentious min replacements: " << std::endl;
std::cerr << "final=" << current << " inp(0)=" << inp(0) << " min=" << min
<< std::endl;
}
std::cout << "Time for contentious min " << con_length
<< " replacements: " << time << std::endl;
}
}
int main(int argc, char* argv[]) {
Kokkos::initialize(argc, argv);
{
int length = 1000000;
if (argc == 2) {
length = std::stoi(argv[1]);
}
if (length < 1) {
throw std::invalid_argument("");
}
std::cout << "================ int" << std::endl;
test<int>(length);
std::cout << "================ long" << std::endl;
test<long>(length);
std::cout << "================ long long" << std::endl;
test<long long>(length);
std::cout << "================ unsigned int" << std::endl;
test<unsigned int>(length);
std::cout << "================ unsigned long" << std::endl;
test<unsigned long>(length);
std::cout << "================ unsigned long long" << std::endl;
test<unsigned long long>(length);
std::cout << "================ float" << std::endl;
test<float>(length);
std::cout << "================ double" << std::endl;
test<double>(length);
}
Kokkos::finalize();
return 0;
// Applies Kokkos::atomic_max_fetch(&input(i), T(i)) to every entry of `input`
// and returns the wall-clock seconds the kernel took, measured up to and
// including the closing fence.
template <typename T>
double atomic_max_replacement(Kokkos::View<T*, exec_space> input) {
  const int count = input.size();

  Kokkos::Timer stopwatch;
  Kokkos::parallel_for(
      count, KOKKOS_LAMBDA(const int idx) {
        // The value returned by the atomic is deliberately ignored.
        (void)Kokkos::atomic_max_fetch(&(input(idx)), (T)idx);
      });
  Kokkos::fence();
  const double elapsed = stopwatch.seconds();
  return elapsed;
}
// Benchmark: atomic max applied to a view that starts at
// numeric_limits<T>::lowest(). After each kernel the view is checked against
// view(i) == T(i); a non-zero mismatch count is published as the "Errors"
// counter. The kernel time is reported as the manual iteration time.
template <typename T>
static void Atomic_MaxReplacements(benchmark::State& state) {
  const auto length = get_length(state);
  auto data = prepare_input(length, std::numeric_limits<T>::lowest());

  // NOTE(review): `data` is initialised once, before the loop, so iterations
  // after the first run on values already written by the first — confirm
  // that this is the intended measurement.
  for (auto _ : state) {
    const auto seconds    = atomic_max_replacement(data);
    const auto mismatches = check_errors_replacement(data);

    // report results
    state.SetIterationTime(seconds);
    if (mismatches > 0) {
      state.counters["Errors"] = benchmark::Counter(mismatches);
    }
  }
}
// Counts the entries of `view` that differ from the reference value `ref`.
// Returns that count after fencing.
template <typename T>
int check_errors_early_exit(Kokkos::View<T*, exec_space> view, const T ref) {
  int mismatches = 0;
  Kokkos::parallel_reduce(
      view.size(),
      KOKKOS_LAMBDA(const int idx, int& local_count) {
        local_count += (view(idx) != ref);
      },
      mismatches);
  Kokkos::fence();
  return mismatches;
}
// Benchmark: atomic max applied to a view that starts at
// numeric_limits<T>::max(). After each kernel every entry is expected to
// still equal that maximum; a non-zero mismatch count is published as the
// "Errors" counter. The kernel time is reported as the manual iteration time.
template <typename T>
static void Atomic_MaxEarlyExits(benchmark::State& state) {
  const auto length = get_length(state);
  const T expected  = std::numeric_limits<T>::max();
  auto data         = prepare_input(length, expected);

  for (auto _ : state) {
    const auto seconds    = atomic_max_replacement(data);
    const auto mismatches = check_errors_early_exit(data, expected);

    // report results
    state.SetIterationTime(seconds);
    if (mismatches > 0) {
      state.counters["Errors"] = benchmark::Counter(mismatches);
    }
  }
}
// Benchmark: atomic min applied to a view that starts at
// numeric_limits<T>::lowest(). After each kernel every entry is expected to
// still equal that lowest value; a non-zero mismatch count is published as
// the "Errors" counter. The kernel time is reported as the manual iteration
// time.
template <typename T>
static void Atomic_MinEarlyExits(benchmark::State& state) {
  const auto length = get_length(state);
  const T expected  = std::numeric_limits<T>::lowest();
  auto data         = prepare_input(length, expected);

  for (auto _ : state) {
    const auto seconds    = atomic_min_replacement(data);
    const auto mismatches = check_errors_early_exit(data, expected);

    // report results
    state.SetIterationTime(seconds);
    if (mismatches > 0) {
      state.counters["Errors"] = benchmark::Counter(mismatches);
    }
  }
}
// Publishes the diagnostics of a failed contentious-replacement run as
// benchmark counters: "Errors" is set to 1, and "Final", "First" and
// "Expected" carry the values passed in by the caller.
template <typename T>
void report_errors_contentious_replacement(benchmark::State& state,
                                           const T final, const T first,
                                           const T expected) {
  auto& counters = state.counters;

  counters["Errors"]   = benchmark::Counter(1);
  counters["Final"]    = benchmark::Counter(final);
  counters["First"]    = benchmark::Counter(first);
  counters["Expected"] = benchmark::Counter(expected);
}
// Runs `con_length` iterations that all apply atomic max to the single entry
// input(0), reducing the observed values with Kokkos::Max. The last iteration
// index additionally stores numeric_limits<T>::max(). If the reduced result
// ends up below that maximum, the failure is reported through
// report_errors_contentious_replacement. Returns the kernel's wall-clock
// seconds, measured up to and including the fence.
template <typename T>
double atomic_contentious_max_replacement(benchmark::State& state,
                                          Kokkos::View<T*, exec_space> input,
                                          const int con_length) {
  const auto upper = std::numeric_limits<T>::max();
  T reduced        = 0;

  Kokkos::Timer stopwatch;
  Kokkos::parallel_reduce(
      con_length,
      KOKKOS_LAMBDA(const int idx, T& partial) {
        partial = Kokkos::atomic_max_fetch(&(input(0)), partial + 1);
        if (idx == con_length - 1) {
          Kokkos::atomic_max_fetch(&(input(0)), upper);
          partial = upper;
        }
      },
      Kokkos::Max<T>(reduced));
  Kokkos::fence();
  const auto elapsed = stopwatch.seconds();

  if (reduced < upper) {
    report_errors_contentious_replacement(state, reduced, input(0), upper);
  }
  return elapsed;
}
// Benchmark: contended atomic max on a single-entry view that starts at
// numeric_limits<T>::lowest(). The helper's measured kernel time is reported
// as the manual iteration time.
template <typename T>
static void Atomic_ContentiousMaxReplacements(benchmark::State& state) {
  const auto con_length = get_length(state);
  auto target = prepare_input(1, std::numeric_limits<T>::lowest());

  for (auto _ : state) {
    const auto seconds =
        atomic_contentious_max_replacement(state, target, con_length);
    state.SetIterationTime(seconds);
  }
}
// Runs `con_length` iterations that all apply atomic min to the single entry
// input(0), reducing the observed values with Kokkos::Min. The last iteration
// index additionally stores numeric_limits<T>::lowest(). If the reduced
// result ends up above that lowest value, the failure is reported through
// report_errors_contentious_replacement. Returns the kernel's wall-clock
// seconds, measured up to and including the fence.
template <typename T>
double atomic_contentious_min_replacement(benchmark::State& state,
                                          Kokkos::View<T*, exec_space> input,
                                          const int con_length) {
  const auto lower = std::numeric_limits<T>::lowest();
  T reduced        = 0;

  Kokkos::Timer stopwatch;
  Kokkos::parallel_reduce(
      con_length,
      KOKKOS_LAMBDA(const int idx, T& partial) {
        partial = Kokkos::atomic_min_fetch(&(input(0)), partial - 1);
        if (idx == con_length - 1) {
          Kokkos::atomic_min_fetch(&(input(0)), lower);
          partial = lower;
        }
      },
      Kokkos::Min<T>(reduced));
  Kokkos::fence();
  const auto elapsed = stopwatch.seconds();

  if (reduced > lower) {
    report_errors_contentious_replacement(state, reduced, input(0), lower);
  }
  return elapsed;
}
// Benchmark: contended atomic min on a single-entry view that starts at
// numeric_limits<T>::max(). The helper's measured kernel time is reported as
// the manual iteration time.
//
// Fix: the loop previously called atomic_contentious_max_replacement, so this
// benchmark timed and validated the max kernel and the min helper was never
// exercised. It now calls atomic_contentious_min_replacement.
template <typename T>
static void Atomic_ContentiousMinReplacements(benchmark::State& state) {
  const auto length = get_length(state);
  auto inp          = prepare_input(1, std::numeric_limits<T>::max());

  for (auto _ : state) {
    const auto time = atomic_contentious_min_replacement(state, inp, length);
    state.SetIterationTime(time);
  }
}
// int
BENCHMARK(Atomic_MinReplacements<int>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxReplacements<int>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxEarlyExits<int>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MinEarlyExits<int>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMaxReplacements<int>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMinReplacements<int>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
///////////////////////////////////////////////////////////////////////
// long
BENCHMARK(Atomic_MinReplacements<long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxReplacements<long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxEarlyExits<long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MinEarlyExits<long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMaxReplacements<long>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMinReplacements<long>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
///////////////////////////////////////////////////////////////////////
// long long
BENCHMARK(Atomic_MinReplacements<long long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxReplacements<long long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxEarlyExits<long long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MinEarlyExits<long long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMaxReplacements<long long>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMinReplacements<long long>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
///////////////////////////////////////////////////////////////////////
// unsigned int
BENCHMARK(Atomic_MinReplacements<unsigned int>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxReplacements<unsigned int>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxEarlyExits<unsigned int>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MinEarlyExits<unsigned int>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMaxReplacements<unsigned int>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMinReplacements<unsigned int>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
///////////////////////////////////////////////////////////////////////
// unsigned long
BENCHMARK(Atomic_MinReplacements<unsigned long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxReplacements<unsigned long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxEarlyExits<unsigned long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MinEarlyExits<unsigned long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMaxReplacements<unsigned long>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMinReplacements<unsigned long>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
///////////////////////////////////////////////////////////////////////
// unsigned long long
// Registers the six atomic min/max benchmark cases for
// `unsigned long long`: one "Length" argument, manual timing, 10 fixed
// iterations; LENGTH for the Replacements/EarlyExits cases and LENGTH / 5
// for the `Contentious` cases.
BENCHMARK(Atomic_MinReplacements<unsigned long long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxReplacements<unsigned long long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxEarlyExits<unsigned long long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MinEarlyExits<unsigned long long>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
// The contentious variants run with a fifth of the regular length.
BENCHMARK(Atomic_ContentiousMaxReplacements<unsigned long long>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMinReplacements<unsigned long long>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
///////////////////////////////////////////////////////////////////////
// float
// Registers the six atomic min/max benchmark cases for `float`, using the
// same settings as the integer types: one "Length" argument, manual timing,
// 10 fixed iterations; LENGTH for the Replacements/EarlyExits cases and
// LENGTH / 5 for the `Contentious` cases.
BENCHMARK(Atomic_MinReplacements<float>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxReplacements<float>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxEarlyExits<float>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MinEarlyExits<float>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
// The contentious variants run with a fifth of the regular length.
BENCHMARK(Atomic_ContentiousMaxReplacements<float>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMinReplacements<float>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
///////////////////////////////////////////////////////////////////////
// double
// Registers the six atomic min/max benchmark cases for `double`: one
// "Length" argument, manual timing, 10 fixed iterations; LENGTH for the
// Replacements/EarlyExits cases and LENGTH / 5 for the `Contentious` cases.
BENCHMARK(Atomic_MinReplacements<double>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxReplacements<double>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MaxEarlyExits<double>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_MinEarlyExits<double>)
->ArgName("Length")
->Arg(LENGTH)
->UseManualTime()
->Iterations(10);
// The contentious variants run with a fifth of the regular length.
BENCHMARK(Atomic_ContentiousMaxReplacements<double>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);
BENCHMARK(Atomic_ContentiousMinReplacements<double>)
->ArgName("Length")
->Arg(LENGTH / 5)
->UseManualTime()
->Iterations(10);

View File

@ -19,9 +19,13 @@
#include <cstdlib>
#include <limits>
#include <benchmark/benchmark.h>
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>
#include "Benchmark_Context.hpp"
#include "PerfTest_Category.hpp"
using ExecSpace = Kokkos::DefaultExecutionSpace;
using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
@ -146,53 +150,8 @@ struct TestFunctor {
}
};
int main(int argc, char* argv[]) {
static const char help_flag[] = "--help";
static const char alloc_size_flag[] = "--alloc_size=";
static const char super_size_flag[] = "--super_size=";
static const char chunk_span_flag[] = "--chunk_span=";
static const char fill_stride_flag[] = "--fill_stride=";
static const char fill_level_flag[] = "--fill_level=";
static const char repeat_outer_flag[] = "--repeat_outer=";
static const char repeat_inner_flag[] = "--repeat_inner=";
long total_alloc_size = 1000000;
int min_superblock_size = 10000;
int chunk_span = 5;
int fill_stride = 1;
int fill_level = 70;
int repeat_outer = 1;
int repeat_inner = 1;
int ask_help = 0;
for (int i = 1; i < argc; i++) {
const char* const a = argv[i];
if (!strncmp(a, help_flag, strlen(help_flag))) ask_help = 1;
if (!strncmp(a, alloc_size_flag, strlen(alloc_size_flag)))
total_alloc_size = atol(a + strlen(alloc_size_flag));
if (!strncmp(a, super_size_flag, strlen(super_size_flag)))
min_superblock_size = std::stoi(a + strlen(super_size_flag));
if (!strncmp(a, fill_stride_flag, strlen(fill_stride_flag)))
fill_stride = std::stoi(a + strlen(fill_stride_flag));
if (!strncmp(a, fill_level_flag, strlen(fill_level_flag)))
fill_level = std::stoi(a + strlen(fill_level_flag));
if (!strncmp(a, chunk_span_flag, strlen(chunk_span_flag)))
chunk_span = std::stoi(a + strlen(chunk_span_flag));
if (!strncmp(a, repeat_outer_flag, strlen(repeat_outer_flag)))
repeat_outer = std::stoi(a + strlen(repeat_outer_flag));
if (!strncmp(a, repeat_inner_flag, strlen(repeat_inner_flag)))
repeat_inner = std::stoi(a + strlen(repeat_inner_flag));
}
int get_number_alloc(int chunk_span, int min_superblock_size,
long total_alloc_size, int fill_level) {
int chunk_span_bytes = 0;
for (int i = 0; i < chunk_span; ++i) {
auto chunk_bytes = TestFunctor::chunk * (1 + i);
@ -212,81 +171,85 @@ int main(int argc, char* argv[]) {
auto bytes_wanted = (actual_total_bytes * fill_level) / 100;
auto chunk_spans = bytes_wanted / chunk_span_bytes;
auto number_alloc = int(chunk_spans * chunk_span);
return number_alloc;
}
if (ask_help) {
std::cout << "command line options:"
<< " " << help_flag << " " << alloc_size_flag << "##"
<< " " << super_size_flag << "##"
<< " " << fill_stride_flag << "##"
<< " " << fill_level_flag << "##"
<< " " << chunk_span_flag << "##"
<< " " << repeat_outer_flag << "##"
<< " " << repeat_inner_flag << "##" << std::endl;
return 0;
/**
 * \brief Return the numeric value of a `<flag><number>` command-line option.
 *
 * Scans the arguments exposed through Test::command_line_num_args() /
 * Test::command_line_arg(), skipping index 0. Every argument that starts
 * with \p flag is parsed as a base-10 integer; if the flag occurs several
 * times the last occurrence wins.
 *
 * The text is parsed with std::stoll and then converted to T, so values that
 * do not fit into an `int` (e.g. a large `--alloc_size=`) are no longer
 * rejected when T is a wider integer type.
 *
 * \param flag           Option prefix including the trailing '=',
 *                       e.g. "--alloc_size=".
 * \param default_value  Value returned when no argument starts with \p flag.
 * \return The parsed value converted to T, or \p default_value.
 * \throws std::invalid_argument / std::out_of_range (from std::stoll) when
 *         the text following the flag is not a representable number.
 */
template <class T>
T get_parameter(const char flag[], T default_value) {
  const auto argc     = Test::command_line_num_args();
  const auto flag_len = strlen(flag);  // loop invariant, computed once
  auto value          = default_value;

  for (int i = 1; i < argc; i++) {
    const char* const a = Test::command_line_arg(i);
    if (!strncmp(a, flag, flag_len)) {
      value = static_cast<T>(std::stoll(a + flag_len));
    }
  }

  return value;
}
double sum_fill_time = 0;
double sum_cycle_time = 0;
double sum_both_time = 0;
double min_fill_time = std::numeric_limits<double>::max();
double min_cycle_time = std::numeric_limits<double>::max();
double min_both_time = std::numeric_limits<double>::max();
// one alloc in fill, alloc/dealloc pair in repeat_inner
for (int i = 0; i < repeat_outer; ++i) {
/**
 * \brief Benchmark that times TestFunctor::test_fill().
 *
 * The six benchmark arguments (state.range(0..5)) supply the defaults for
 * total_alloc_size, min_superblock_size, chunk_span, fill_stride, fill_level
 * and repeat_inner; each can be overridden by the matching `--name=` flag
 * through get_parameter().
 *
 * Timing is reported manually: the timer is created after the functor has
 * been constructed, and only the elapsed time up to the end of test_fill()
 * is passed to SetIterationTime(). The run is aborted through Kokkos::abort
 * if test_fill() reports failure.
 */
static void Mempool_Fill(benchmark::State& state) {
  long total_alloc_size =
      get_parameter("--alloc_size=", static_cast<long>(state.range(0)));
  int min_superblock_size = get_parameter("--super_size=", state.range(1));
  int chunk_span          = get_parameter("--chunk_span=", state.range(2));
  int fill_stride         = get_parameter("--fill_stride=", state.range(3));
  int fill_level          = get_parameter("--fill_level=", state.range(4));
  int repeat_inner        = get_parameter("--repeat_inner=", state.range(5));
  int number_alloc        = get_number_alloc(chunk_span, min_superblock_size,
                                             total_alloc_size, fill_level);

  for (auto _ : state) {
    TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc,
                        fill_stride, chunk_span, repeat_inner);
    Kokkos::Timer timer;

    if (!functor.test_fill()) {
      Kokkos::abort("fill ");
    }

    // Single timer read; the result is the manual time of this iteration.
    state.SetIterationTime(timer.seconds());
    // number_alloc fill operations per iteration, reported as a rate.
    state.counters[KokkosBenchmark::benchmark_fom("fill ops per second")] =
        benchmark::Counter(number_alloc,
                           benchmark::Counter::kIsIterationInvariantRate);
  }
}
/**
 * \brief Benchmark that times TestFunctor::test_alloc_dealloc().
 *
 * The six benchmark arguments (state.range(0..5)) supply the defaults for
 * total_alloc_size, min_superblock_size, chunk_span, fill_stride, fill_level
 * and repeat_inner; each can be overridden by the matching `--name=` flag
 * through get_parameter().
 *
 * Timing is reported manually via SetIterationTime(); the timer is created
 * after the functor has been constructed. The run is aborted through
 * Kokkos::abort if test_alloc_dealloc() reports failure.
 *
 * NOTE(review): test_alloc_dealloc() runs on a freshly constructed functor
 * without a preceding test_fill() -- confirm that this is the intended
 * starting state for the measurement.
 */
static void Mempool_Alloc_Dealloc(benchmark::State& state) {
  long total_alloc_size =
      get_parameter("--alloc_size=", static_cast<long>(state.range(0)));
  int min_superblock_size = get_parameter("--super_size=", state.range(1));
  int chunk_span          = get_parameter("--chunk_span=", state.range(2));
  int fill_stride         = get_parameter("--fill_stride=", state.range(3));
  int fill_level          = get_parameter("--fill_level=", state.range(4));
  int repeat_inner        = get_parameter("--repeat_inner=", state.range(5));
  int number_alloc        = get_number_alloc(chunk_span, min_superblock_size,
                                             total_alloc_size, fill_level);

  for (auto _ : state) {
    TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc,
                        fill_stride, chunk_span, repeat_inner);
    Kokkos::Timer timer;

    if (!functor.test_alloc_dealloc()) {
      Kokkos::abort("alloc/dealloc ");
    }

    state.SetIterationTime(timer.seconds());
    // 2 * number_alloc * repeat_inner operations per iteration, reported as
    // a rate (presumably one allocation plus one deallocation per slot and
    // inner repetition).
    state.counters[KokkosBenchmark::benchmark_fom("cycle ops per second")] =
        benchmark::Counter(2 * number_alloc * repeat_inner,
                           benchmark::Counter::kIsIterationInvariantRate);
  }
}
// Names of the six benchmark arguments shared by both mempool benchmarks.
// Entry k labels state.range(k) as read by Mempool_Fill and
// Mempool_Alloc_Dealloc.
// NOTE(review): the BENCHMARK registrations below consume these vectors,
// presumably during static initialization as well, so keep both definitions
// ahead of the registrations in this translation unit.
const std::vector<std::string> ARG_NAMES = {
"total_alloc_size", "min_superblock_size", "chunk_span",
"fill_stride", "fill_level", "repeat_inner"};
// Default value for each argument, in the same order as ARG_NAMES.
const std::vector<int64_t> ARGS = {1'000'000, 10'000, 5, 1, 70, 1};
// Both benchmarks use the defaults above and report manually measured
// iteration times (UseManualTime).
BENCHMARK(Mempool_Fill)->ArgNames(ARG_NAMES)->Args(ARGS)->UseManualTime();
BENCHMARK(Mempool_Alloc_Dealloc)
->ArgNames(ARG_NAMES)
->Args(ARGS)
->UseManualTime();