Update Kokkos library in LAMMPS to v4.0
This commit is contained in:
218
lib/kokkos/core/unit_test/TestSharedSpace.cpp
Normal file
218
lib/kokkos/core/unit_test/TestSharedSpace.cpp
Normal file
@ -0,0 +1,218 @@
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 4.0
|
||||
// Copyright (2022) National Technology & Engineering
|
||||
// Solutions of Sandia, LLC (NTESS).
|
||||
//
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://kokkos.org/LICENSE for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//@HEADER
|
||||
#include <gtest/gtest.h>
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <windows.h>
|
||||
unsigned getBytesPerPage() {
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si);
|
||||
return si.dwPageSize;
|
||||
}
|
||||
|
||||
#else // unix/posix system
|
||||
#include <unistd.h>
|
||||
unsigned getBytesPerPage() { return sysconf(_SC_PAGESIZE); }
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <iostream>
|
||||
|
||||
namespace {
|
||||
void printTimings(std::ostream& out, std::vector<uint64_t> const& tr,
|
||||
uint64_t threshold = (std::numeric_limits<uint64_t>::max)()) {
|
||||
out << "TimingResult contains " << tr.size() << " results:\n";
|
||||
for (auto it = tr.begin(); it != tr.end(); ++it) {
|
||||
out << "Duration of loop " << it - tr.begin() << " is " << *it
|
||||
<< " clock cycles. ";
|
||||
if ((*it) > threshold) out << "Migration assumed.";
|
||||
|
||||
out << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T computeMean(std::vector<T> const& results) {
|
||||
return std::accumulate(results.begin(), results.end(), T{}) / results.size();
|
||||
}
|
||||
|
||||
template <typename ViewType>
|
||||
class IncrementFunctor {
|
||||
private:
|
||||
using index_type = decltype(std::declval<ViewType>().size());
|
||||
ViewType view_;
|
||||
|
||||
public:
|
||||
IncrementFunctor() = delete;
|
||||
|
||||
explicit IncrementFunctor(ViewType view) : view_(view) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const index_type idx, uint64_t& clockTics) const {
|
||||
uint64_t start = Kokkos::Impl::clock_tic();
|
||||
++view_(idx);
|
||||
clockTics += Kokkos::Impl::clock_tic() - start;
|
||||
}
|
||||
};
|
||||
|
||||
// TIMING CAPTURED KERNEL
|
||||
// PREMISE: This kernel should always be memory bound, as we are measuring
|
||||
// memory access times. The compute load of an increment is small enough on
|
||||
// current hardware but this could be different for new hardware. As we count
|
||||
// the clocks in the kernel, the core frequency of the device has to be fast
|
||||
// enough to guarante that the kernel stays memory bound.
|
||||
template <typename ExecSpace, typename ViewType>
|
||||
std::vector<uint64_t> incrementInLoop(ViewType& view,
|
||||
unsigned int numRepetitions) {
|
||||
using index_type = decltype(view.size());
|
||||
std::vector<uint64_t> results;
|
||||
|
||||
Kokkos::fence();
|
||||
for (unsigned i = 0; i < numRepetitions; ++i) {
|
||||
uint64_t sum_clockTics;
|
||||
IncrementFunctor<ViewType> func(view);
|
||||
Kokkos::parallel_reduce(
|
||||
"increment",
|
||||
Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<index_type>>{
|
||||
0, view.size()},
|
||||
func, sum_clockTics);
|
||||
Kokkos::fence();
|
||||
results.push_back(sum_clockTics / view.size());
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
TEST(defaultdevicetype, shared_space) {
|
||||
ASSERT_TRUE(Kokkos::has_shared_space);
|
||||
|
||||
if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace,
|
||||
Kokkos::DefaultHostExecutionSpace>)
|
||||
GTEST_SKIP() << "Skipping as host and device are the same space";
|
||||
|
||||
#if defined(KOKKOS_ARCH_VEGA906) || defined(KOKKOS_ARCH_VEGA908) || \
|
||||
defined(KOKKOS_ARCH_NAVI)
|
||||
GTEST_SKIP()
|
||||
<< "skipping because specified arch does not support page migration";
|
||||
#endif
|
||||
#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU)
|
||||
GTEST_SKIP()
|
||||
<< "skipping because clock_tic is only defined for sycl+intel gpu";
|
||||
#endif
|
||||
|
||||
const unsigned int numRepetitions = 10;
|
||||
const unsigned int numDeviceHostCycles = 3;
|
||||
double threshold = 1.5;
|
||||
unsigned int numPages = 100;
|
||||
size_t numBytes = numPages * getBytesPerPage();
|
||||
|
||||
using DeviceExecutionSpace = Kokkos::DefaultExecutionSpace;
|
||||
using HostExecutionSpace = Kokkos::DefaultHostExecutionSpace;
|
||||
|
||||
// ALLOCATION
|
||||
Kokkos::View<int*, Kokkos::SharedSpace> sharedData("sharedData",
|
||||
numBytes / sizeof(int));
|
||||
Kokkos::View<int*, DeviceExecutionSpace::memory_space> deviceData(
|
||||
"deviceData", numBytes / sizeof(int));
|
||||
Kokkos::View<int*, HostExecutionSpace::memory_space> hostData(
|
||||
"hostData", numBytes / sizeof(int));
|
||||
Kokkos::fence();
|
||||
|
||||
// GET DEFAULT EXECSPACE LOCAL TIMINGS
|
||||
auto deviceLocalTimings =
|
||||
incrementInLoop<DeviceExecutionSpace>(deviceData, numRepetitions);
|
||||
|
||||
// GET DEFAULT HOSTEXECSPACE LOCAL TIMINGS
|
||||
auto hostLocalTimings =
|
||||
incrementInLoop<HostExecutionSpace>(hostData, numRepetitions);
|
||||
|
||||
// GET PAGE MIGRATING TIMINGS DATA
|
||||
std::vector<decltype(deviceLocalTimings)> deviceSharedTimings{};
|
||||
std::vector<decltype(hostLocalTimings)> hostSharedTimings{};
|
||||
for (unsigned i = 0; i < numDeviceHostCycles; ++i) {
|
||||
// GET RESULTS DEVICE
|
||||
deviceSharedTimings.push_back(
|
||||
incrementInLoop<DeviceExecutionSpace>(sharedData, numRepetitions));
|
||||
|
||||
// GET RESULTS HOST
|
||||
hostSharedTimings.push_back(
|
||||
incrementInLoop<HostExecutionSpace>(sharedData, numRepetitions));
|
||||
}
|
||||
|
||||
// COMPUTE STATISTICS OF HOST AND DEVICE LOCAL KERNELS
|
||||
auto deviceLocalMean = computeMean(deviceLocalTimings);
|
||||
auto hostLocalMean = computeMean(hostLocalTimings);
|
||||
|
||||
// ASSESS RESULTS
|
||||
bool fastAsLocalOnRepeatedAccess = true;
|
||||
|
||||
for (unsigned cycle = 0; cycle < numDeviceHostCycles; ++cycle) {
|
||||
std::for_each(std::next(deviceSharedTimings[cycle].begin()),
|
||||
deviceSharedTimings[cycle].end(), [&](const uint64_t timing) {
|
||||
(timing < threshold * deviceLocalMean)
|
||||
? fastAsLocalOnRepeatedAccess &= true
|
||||
: fastAsLocalOnRepeatedAccess &= false;
|
||||
});
|
||||
|
||||
std::for_each(std::next(hostSharedTimings[cycle].begin()),
|
||||
hostSharedTimings[cycle].end(), [&](const uint64_t timing) {
|
||||
(timing < threshold * hostLocalMean)
|
||||
? fastAsLocalOnRepeatedAccess &= true
|
||||
: fastAsLocalOnRepeatedAccess &= false;
|
||||
});
|
||||
}
|
||||
|
||||
// CHECK IF PASSED
|
||||
bool passed = (fastAsLocalOnRepeatedAccess);
|
||||
|
||||
// PRINT IF NOT PASSED
|
||||
if (!passed) {
|
||||
std::cout << "Page size as reported by os: " << getBytesPerPage()
|
||||
<< " bytes \n";
|
||||
std::cout << "Allocating " << numPages
|
||||
<< " pages of memory in SharedSpace.\n";
|
||||
|
||||
std::cout << "Behavior found: \n";
|
||||
std::cout << "SharedSpace is as fast as local space on repeated access: "
|
||||
<< fastAsLocalOnRepeatedAccess << ", we expect true \n\n";
|
||||
|
||||
std::cout
|
||||
<< "Please look at the following timings. The first access in a "
|
||||
"different ExecutionSpace is not evaluated for the test. As we "
|
||||
"expect the memory to migrate during the first access it might have "
|
||||
"a higher cycle count than subsequent accesses, depending on your "
|
||||
"hardware. If the cycles are more than "
|
||||
<< threshold
|
||||
<< " times the cycles for pure local memory access, we assume a page "
|
||||
"migration happened.\n\n";
|
||||
|
||||
std::cout << "################SHARED SPACE####################\n";
|
||||
for (unsigned cycle = 0; cycle < numDeviceHostCycles; ++cycle) {
|
||||
std::cout << "DeviceExecutionSpace timings of run " << cycle << ":\n";
|
||||
printTimings(std::cout, deviceSharedTimings[cycle],
|
||||
threshold * deviceLocalMean);
|
||||
std::cout << "HostExecutionSpace timings of run " << cycle << ":\n";
|
||||
printTimings(std::cout, hostSharedTimings[cycle],
|
||||
threshold * hostLocalMean);
|
||||
}
|
||||
std::cout << "################LOCAL SPACE####################\n";
|
||||
printTimings(std::cout, deviceLocalTimings);
|
||||
printTimings(std::cout, hostLocalTimings);
|
||||
}
|
||||
ASSERT_TRUE(passed);
|
||||
}
|
||||
} // namespace
|
||||
Reference in New Issue
Block a user