Update Kokkos library in LAMMPS to v3.2
This commit is contained in:
@ -69,13 +69,13 @@ int main(int argc, char* argv[]) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int L = atoi(argv[1]);
|
||||
int N = atoi(argv[2]);
|
||||
int M = atoi(argv[3]);
|
||||
int D = atoi(argv[4]);
|
||||
int K = atoi(argv[5]);
|
||||
int R = atoi(argv[6]);
|
||||
int type = atoi(argv[7]);
|
||||
int L = std::stoi(argv[1]);
|
||||
int N = std::stoi(argv[2]);
|
||||
int M = std::stoi(argv[3]);
|
||||
int D = std::stoi(argv[4]);
|
||||
int K = std::stoi(argv[5]);
|
||||
int R = std::stoi(argv[6]);
|
||||
int type = std::stoi(argv[7]);
|
||||
|
||||
Kokkos::View<int*> offsets("Offsets", L, M);
|
||||
Kokkos::Random_XorShift64_Pool<> pool(12371);
|
||||
|
||||
@ -73,15 +73,15 @@ int main(int argc, char* argv[]) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int P = atoi(argv[1]);
|
||||
int N = atoi(argv[2]);
|
||||
int K = atoi(argv[3]);
|
||||
int R = atoi(argv[4]);
|
||||
int D = atoi(argv[5]);
|
||||
int U = atoi(argv[6]);
|
||||
int F = atoi(argv[7]);
|
||||
int T = atoi(argv[8]);
|
||||
int S = atoi(argv[9]);
|
||||
int P = std::stoi(argv[1]);
|
||||
int N = std::stoi(argv[2]);
|
||||
int K = std::stoi(argv[3]);
|
||||
int R = std::stoi(argv[4]);
|
||||
int D = std::stoi(argv[5]);
|
||||
int U = std::stoi(argv[6]);
|
||||
int F = std::stoi(argv[7]);
|
||||
int T = std::stoi(argv[8]);
|
||||
int S = std::stoi(argv[9]);
|
||||
|
||||
if (U > 8) {
|
||||
printf("U must be 1-8\n");
|
||||
|
||||
@ -72,13 +72,13 @@ int main(int argc, char* argv[]) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int S = atoi(argv[1]);
|
||||
int N = atoi(argv[2]);
|
||||
int K = atoi(argv[3]);
|
||||
int D = atoi(argv[4]);
|
||||
int R = atoi(argv[5]);
|
||||
int U = atoi(argv[6]);
|
||||
int F = atoi(argv[7]);
|
||||
int S = std::stoi(argv[1]);
|
||||
int N = std::stoi(argv[2]);
|
||||
int K = std::stoi(argv[3]);
|
||||
int D = std::stoi(argv[4]);
|
||||
int R = std::stoi(argv[5]);
|
||||
int U = std::stoi(argv[6]);
|
||||
int F = std::stoi(argv[7]);
|
||||
|
||||
if ((S != 1) && (S != 2) && (S != 4)) {
|
||||
printf("S must be one of 1,2,4\n");
|
||||
|
||||
@ -50,151 +50,152 @@
|
||||
#define HLINE "-------------------------------------------------------------\n"
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
typedef Kokkos::View<int64_t*, Kokkos::CudaSpace>::HostMirror GUPSHostArray;
|
||||
typedef Kokkos::View<int64_t*, Kokkos::CudaSpace> GUPSDeviceArray;
|
||||
using GUPSHostArray = Kokkos::View<int64_t*, Kokkos::CudaSpace>::HostMirror;
|
||||
using GUPSDeviceArray = Kokkos::View<int64_t*, Kokkos::CudaSpace>;
|
||||
#else
|
||||
typedef Kokkos::View<int64_t*, Kokkos::HostSpace>::HostMirror GUPSHostArray;
|
||||
typedef Kokkos::View<int64_t*, Kokkos::HostSpace> GUPSDeviceArray;
|
||||
using GUPSHostArray = Kokkos::View<int64_t*, Kokkos::HostSpace>::HostMirror;
|
||||
using GUPSDeviceArray = Kokkos::View<int64_t*, Kokkos::HostSpace>;
|
||||
#endif
|
||||
|
||||
typedef int GUPSIndex;
|
||||
using GUPSIndex = int;
|
||||
|
||||
double now() {
|
||||
struct timeval now;
|
||||
gettimeofday(&now, nullptr);
|
||||
struct timeval now;
|
||||
gettimeofday(&now, nullptr);
|
||||
|
||||
return (double) now.tv_sec + ((double) now.tv_usec * 1.0e-6);
|
||||
return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6);
|
||||
}
|
||||
|
||||
void randomize_indices(GUPSHostArray& indices, GUPSDeviceArray& dev_indices, const int64_t dataCount) {
|
||||
for( GUPSIndex i = 0; i < indices.extent(0); ++i ) {
|
||||
indices[i] = lrand48() % dataCount;
|
||||
}
|
||||
void randomize_indices(GUPSHostArray& indices, GUPSDeviceArray& dev_indices,
|
||||
const int64_t dataCount) {
|
||||
for (GUPSIndex i = 0; i < indices.extent(0); ++i) {
|
||||
indices[i] = lrand48() % dataCount;
|
||||
}
|
||||
|
||||
Kokkos::deep_copy(dev_indices, indices);
|
||||
Kokkos::deep_copy(dev_indices, indices);
|
||||
}
|
||||
|
||||
void run_gups(GUPSDeviceArray& indices, GUPSDeviceArray& data, const int64_t datum,
|
||||
const bool performAtomics) {
|
||||
void run_gups(GUPSDeviceArray& indices, GUPSDeviceArray& data,
|
||||
const int64_t datum, const bool performAtomics) {
|
||||
if (performAtomics) {
|
||||
Kokkos::parallel_for(
|
||||
"bench-gups-atomic", indices.extent(0),
|
||||
KOKKOS_LAMBDA(const GUPSIndex i) {
|
||||
Kokkos::atomic_fetch_xor(&data[indices[i]], datum);
|
||||
});
|
||||
} else {
|
||||
Kokkos::parallel_for(
|
||||
"bench-gups-non-atomic", indices.extent(0),
|
||||
KOKKOS_LAMBDA(const GUPSIndex i) { data[indices[i]] ^= datum; });
|
||||
}
|
||||
|
||||
if( performAtomics ) {
|
||||
Kokkos::parallel_for("bench-gups-atomic", indices.extent(0), KOKKOS_LAMBDA(const GUPSIndex i) {
|
||||
Kokkos::atomic_fetch_xor( &data[indices[i]], datum );
|
||||
});
|
||||
} else {
|
||||
Kokkos::parallel_for("bench-gups-non-atomic", indices.extent(0), KOKKOS_LAMBDA(const GUPSIndex i) {
|
||||
data[indices[i]] ^= datum;
|
||||
});
|
||||
}
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
int run_benchmark(const GUPSIndex indicesCount, const GUPSIndex dataCount, const int repeats,
|
||||
const bool useAtomics) {
|
||||
int run_benchmark(const GUPSIndex indicesCount, const GUPSIndex dataCount,
|
||||
const int repeats, const bool useAtomics) {
|
||||
printf("Reports fastest timing per kernel\n");
|
||||
printf("Creating Views...\n");
|
||||
|
||||
printf("Reports fastest timing per kernel\n");
|
||||
printf("Creating Views...\n");
|
||||
printf("Memory Sizes:\n");
|
||||
printf("- Elements: %15" PRIu64 " (%12.4f MB)\n",
|
||||
static_cast<uint64_t>(dataCount),
|
||||
1.0e-6 * ((double)dataCount * (double)sizeof(int64_t)));
|
||||
printf("- Indices: %15" PRIu64 " (%12.4f MB)\n",
|
||||
static_cast<uint64_t>(indicesCount),
|
||||
1.0e-6 * ((double)indicesCount * (double)sizeof(int64_t)));
|
||||
printf(" - Atomics: %15s\n", (useAtomics ? "Yes" : "No"));
|
||||
printf("Benchmark kernels will be performed for %d iterations.\n", repeats);
|
||||
|
||||
printf("Memory Sizes:\n");
|
||||
printf("- Elements: %15" PRIu64 " (%12.4f MB)\n", static_cast<uint64_t>(dataCount),
|
||||
1.0e-6 * ((double) dataCount * (double) sizeof(int64_t)));
|
||||
printf("- Indices: %15" PRIu64 " (%12.4f MB)\n", static_cast<uint64_t>(indicesCount),
|
||||
1.0e-6 * ((double) indicesCount * (double) sizeof(int64_t)));
|
||||
printf(" - Atomics: %15s\n", (useAtomics ? "Yes" : "No") );
|
||||
printf("Benchmark kernels will be performed for %d iterations.\n", repeats);
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
GUPSDeviceArray dev_indices("indices", indicesCount);
|
||||
GUPSDeviceArray dev_data("data", dataCount);
|
||||
int64_t datum = -1;
|
||||
|
||||
GUPSDeviceArray dev_indices("indices", indicesCount);
|
||||
GUPSDeviceArray dev_data("data", dataCount);
|
||||
int64_t datum = -1;
|
||||
GUPSHostArray indices = Kokkos::create_mirror_view(dev_indices);
|
||||
GUPSHostArray data = Kokkos::create_mirror_view(dev_data);
|
||||
|
||||
GUPSHostArray indices = Kokkos::create_mirror_view(dev_indices);
|
||||
GUPSHostArray data = Kokkos::create_mirror_view(dev_data);
|
||||
double gupsTime = 0.0;
|
||||
|
||||
double gupsTime = 0.0;
|
||||
|
||||
printf("Initializing Views...\n");
|
||||
printf("Initializing Views...\n");
|
||||
|
||||
#if defined(KOKKOS_HAVE_OPENMP)
|
||||
Kokkos::parallel_for("init-data", Kokkos::RangePolicy<Kokkos::OpenMP>(0, dataCount),
|
||||
Kokkos::parallel_for(
|
||||
"init-data", Kokkos::RangePolicy<Kokkos::OpenMP>(0, dataCount),
|
||||
#else
|
||||
Kokkos::parallel_for("init-data", Kokkos::RangePolicy<Kokkos::Serial>(0, dataCount),
|
||||
Kokkos::parallel_for(
|
||||
"init-data", Kokkos::RangePolicy<Kokkos::Serial>(0, dataCount),
|
||||
#endif
|
||||
KOKKOS_LAMBDA(const int i) {
|
||||
|
||||
data[i] = 10101010101;
|
||||
});
|
||||
KOKKOS_LAMBDA(const int i) { data[i] = 10101010101; });
|
||||
|
||||
#if defined(KOKKOS_HAVE_OPENMP)
|
||||
Kokkos::parallel_for("init-indices", Kokkos::RangePolicy<Kokkos::OpenMP>(0, indicesCount),
|
||||
Kokkos::parallel_for(
|
||||
"init-indices", Kokkos::RangePolicy<Kokkos::OpenMP>(0, indicesCount),
|
||||
#else
|
||||
Kokkos::parallel_for("init-indices", Kokkos::RangePolicy<Kokkos::Serial>(0, indicesCount),
|
||||
Kokkos::parallel_for(
|
||||
"init-indices", Kokkos::RangePolicy<Kokkos::Serial>(0, indicesCount),
|
||||
#endif
|
||||
KOKKOS_LAMBDA(const int i) {
|
||||
KOKKOS_LAMBDA(const int i) { indices[i] = 0; });
|
||||
|
||||
indices[i] = 0;
|
||||
});
|
||||
Kokkos::deep_copy(dev_data, data);
|
||||
Kokkos::deep_copy(dev_indices, indices);
|
||||
double start;
|
||||
|
||||
Kokkos::deep_copy(dev_data, data);
|
||||
Kokkos::deep_copy(dev_indices, indices);
|
||||
double start;
|
||||
printf("Starting benchmarking...\n");
|
||||
|
||||
printf("Starting benchmarking...\n");
|
||||
for (GUPSIndex k = 0; k < repeats; ++k) {
|
||||
randomize_indices(indices, dev_indices, data.extent(0));
|
||||
|
||||
for( GUPSIndex k = 0; k < repeats; ++k ) {
|
||||
randomize_indices(indices, dev_indices, data.extent(0));
|
||||
start = now();
|
||||
run_gups(dev_indices, dev_data, datum, useAtomics);
|
||||
gupsTime += now() - start;
|
||||
}
|
||||
|
||||
start = now();
|
||||
run_gups(dev_indices, dev_data, datum, useAtomics);
|
||||
gupsTime += now() - start;
|
||||
}
|
||||
Kokkos::deep_copy(indices, dev_indices);
|
||||
Kokkos::deep_copy(data, dev_data);
|
||||
|
||||
Kokkos::deep_copy(indices, dev_indices);
|
||||
Kokkos::deep_copy(data, dev_data);
|
||||
printf(HLINE);
|
||||
printf(
|
||||
"GUP/s Random: %18.6f\n",
|
||||
(1.0e-9 * ((double)repeats) * (double)dev_indices.extent(0)) / gupsTime);
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
printf("GUP/s Random: %18.6f\n",
|
||||
(1.0e-9 * ((double) repeats) * (double) dev_indices.extent(0)) / gupsTime);
|
||||
printf(HLINE);
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
printf(HLINE);
|
||||
printf("Kokkos GUPS Benchmark\n");
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
printf("Kokkos GUPS Benchmark\n");
|
||||
printf(HLINE);
|
||||
srand48(1010101);
|
||||
|
||||
srand48(1010101);
|
||||
Kokkos::initialize(argc, argv);
|
||||
|
||||
Kokkos::initialize(argc, argv);
|
||||
int64_t indices = 8192;
|
||||
int64_t data = 33554432;
|
||||
int64_t repeats = 10;
|
||||
bool useAtomics = false;
|
||||
|
||||
int64_t indices = 8192;
|
||||
int64_t data = 33554432;
|
||||
int64_t repeats = 10;
|
||||
bool useAtomics = false;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (strcmp(argv[i], "--indices") == 0) {
|
||||
indices = std::atoll(argv[i + 1]);
|
||||
++i;
|
||||
} else if (strcmp(argv[i], "--data") == 0) {
|
||||
data = std::atoll(argv[i + 1]);
|
||||
++i;
|
||||
} else if (strcmp(argv[i], "--repeats") == 0) {
|
||||
repeats = std::atoll(argv[i + 1]);
|
||||
++i;
|
||||
} else if (strcmp(argv[i], "--atomics") == 0) {
|
||||
useAtomics = true;
|
||||
}
|
||||
}
|
||||
|
||||
for( int i = 1; i < argc; ++i ) {
|
||||
if( strcmp( argv[i], "--indices" ) == 0 ) {
|
||||
indices = std::atoll(argv[i+1]);
|
||||
++i;
|
||||
} else if( strcmp( argv[i], "--data" ) == 0 ) {
|
||||
data = std::atoll(argv[i+1]);
|
||||
++i;
|
||||
} else if( strcmp( argv[i], "--repeats" ) == 0 ) {
|
||||
repeats = std::atoll(argv[i+1]);
|
||||
++i;
|
||||
} else if( strcmp( argv[i], "--atomics" ) == 0 ) {
|
||||
useAtomics = true;
|
||||
}
|
||||
}
|
||||
const int rc = run_benchmark(indices, data, repeats, useAtomics);
|
||||
|
||||
const int rc = run_benchmark(indices, data, repeats, useAtomics);
|
||||
Kokkos::finalize();
|
||||
|
||||
Kokkos::finalize();
|
||||
|
||||
return rc;
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -94,22 +94,22 @@ int main(int argc, char* argv[]) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int team_range = atoi(argv[1]);
|
||||
int thread_range = atoi(argv[2]);
|
||||
int vector_range = atoi(argv[3]);
|
||||
int team_range = std::stoi(argv[1]);
|
||||
int thread_range = std::stoi(argv[2]);
|
||||
int vector_range = std::stoi(argv[3]);
|
||||
|
||||
int outer_repeat = atoi(argv[4]);
|
||||
int thread_repeat = atoi(argv[5]);
|
||||
int vector_repeat = atoi(argv[6]);
|
||||
int outer_repeat = std::stoi(argv[4]);
|
||||
int thread_repeat = std::stoi(argv[5]);
|
||||
int vector_repeat = std::stoi(argv[6]);
|
||||
|
||||
int team_size = atoi(argv[7]);
|
||||
int vector_size = atoi(argv[8]);
|
||||
int schedule = atoi(argv[9]);
|
||||
int test_type = atoi(argv[10]);
|
||||
int team_size = std::stoi(argv[7]);
|
||||
int vector_size = std::stoi(argv[8]);
|
||||
int schedule = std::stoi(argv[9]);
|
||||
int test_type = std::stoi(argv[10]);
|
||||
|
||||
int disable_verbose_output = 0;
|
||||
if (argc > 11) {
|
||||
disable_verbose_output = atoi(argv[11]);
|
||||
disable_verbose_output = std::stoi(argv[11]);
|
||||
}
|
||||
|
||||
if (schedule != 1 && schedule != 2) {
|
||||
@ -138,9 +138,9 @@ int main(int argc, char* argv[]) {
|
||||
double& lval) { lval += 1; },
|
||||
result);
|
||||
|
||||
typedef Kokkos::View<double*, Kokkos::LayoutRight> view_type_1d;
|
||||
typedef Kokkos::View<double**, Kokkos::LayoutRight> view_type_2d;
|
||||
typedef Kokkos::View<double***, Kokkos::LayoutRight> view_type_3d;
|
||||
using view_type_1d = Kokkos::View<double*, Kokkos::LayoutRight>;
|
||||
using view_type_2d = Kokkos::View<double**, Kokkos::LayoutRight>;
|
||||
using view_type_3d = Kokkos::View<double***, Kokkos::LayoutRight>;
|
||||
|
||||
// Allocate view without initializing
|
||||
// Call a 'warmup' test with 1 repeat - this will initialize the corresponding
|
||||
|
||||
@ -68,8 +68,8 @@ void test_policy(int team_range, int thread_range, int vector_range,
|
||||
int team_size, int vector_size, int test_type, ViewType1& v1,
|
||||
ViewType2& v2, ViewType3& v3, double& result,
|
||||
double& result_expect, double& time) {
|
||||
typedef Kokkos::TeamPolicy<ScheduleType, IndexType> t_policy;
|
||||
typedef typename t_policy::member_type t_team;
|
||||
using t_policy = Kokkos::TeamPolicy<ScheduleType, IndexType>;
|
||||
using t_team = typename t_policy::member_type;
|
||||
Kokkos::Timer timer;
|
||||
|
||||
for (int orep = 0; orep < outer_repeat; orep++) {
|
||||
|
||||
@ -48,219 +48,224 @@
|
||||
#include <sys/time.h>
|
||||
|
||||
#define STREAM_ARRAY_SIZE 100000000
|
||||
#define STREAM_NTIMES 20
|
||||
#define STREAM_NTIMES 20
|
||||
|
||||
#define HLINE "-------------------------------------------------------------\n"
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
typedef Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror StreamHostArray;
|
||||
typedef Kokkos::View<double*, Kokkos::CudaSpace> StreamDeviceArray;
|
||||
using StreamHostArray = Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror;
|
||||
using StreamDeviceArray = Kokkos::View<double*, Kokkos::CudaSpace>;
|
||||
#else
|
||||
typedef Kokkos::View<double*, Kokkos::HostSpace>::HostMirror StreamHostArray;
|
||||
typedef Kokkos::View<double*, Kokkos::HostSpace> StreamDeviceArray;
|
||||
using StreamHostArray = Kokkos::View<double*, Kokkos::HostSpace>::HostMirror;
|
||||
using StreamDeviceArray = Kokkos::View<double*, Kokkos::HostSpace>;
|
||||
#endif
|
||||
|
||||
typedef int StreamIndex;
|
||||
using StreamIndex = int;
|
||||
|
||||
double now() {
|
||||
struct timeval now;
|
||||
gettimeofday(&now, nullptr);
|
||||
struct timeval now;
|
||||
gettimeofday(&now, nullptr);
|
||||
|
||||
return (double) now.tv_sec + ((double) now.tv_usec * 1.0e-6);
|
||||
return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6);
|
||||
}
|
||||
|
||||
void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) {
|
||||
void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b,
|
||||
StreamDeviceArray& c) {
|
||||
Kokkos::parallel_for(
|
||||
"copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i]; });
|
||||
|
||||
Kokkos::parallel_for("copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
|
||||
c[i] = a[i];
|
||||
});
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c,
|
||||
const double scalar) {
|
||||
void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b,
|
||||
StreamDeviceArray& c, const double scalar) {
|
||||
Kokkos::parallel_for(
|
||||
"copy", a.extent(0),
|
||||
KOKKOS_LAMBDA(const StreamIndex i) { b[i] = scalar * c[i]; });
|
||||
|
||||
Kokkos::parallel_for("copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
|
||||
b[i] = scalar * c[i];
|
||||
});
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
void perform_add(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) {
|
||||
Kokkos::parallel_for("add", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
|
||||
c[i] = a[i] + b[i];
|
||||
});
|
||||
void perform_add(StreamDeviceArray& a, StreamDeviceArray& b,
|
||||
StreamDeviceArray& c) {
|
||||
Kokkos::parallel_for(
|
||||
"add", a.extent(0),
|
||||
KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i] + b[i]; });
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c,
|
||||
const double scalar) {
|
||||
void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b,
|
||||
StreamDeviceArray& c, const double scalar) {
|
||||
Kokkos::parallel_for(
|
||||
"triad", a.extent(0),
|
||||
KOKKOS_LAMBDA(const StreamIndex i) { a[i] = b[i] + scalar * c[i]; });
|
||||
|
||||
Kokkos::parallel_for("triad", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
|
||||
a[i] = b[i] + scalar * c[i];
|
||||
});
|
||||
|
||||
Kokkos::fence();
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
int perform_validation(StreamHostArray& a, StreamHostArray& b, StreamHostArray& c,
|
||||
const StreamIndex arraySize, const double scalar) {
|
||||
int perform_validation(StreamHostArray& a, StreamHostArray& b,
|
||||
StreamHostArray& c, const StreamIndex arraySize,
|
||||
const double scalar) {
|
||||
double ai = 1.0;
|
||||
double bi = 2.0;
|
||||
double ci = 0.0;
|
||||
|
||||
double ai = 1.0;
|
||||
double bi = 2.0;
|
||||
double ci = 0.0;
|
||||
for (StreamIndex i = 0; i < arraySize; ++i) {
|
||||
ci = ai;
|
||||
bi = scalar * ci;
|
||||
ci = ai + bi;
|
||||
ai = bi + scalar * ci;
|
||||
};
|
||||
|
||||
for( StreamIndex i = 0; i < arraySize; ++i ) {
|
||||
ci = ai;
|
||||
bi = scalar * ci;
|
||||
ci = ai + bi;
|
||||
ai = bi + scalar * ci;
|
||||
};
|
||||
double aError = 0.0;
|
||||
double bError = 0.0;
|
||||
double cError = 0.0;
|
||||
|
||||
double aError = 0.0;
|
||||
double bError = 0.0;
|
||||
double cError = 0.0;
|
||||
for (StreamIndex i = 0; i < arraySize; ++i) {
|
||||
aError = std::abs(a[i] - ai);
|
||||
bError = std::abs(b[i] - bi);
|
||||
cError = std::abs(c[i] - ci);
|
||||
}
|
||||
|
||||
for( StreamIndex i = 0; i < arraySize; ++i ) {
|
||||
aError = std::abs( a[i] - ai );
|
||||
bError = std::abs( b[i] - bi );
|
||||
cError = std::abs( c[i] - ci );
|
||||
}
|
||||
double aAvgError = aError / (double)arraySize;
|
||||
double bAvgError = bError / (double)arraySize;
|
||||
double cAvgError = cError / (double)arraySize;
|
||||
|
||||
double aAvgError = aError / (double) arraySize;
|
||||
double bAvgError = bError / (double) arraySize;
|
||||
double cAvgError = cError / (double) arraySize;
|
||||
const double epsilon = 1.0e-13;
|
||||
int errorCount = 0;
|
||||
|
||||
const double epsilon = 1.0e-13;
|
||||
int errorCount = 0;
|
||||
if (std::abs(aAvgError / ai) > epsilon) {
|
||||
fprintf(stderr, "Error: validation check on View a failed.\n");
|
||||
errorCount++;
|
||||
}
|
||||
|
||||
if( std::abs( aAvgError / ai ) > epsilon ) {
|
||||
fprintf(stderr, "Error: validation check on View a failed.\n");
|
||||
errorCount++;
|
||||
}
|
||||
if (std::abs(bAvgError / bi) > epsilon) {
|
||||
fprintf(stderr, "Error: validation check on View b failed.\n");
|
||||
errorCount++;
|
||||
}
|
||||
|
||||
if( std::abs( bAvgError / bi ) > epsilon ) {
|
||||
fprintf(stderr, "Error: validation check on View b failed.\n");
|
||||
errorCount++;
|
||||
}
|
||||
if (std::abs(cAvgError / ci) > epsilon) {
|
||||
fprintf(stderr, "Error: validation check on View c failed.\n");
|
||||
errorCount++;
|
||||
}
|
||||
|
||||
if( std::abs( cAvgError / ci ) > epsilon ) {
|
||||
fprintf(stderr, "Error: validation check on View c failed.\n");
|
||||
errorCount++;
|
||||
}
|
||||
if (errorCount == 0) {
|
||||
printf("All solutions checked and verified.\n");
|
||||
}
|
||||
|
||||
if( errorCount == 0 ) {
|
||||
printf("All solutions checked and verified.\n");
|
||||
}
|
||||
|
||||
return errorCount;
|
||||
return errorCount;
|
||||
}
|
||||
|
||||
int run_benchmark() {
|
||||
printf("Reports fastest timing per kernel\n");
|
||||
printf("Creating Views...\n");
|
||||
|
||||
printf("Reports fastest timing per kernel\n");
|
||||
printf("Creating Views...\n");
|
||||
printf("Memory Sizes:\n");
|
||||
printf("- Array Size: %" PRIu64 "\n",
|
||||
static_cast<uint64_t>(STREAM_ARRAY_SIZE));
|
||||
printf("- Per Array: %12.2f MB\n",
|
||||
1.0e-6 * (double)STREAM_ARRAY_SIZE * (double)sizeof(double));
|
||||
printf("- Total: %12.2f MB\n",
|
||||
3.0e-6 * (double)STREAM_ARRAY_SIZE * (double)sizeof(double));
|
||||
|
||||
printf("Memory Sizes:\n");
|
||||
printf("- Array Size: %" PRIu64 "\n", static_cast<uint64_t>(STREAM_ARRAY_SIZE));
|
||||
printf("- Per Array: %12.2f MB\n", 1.0e-6 * (double) STREAM_ARRAY_SIZE * (double) sizeof(double));
|
||||
printf("- Total: %12.2f MB\n", 3.0e-6 * (double) STREAM_ARRAY_SIZE * (double) sizeof(double));
|
||||
printf("Benchmark kernels will be performed for %d iterations.\n",
|
||||
STREAM_NTIMES);
|
||||
|
||||
printf("Benchmark kernels will be performed for %d iterations.\n", STREAM_NTIMES);
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
StreamDeviceArray dev_a("a", STREAM_ARRAY_SIZE);
|
||||
StreamDeviceArray dev_b("b", STREAM_ARRAY_SIZE);
|
||||
StreamDeviceArray dev_c("c", STREAM_ARRAY_SIZE);
|
||||
|
||||
StreamDeviceArray dev_a("a", STREAM_ARRAY_SIZE);
|
||||
StreamDeviceArray dev_b("b", STREAM_ARRAY_SIZE);
|
||||
StreamDeviceArray dev_c("c", STREAM_ARRAY_SIZE);
|
||||
StreamHostArray a = Kokkos::create_mirror_view(dev_a);
|
||||
StreamHostArray b = Kokkos::create_mirror_view(dev_b);
|
||||
StreamHostArray c = Kokkos::create_mirror_view(dev_c);
|
||||
|
||||
StreamHostArray a = Kokkos::create_mirror_view(dev_a);
|
||||
StreamHostArray b = Kokkos::create_mirror_view(dev_b);
|
||||
StreamHostArray c = Kokkos::create_mirror_view(dev_c);
|
||||
const double scalar = 3.0;
|
||||
|
||||
const double scalar = 3.0;
|
||||
double copyTime = std::numeric_limits<double>::max();
|
||||
double scaleTime = std::numeric_limits<double>::max();
|
||||
double addTime = std::numeric_limits<double>::max();
|
||||
double triadTime = std::numeric_limits<double>::max();
|
||||
|
||||
double copyTime = std::numeric_limits<double>::max();
|
||||
double scaleTime = std::numeric_limits<double>::max();
|
||||
double addTime = std::numeric_limits<double>::max();
|
||||
double triadTime = std::numeric_limits<double>::max();
|
||||
|
||||
printf("Initializing Views...\n");
|
||||
printf("Initializing Views...\n");
|
||||
|
||||
#if defined(KOKKOS_HAVE_OPENMP)
|
||||
Kokkos::parallel_for("init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
|
||||
Kokkos::parallel_for(
|
||||
"init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
|
||||
#else
|
||||
Kokkos::parallel_for("init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
|
||||
Kokkos::parallel_for(
|
||||
"init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
|
||||
#endif
|
||||
KOKKOS_LAMBDA(const int i) {
|
||||
KOKKOS_LAMBDA(const int i) {
|
||||
a[i] = 1.0;
|
||||
b[i] = 2.0;
|
||||
c[i] = 0.0;
|
||||
});
|
||||
|
||||
a[i] = 1.0;
|
||||
b[i] = 2.0;
|
||||
c[i] = 0.0;
|
||||
});
|
||||
// Copy contents of a (from the host) to the dev_a (device)
|
||||
Kokkos::deep_copy(dev_a, a);
|
||||
Kokkos::deep_copy(dev_b, b);
|
||||
Kokkos::deep_copy(dev_c, c);
|
||||
|
||||
// Copy contents of a (from the host) to the dev_a (device)
|
||||
Kokkos::deep_copy(dev_a, a);
|
||||
Kokkos::deep_copy(dev_b, b);
|
||||
Kokkos::deep_copy(dev_c, c);
|
||||
double start;
|
||||
|
||||
double start;
|
||||
printf("Starting benchmarking...\n");
|
||||
|
||||
printf("Starting benchmarking...\n");
|
||||
for (StreamIndex k = 0; k < STREAM_NTIMES; ++k) {
|
||||
start = now();
|
||||
perform_copy(dev_a, dev_b, dev_c);
|
||||
copyTime = std::min(copyTime, (now() - start));
|
||||
|
||||
for( StreamIndex k = 0; k < STREAM_NTIMES; ++k ) {
|
||||
start = now();
|
||||
perform_copy(dev_a, dev_b, dev_c);
|
||||
copyTime = std::min( copyTime, (now() - start) );
|
||||
start = now();
|
||||
perform_scale(dev_a, dev_b, dev_c, scalar);
|
||||
scaleTime = std::min(scaleTime, (now() - start));
|
||||
|
||||
start = now();
|
||||
perform_scale(dev_a, dev_b, dev_c, scalar);
|
||||
scaleTime = std::min( scaleTime, (now() - start) );
|
||||
start = now();
|
||||
perform_add(dev_a, dev_b, dev_c);
|
||||
addTime = std::min(addTime, (now() - start));
|
||||
|
||||
start = now();
|
||||
perform_add(dev_a, dev_b, dev_c);
|
||||
addTime = std::min( addTime, (now() - start) );
|
||||
start = now();
|
||||
perform_triad(dev_a, dev_b, dev_c, scalar);
|
||||
triadTime = std::min(triadTime, (now() - start));
|
||||
}
|
||||
|
||||
start = now();
|
||||
perform_triad(dev_a, dev_b, dev_c, scalar);
|
||||
triadTime = std::min( triadTime, (now() - start) );
|
||||
}
|
||||
Kokkos::deep_copy(a, dev_a);
|
||||
Kokkos::deep_copy(b, dev_b);
|
||||
Kokkos::deep_copy(c, dev_c);
|
||||
|
||||
Kokkos::deep_copy(a, dev_a);
|
||||
Kokkos::deep_copy(b, dev_b);
|
||||
Kokkos::deep_copy(c, dev_c);
|
||||
printf("Performing validation...\n");
|
||||
int rc = perform_validation(a, b, c, STREAM_ARRAY_SIZE, scalar);
|
||||
|
||||
printf("Performing validation...\n");
|
||||
int rc = perform_validation(a, b, c, STREAM_ARRAY_SIZE, scalar);
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
printf("Copy %11.2f MB/s\n",
|
||||
(1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
|
||||
copyTime);
|
||||
printf("Scale %11.2f MB/s\n",
|
||||
(1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
|
||||
scaleTime);
|
||||
printf("Add %11.2f MB/s\n",
|
||||
(1.0e-06 * 3.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
|
||||
addTime);
|
||||
printf("Triad %11.2f MB/s\n",
|
||||
(1.0e-06 * 3.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
|
||||
triadTime);
|
||||
|
||||
printf("Copy %11.2f MB/s\n",
|
||||
( 1.0e-06 * 2.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / copyTime );
|
||||
printf("Scale %11.2f MB/s\n",
|
||||
( 1.0e-06 * 2.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / scaleTime );
|
||||
printf("Add %11.2f MB/s\n",
|
||||
( 1.0e-06 * 3.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / addTime );
|
||||
printf("Triad %11.2f MB/s\n",
|
||||
( 1.0e-06 * 3.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / triadTime );
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
|
||||
return rc;
|
||||
return rc;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
printf(HLINE);
|
||||
printf("Kokkos STREAM Benchmark\n");
|
||||
printf(HLINE);
|
||||
|
||||
printf(HLINE);
|
||||
printf("Kokkos STREAM Benchmark\n");
|
||||
printf(HLINE);
|
||||
Kokkos::initialize(argc, argv);
|
||||
const int rc = run_benchmark();
|
||||
Kokkos::finalize();
|
||||
|
||||
Kokkos::initialize(argc, argv);
|
||||
const int rc = run_benchmark();
|
||||
Kokkos::finalize();
|
||||
|
||||
return rc;
|
||||
return rc;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user