Update Kokkos library in LAMMPS to v4.4.0

2024-09-11 09:20:36 -06:00
parent 16b19c71c1
commit a44955dd2e
254 changed files with 14227 additions and 9881 deletions
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp
@ -1,18 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#include <TestCuda_Category.hpp>
-#include <TestGraph.hpp>
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp
@ -15,7 +15,7 @@
 //@HEADER

 #include <TestCuda_Category.hpp>
-#include <Test_InterOp_Streams.hpp>
+#include <TestMultiGPU.hpp>

 namespace {

@ -57,79 +57,6 @@ std::array<TEST_EXECSPACE, 2> get_execution_spaces(
  return {exec0, exec1};
 }

-// Test Interoperability with Cuda Streams
-void test_policies(TEST_EXECSPACE exec0, Kokkos::View<int *, TEST_EXECSPACE> v0,
-                   TEST_EXECSPACE exec, Kokkos::View<int *, TEST_EXECSPACE> v) {
-  using MemorySpace = typename TEST_EXECSPACE::memory_space;
-
-  Kokkos::deep_copy(exec, v, 5);
-  Kokkos::deep_copy(exec0, v0, 5);
-
-  Kokkos::deep_copy(v, v0);
-
-  int sum;
-  int sum0;
-
-  Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range_0",
-                       Kokkos::RangePolicy<TEST_EXECSPACE>(exec0, 0, 100),
-                       Test::FunctorRange<MemorySpace>(v0));
-  Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range",
-                       Kokkos::RangePolicy<TEST_EXECSPACE>(exec, 0, 100),
-                       Test::FunctorRange<MemorySpace>(v));
-  Kokkos::parallel_reduce(
-      "Test::cuda::raw_cuda_stream::RangeReduce_0",
-      Kokkos::RangePolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>(exec0,
-                                                                        0, 100),
-      Test::FunctorRangeReduce<MemorySpace>(v0), sum0);
-  Kokkos::parallel_reduce(
-      "Test::cuda::raw_cuda_stream::RangeReduce",
-      Kokkos::RangePolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>(exec, 0,
-                                                                        100),
-      Test::FunctorRangeReduce<MemorySpace>(v), sum);
-  ASSERT_EQ(600, sum0);
-  ASSERT_EQ(600, sum);
-
-  Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange_0",
-                       Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
-                           exec0, {0, 0}, {10, 10}),
-                       Test::FunctorMDRange<MemorySpace>(v0));
-  Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange",
-                       Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
-                           exec, {0, 0}, {10, 10}),
-                       Test::FunctorMDRange<MemorySpace>(v));
-  Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce_0",
-                          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>,
-                                                Kokkos::LaunchBounds<128, 2>>(
-                              exec0, {0, 0}, {10, 10}),
-                          Test::FunctorMDRangeReduce<MemorySpace>(v0), sum0);
-  Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce",
-                          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>,
-                                                Kokkos::LaunchBounds<128, 2>>(
-                              exec, {0, 0}, {10, 10}),
-                          Test::FunctorMDRangeReduce<MemorySpace>(v), sum);
-  ASSERT_EQ(700, sum0);
-  ASSERT_EQ(700, sum);
-
-  Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team_0",
-                       Kokkos::TeamPolicy<TEST_EXECSPACE>(exec0, 10, 10),
-                       Test::FunctorTeam<MemorySpace, TEST_EXECSPACE>(v0));
-  Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team",
-                       Kokkos::TeamPolicy<TEST_EXECSPACE>(exec, 10, 10),
-                       Test::FunctorTeam<MemorySpace, TEST_EXECSPACE>(v));
-  Kokkos::parallel_reduce(
-      "Test::cuda::raw_cuda_stream::Team_0",
-      Kokkos::TeamPolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>(exec0,
-                                                                       10, 10),
-      Test::FunctorTeamReduce<MemorySpace, TEST_EXECSPACE>(v0), sum0);
-  Kokkos::parallel_reduce(
-      "Test::cuda::raw_cuda_stream::Team",
-      Kokkos::TeamPolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>(exec, 10,
-                                                                       10),
-      Test::FunctorTeamReduce<MemorySpace, TEST_EXECSPACE>(v), sum);
-  ASSERT_EQ(800, sum0);
-  ASSERT_EQ(800, sum);
-}
-
 TEST(cuda_multi_gpu, managed_views) {
  StreamsAndDevices streams_and_devices;
  {
@ -169,93 +96,6 @@ TEST(cuda_multi_gpu, unmanaged_views) {
  }
 }

-struct ScratchFunctor {
-  int scratch_size;
-  int R;
-
-  ScratchFunctor(int scratch_size_, int R_)
-      : scratch_size(scratch_size_), R(R_) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const Kokkos::TeamPolicy<Kokkos::Cuda>::member_type &team,
-                  int &error_accum) const {
-    Kokkos::View<int *, Kokkos::Cuda::scratch_memory_space> scratch_mem(
-        team.team_scratch(1), scratch_size);
-
-    // Initialize scratch memory
-    Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size),
-                         [&](int i) { scratch_mem(i) = 0; });
-    team.team_barrier();
-
-    // Increment each entry in scratch memory R times
-    for (int r = 0; r < R; ++r) {
-      Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size),
-                           [&](int i) { scratch_mem(i) += 1; });
-    }
-    team.team_barrier();
-
-    // Check that each scratch entry has been incremented exactly R times
-    int team_error_accum;
-    auto R_loc = R;  // avoid implicit capture of this
-    Kokkos::parallel_reduce(
-        Kokkos::TeamVectorRange(team, 0, scratch_size),
-        [&](int i, int &tsum) {
-          if (scratch_mem(i) != R_loc) {
-            tsum += 1;
-          }
-        },
-        team_error_accum);
-    Kokkos::single(Kokkos::PerTeam(team),
-                   [&]() { error_accum += team_error_accum; });
-  }
-};
-
-void test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) {
-  constexpr int N            = 10;
-  constexpr int R            = 1000;
-  constexpr int scratch_size = 100;
-  using ScratchType = Kokkos::View<int *, Kokkos::Cuda::scratch_memory_space>;
-
-  // Test allocating and using scratch space
-  ScratchFunctor f(scratch_size, R);
-
-  auto policy0 =
-      Kokkos::TeamPolicy<Kokkos::Cuda>(exec0, N, 10)
-          .set_scratch_size(
-              1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size)));
-  auto policy1 =
-      Kokkos::TeamPolicy<Kokkos::Cuda>(exec1, N, 10)
-          .set_scratch_size(
-              1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size)));
-
-  int error0, error1;
-
-  Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0);
-  Kokkos::parallel_reduce("test_scratch_device_1", policy1, f, error1);
-  ASSERT_EQ(error0, 0);
-  ASSERT_EQ(error1, 0);
-
-  // Request larger scratch size to trigger a realloc and test
-  const auto new_scratch_size = scratch_size + 10;
-  ScratchFunctor f_more_scratch(new_scratch_size, R);
-
-  auto policy0_more_scratch =
-      Kokkos::TeamPolicy<Kokkos::Cuda>(exec0, N, 10)
-          .set_scratch_size(
-              1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size)));
-  auto policy1_more_scratch =
-      Kokkos::TeamPolicy<Kokkos::Cuda>(exec1, N, 10)
-          .set_scratch_size(
-              1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size)));
-
-  Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch,
-                          f_more_scratch, error0);
-  Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch,
-                          f_more_scratch, error1);
-  ASSERT_EQ(error0, 0);
-  ASSERT_EQ(error1, 0);
-}
-
 TEST(cuda_multi_gpu, scratch_space) {
  StreamsAndDevices streams_and_devices;
  {