Add missed changes

This commit is contained in:
Stan Gerald Moore
2023-06-05 10:33:30 -06:00
parent 2e09ba2702
commit 966efd8bd5
3 changed files with 34 additions and 33 deletions

View File

@ -710,7 +710,7 @@ struct CudaParallelLaunchImpl<
" occupancy requests are currently broken.")); " occupancy requests are currently broken."));
} }
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); ensure_cuda_lock_arrays_on_device();
// Invoke the driver function on the device // Invoke the driver function on the device
base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);

View File

@ -79,8 +79,7 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0};
void initialize_host_cuda_lock_arrays() { void initialize_host_cuda_lock_arrays() {
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
desul::Impl::init_lock_arrays(); desul::Impl::init_lock_arrays();
desul::ensure_cuda_lock_arrays_on_device();
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
#endif #endif
if (g_host_cuda_lock_arrays.atomic != nullptr) return; if (g_host_cuda_lock_arrays.atomic != nullptr) return;
KOKKOS_IMPL_CUDA_SAFE_CALL( KOKKOS_IMPL_CUDA_SAFE_CALL(
@ -89,7 +88,7 @@ void initialize_host_cuda_lock_arrays() {
Impl::cuda_device_synchronize( Impl::cuda_device_synchronize(
"Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays"); "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
g_host_cuda_lock_arrays.n = Cuda::concurrency(); g_host_cuda_lock_arrays.n = Cuda::concurrency();
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); copy_cuda_lock_arrays_to_device();
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
256>>>(); 256>>>();
Impl::cuda_device_synchronize( Impl::cuda_device_synchronize(
@ -106,7 +105,7 @@ void finalize_host_cuda_lock_arrays() {
g_host_cuda_lock_arrays.atomic = nullptr; g_host_cuda_lock_arrays.atomic = nullptr;
g_host_cuda_lock_arrays.n = 0; g_host_cuda_lock_arrays.n = 0;
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); copy_cuda_lock_arrays_to_device();
#endif #endif
} }

View File

@ -67,7 +67,7 @@ struct CudaLockArrays {
/// \brief This global variable in Host space is the central definition /// \brief This global variable in Host space is the central definition
/// of these arrays. /// of these arrays.
extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays; extern CudaLockArrays g_host_cuda_lock_arrays;
/// \brief After this call, the g_host_cuda_lock_arrays variable has /// \brief After this call, the g_host_cuda_lock_arrays variable has
/// valid, initialized arrays. /// valid, initialized arrays.
@ -105,12 +105,12 @@ namespace Impl {
/// instances in other translation units, we must update this CUDA global /// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels /// variable based on the Host global variable prior to running any kernels
/// that will use it. /// that will use it.
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro. /// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
__device__ __device__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__constant__ extern __constant__ extern
#endif #endif
Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays; CudaLockArrays g_device_cuda_lock_arrays;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
@ -123,9 +123,7 @@ __device__ inline bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr); size_t offset = size_t(ptr);
offset = offset >> 2; offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK; offset = offset & CUDA_SPACE_ATOMIC_MASK;
return ( return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1));
0 ==
atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1));
} }
/// \brief Release lock for the address /// \brief Release lock for the address
@ -138,7 +136,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr); size_t offset = size_t(ptr);
offset = offset >> 2; offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK; offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0); atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0);
} }
} // namespace Impl } // namespace Impl
@ -151,45 +149,49 @@ namespace {
static int lock_array_copied = 0; static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() { return lock_array_copied; } inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
} // namespace } // namespace
} // namespace Impl
} // namespace Kokkos
/* Dan Ibanez: it is critical that this code be a macro, so that it will #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays! inline
putting this in an inline function will NOT do the right thing! */ #else
#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ inline static
{ \ #endif
if (::Kokkos::Impl::lock_array_copied == 0) { \ void
KOKKOS_IMPL_CUDA_SAFE_CALL( \ copy_cuda_lock_arrays_to_device() {
cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \ if (lock_array_copied == 0) {
&Kokkos::Impl::g_host_cuda_lock_arrays, \ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays,
sizeof(Kokkos::Impl::CudaLockArrays))); \ &g_host_cuda_lock_arrays,
} \ sizeof(CudaLockArrays)));
lock_array_copied = 1; \
} }
lock_array_copied = 1;
}
#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() inline void ensure_cuda_lock_arrays_on_device() {}
#else #else
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ inline static void ensure_cuda_lock_arrays_on_device() {
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() copy_cuda_lock_arrays_to_device();
}
#endif #endif
#else #else
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() inline void ensure_cuda_lock_arrays_on_device() {}
#else #else
// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc. // Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ inline static void ensure_cuda_lock_arrays_on_device() {
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ copy_cuda_lock_arrays_to_device();
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() desul::ensure_cuda_lock_arrays_on_device();
}
#endif #endif
#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ #endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
} // namespace Impl
} // namespace Kokkos
#endif /* defined( KOKKOS_ENABLE_CUDA ) */ #endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */ #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */