Add missed changes

This commit is contained in:
Stan Gerald Moore
2023-06-05 10:33:30 -06:00
parent 2e09ba2702
commit 966efd8bd5
3 changed files with 34 additions and 33 deletions

View File

@ -710,7 +710,7 @@ struct CudaParallelLaunchImpl<
" occupancy requests are currently broken."));
}
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
ensure_cuda_lock_arrays_on_device();
// Invoke the driver function on the device
base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);

View File

@ -79,8 +79,7 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0};
void initialize_host_cuda_lock_arrays() {
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
desul::Impl::init_lock_arrays();
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
desul::ensure_cuda_lock_arrays_on_device();
#endif
if (g_host_cuda_lock_arrays.atomic != nullptr) return;
KOKKOS_IMPL_CUDA_SAFE_CALL(
@ -89,7 +88,7 @@ void initialize_host_cuda_lock_arrays() {
Impl::cuda_device_synchronize(
"Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
g_host_cuda_lock_arrays.n = Cuda::concurrency();
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
copy_cuda_lock_arrays_to_device();
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
256>>>();
Impl::cuda_device_synchronize(
@ -106,7 +105,7 @@ void finalize_host_cuda_lock_arrays() {
g_host_cuda_lock_arrays.atomic = nullptr;
g_host_cuda_lock_arrays.n = 0;
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
copy_cuda_lock_arrays_to_device();
#endif
}

View File

@ -67,7 +67,7 @@ struct CudaLockArrays {
/// \brief This global variable in Host space is the central definition
/// of these arrays.
extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays;
extern CudaLockArrays g_host_cuda_lock_arrays;
/// \brief After this call, the g_host_cuda_lock_arrays variable has
/// valid, initialized arrays.
@ -105,12 +105,12 @@ namespace Impl {
/// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels
/// that will use it.
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
__device__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__constant__ extern
#endif
Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays;
CudaLockArrays g_device_cuda_lock_arrays;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
@ -123,9 +123,7 @@ __device__ inline bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
return (
0 ==
atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1));
return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1));
}
/// \brief Release lock for the address
@ -138,7 +136,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0);
atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0);
}
} // namespace Impl
@ -151,45 +149,49 @@ namespace {
/// \brief Flag recording whether the host lock-array descriptor has already
/// been pushed to the device symbol: 0 = not yet, 1 = a copy was attempted.
/// It is `static`, so every translation unit that includes this header has
/// its own flag.
static int lock_array_copied = 0;
/// \brief Returns the current value of lock_array_copied.
/// NOTE(review): judging by the name, this exists only to reference the flag
/// so compilers do not warn about an unused static variable -- confirm before
/// removing it.
inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
} // namespace
} // namespace Impl
} // namespace Kokkos
/* Dan Ibanez: it is critical that this code be a macro, so that it will
capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
putting this in an inline function will NOT do the right thing! */
#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
{ \
if (::Kokkos::Impl::lock_array_copied == 0) { \
KOKKOS_IMPL_CUDA_SAFE_CALL( \
cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
&Kokkos::Impl::g_host_cuda_lock_arrays, \
sizeof(Kokkos::Impl::CudaLockArrays))); \
} \
lock_array_copied = 1; \
/// \brief Copy the host-side lock array descriptor to the device symbol.
///
/// While lock_array_copied is 0, this copies sizeof(CudaLockArrays) bytes
/// from g_host_cuda_lock_arrays into g_device_cuda_lock_arrays with
/// cudaMemcpyToSymbol, checked through KOKKOS_IMPL_CUDA_SAFE_CALL. The flag is
/// then set to 1, so later calls skip the copy; nothing in this function
/// resets it.
///
/// Linkage: without relocatable device code the function is `inline static`,
/// i.e. internal to each translation unit.
/// NOTE(review): presumably internal linkage is what makes each translation
/// unit update its own instance of g_device_cuda_lock_arrays -- confirm before
/// changing the specifiers.
/// NOTE(review): with relocatable device code the function is plain `inline`
/// (external linkage) yet reads and writes the translation-unit-local
/// lock_array_copied flag -- worth confirming this is intended.
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline
#else
inline static
#endif
void
copy_cuda_lock_arrays_to_device() {
// Only perform the host-to-symbol copy while the flag is still clear.
if (lock_array_copied == 0) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays,
&g_host_cuda_lock_arrays,
sizeof(CudaLockArrays)));
}
// Mark the copy as done (set unconditionally, also on repeat calls).
lock_array_copied = 1;
}
#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
/// \brief Variant for builds without desul atomics and with relocatable
/// device code: intentionally a no-op.
/// NOTE(review): presumably no per-translation-unit refresh is needed here
/// because the device symbol is declared `extern` in this configuration --
/// confirm.
inline void ensure_cuda_lock_arrays_on_device() {}
#else
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
/// \brief Variant for builds without desul atomics and without relocatable
/// device code: forwards to copy_cuda_lock_arrays_to_device().
/// Declared `inline static`, so each translation unit gets its own copy of
/// this function.
inline static void ensure_cuda_lock_arrays_on_device() {
copy_cuda_lock_arrays_to_device();
}
#endif
#else
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
/// \brief Variant for builds with desul atomics and with relocatable device
/// code: intentionally a no-op.
/// NOTE(review): presumably no per-translation-unit refresh is needed here
/// because the device symbol is declared `extern` in this configuration --
/// confirm.
inline void ensure_cuda_lock_arrays_on_device() {}
#else
// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
/// \brief Variant for builds with desul atomics and without relocatable
/// device code.
/// First copies the Kokkos lock array descriptor to the device via
/// copy_cuda_lock_arrays_to_device(), then calls
/// desul::ensure_cuda_lock_arrays_on_device().
/// Declared `inline static`, so each translation unit gets its own copy of
/// this function.
inline static void ensure_cuda_lock_arrays_on_device() {
copy_cuda_lock_arrays_to_device();
desul::ensure_cuda_lock_arrays_on_device();
}
#endif
#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
} // namespace Impl
} // namespace Kokkos
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */