Add missed changes
This commit is contained in:
@ -710,7 +710,7 @@ struct CudaParallelLaunchImpl<
|
|||||||
" occupancy requests are currently broken."));
|
" occupancy requests are currently broken."));
|
||||||
}
|
}
|
||||||
|
|
||||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
ensure_cuda_lock_arrays_on_device();
|
||||||
|
|
||||||
// Invoke the driver function on the device
|
// Invoke the driver function on the device
|
||||||
base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
|
base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
|
||||||
|
|||||||
@ -79,8 +79,7 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0};
|
|||||||
void initialize_host_cuda_lock_arrays() {
|
void initialize_host_cuda_lock_arrays() {
|
||||||
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
|
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
|
||||||
desul::Impl::init_lock_arrays();
|
desul::Impl::init_lock_arrays();
|
||||||
|
desul::ensure_cuda_lock_arrays_on_device();
|
||||||
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
|
||||||
#endif
|
#endif
|
||||||
if (g_host_cuda_lock_arrays.atomic != nullptr) return;
|
if (g_host_cuda_lock_arrays.atomic != nullptr) return;
|
||||||
KOKKOS_IMPL_CUDA_SAFE_CALL(
|
KOKKOS_IMPL_CUDA_SAFE_CALL(
|
||||||
@ -89,7 +88,7 @@ void initialize_host_cuda_lock_arrays() {
|
|||||||
Impl::cuda_device_synchronize(
|
Impl::cuda_device_synchronize(
|
||||||
"Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
|
"Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
|
||||||
g_host_cuda_lock_arrays.n = Cuda::concurrency();
|
g_host_cuda_lock_arrays.n = Cuda::concurrency();
|
||||||
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
|
copy_cuda_lock_arrays_to_device();
|
||||||
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
|
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
|
||||||
256>>>();
|
256>>>();
|
||||||
Impl::cuda_device_synchronize(
|
Impl::cuda_device_synchronize(
|
||||||
@ -106,7 +105,7 @@ void finalize_host_cuda_lock_arrays() {
|
|||||||
g_host_cuda_lock_arrays.atomic = nullptr;
|
g_host_cuda_lock_arrays.atomic = nullptr;
|
||||||
g_host_cuda_lock_arrays.n = 0;
|
g_host_cuda_lock_arrays.n = 0;
|
||||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||||
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
|
copy_cuda_lock_arrays_to_device();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -67,7 +67,7 @@ struct CudaLockArrays {
|
|||||||
|
|
||||||
/// \brief This global variable in Host space is the central definition
|
/// \brief This global variable in Host space is the central definition
|
||||||
/// of these arrays.
|
/// of these arrays.
|
||||||
extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays;
|
extern CudaLockArrays g_host_cuda_lock_arrays;
|
||||||
|
|
||||||
/// \brief After this call, the g_host_cuda_lock_arrays variable has
|
/// \brief After this call, the g_host_cuda_lock_arrays variable has
|
||||||
/// valid, initialized arrays.
|
/// valid, initialized arrays.
|
||||||
@ -105,12 +105,12 @@ namespace Impl {
|
|||||||
/// instances in other translation units, we must update this CUDA global
|
/// instances in other translation units, we must update this CUDA global
|
||||||
/// variable based on the Host global variable prior to running any kernels
|
/// variable based on the Host global variable prior to running any kernels
|
||||||
/// that will use it.
|
/// that will use it.
|
||||||
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
|
/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
|
||||||
__device__
|
__device__
|
||||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||||
__constant__ extern
|
__constant__ extern
|
||||||
#endif
|
#endif
|
||||||
Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays;
|
CudaLockArrays g_device_cuda_lock_arrays;
|
||||||
|
|
||||||
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
|
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
|
||||||
|
|
||||||
@ -123,9 +123,7 @@ __device__ inline bool lock_address_cuda_space(void* ptr) {
|
|||||||
size_t offset = size_t(ptr);
|
size_t offset = size_t(ptr);
|
||||||
offset = offset >> 2;
|
offset = offset >> 2;
|
||||||
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
||||||
return (
|
return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1));
|
||||||
0 ==
|
|
||||||
atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// \brief Release lock for the address
|
/// \brief Release lock for the address
|
||||||
@ -138,7 +136,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) {
|
|||||||
size_t offset = size_t(ptr);
|
size_t offset = size_t(ptr);
|
||||||
offset = offset >> 2;
|
offset = offset >> 2;
|
||||||
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
||||||
atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0);
|
atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace Impl
|
} // namespace Impl
|
||||||
@ -151,45 +149,49 @@ namespace {
|
|||||||
static int lock_array_copied = 0;
|
static int lock_array_copied = 0;
|
||||||
inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
|
inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
|
||||||
} // namespace
|
} // namespace
|
||||||
} // namespace Impl
|
|
||||||
} // namespace Kokkos
|
|
||||||
|
|
||||||
/* Dan Ibanez: it is critical that this code be a macro, so that it will
|
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||||
capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
|
inline
|
||||||
putting this in an inline function will NOT do the right thing! */
|
#else
|
||||||
#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
|
inline static
|
||||||
{ \
|
#endif
|
||||||
if (::Kokkos::Impl::lock_array_copied == 0) { \
|
void
|
||||||
KOKKOS_IMPL_CUDA_SAFE_CALL( \
|
copy_cuda_lock_arrays_to_device() {
|
||||||
cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
|
if (lock_array_copied == 0) {
|
||||||
&Kokkos::Impl::g_host_cuda_lock_arrays, \
|
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays,
|
||||||
sizeof(Kokkos::Impl::CudaLockArrays))); \
|
&g_host_cuda_lock_arrays,
|
||||||
} \
|
sizeof(CudaLockArrays)));
|
||||||
lock_array_copied = 1; \
|
|
||||||
}
|
}
|
||||||
|
lock_array_copied = 1;
|
||||||
|
}
|
||||||
|
|
||||||
#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
|
#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
|
||||||
|
|
||||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||||
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
|
inline void ensure_cuda_lock_arrays_on_device() {}
|
||||||
#else
|
#else
|
||||||
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
|
inline static void ensure_cuda_lock_arrays_on_device() {
|
||||||
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
|
copy_cuda_lock_arrays_to_device();
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||||
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
|
inline void ensure_cuda_lock_arrays_on_device() {}
|
||||||
#else
|
#else
|
||||||
// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
|
// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
|
||||||
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
|
inline static void ensure_cuda_lock_arrays_on_device() {
|
||||||
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
|
copy_cuda_lock_arrays_to_device();
|
||||||
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
|
desul::ensure_cuda_lock_arrays_on_device();
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
|
#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
|
||||||
|
|
||||||
|
} // namespace Impl
|
||||||
|
} // namespace Kokkos
|
||||||
|
|
||||||
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
|
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
|
||||||
|
|
||||||
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
|
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
|
||||||
|
|||||||
Reference in New Issue
Block a user