diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index ba43e362bb..a91d0eb313 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -710,7 +710,7 @@ struct CudaParallelLaunchImpl< " occupancy requests are currently broken.")); } - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); + ensure_cuda_lock_arrays_on_device(); // Invoke the driver function on the device base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp index 84d4307cfd..3796534816 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp @@ -79,8 +79,7 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0}; void initialize_host_cuda_lock_arrays() { #ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS desul::Impl::init_lock_arrays(); - - DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); + desul::ensure_cuda_lock_arrays_on_device(); #endif if (g_host_cuda_lock_arrays.atomic != nullptr) return; KOKKOS_IMPL_CUDA_SAFE_CALL( @@ -89,7 +88,7 @@ void initialize_host_cuda_lock_arrays() { Impl::cuda_device_synchronize( "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays"); g_host_cuda_lock_arrays.n = Cuda::concurrency(); - KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); + copy_cuda_lock_arrays_to_device(); init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>(); Impl::cuda_device_synchronize( @@ -106,7 +105,7 @@ void finalize_host_cuda_lock_arrays() { g_host_cuda_lock_arrays.atomic = nullptr; g_host_cuda_lock_arrays.n = 0; #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); + copy_cuda_lock_arrays_to_device(); #endif } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp index bdb7723985..84bfc953fd 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp @@ -67,7 +67,7 @@ struct CudaLockArrays { /// \brief This global variable in Host space is the central definition /// of these arrays. -extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays; +extern CudaLockArrays g_host_cuda_lock_arrays; /// \brief After this call, the g_host_cuda_lock_arrays variable has /// valid, initialized arrays. @@ -105,12 +105,12 @@ namespace Impl { /// instances in other translation units, we must update this CUDA global /// variable based on the Host global variable prior to running any kernels /// that will use it. -/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro. +/// That is the purpose of the ensure_cuda_lock_arrays_on_device function. __device__ #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE __constant__ extern #endif - Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays; + CudaLockArrays g_device_cuda_lock_arrays; #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF @@ -123,9 +123,7 @@ __device__ inline bool lock_address_cuda_space(void* ptr) { size_t offset = size_t(ptr); offset = offset >> 2; offset = offset & CUDA_SPACE_ATOMIC_MASK; - return ( - 0 == - atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1)); + return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1)); } /// \brief Release lock for the address @@ -138,7 +136,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) { size_t offset = size_t(ptr); offset = offset >> 2; offset = offset & CUDA_SPACE_ATOMIC_MASK; - atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0); + atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0); } } // namespace Impl @@ -151,45 +149,49 @@ namespace { static int lock_array_copied = 0; inline int eliminate_warning_for_lock_array() { return lock_array_copied; } } // namespace -} // namespace Impl -} // namespace Kokkos -/* Dan Ibanez: it is critical that this code be a macro, so that it will - capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays! - putting this in an inline function will NOT do the right thing! */ -#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ - { \ - if (::Kokkos::Impl::lock_array_copied == 0) { \ - KOKKOS_IMPL_CUDA_SAFE_CALL( \ - cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \ - &Kokkos::Impl::g_host_cuda_lock_arrays, \ - sizeof(Kokkos::Impl::CudaLockArrays))); \ - } \ - lock_array_copied = 1; \ +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE +inline +#else +inline static +#endif + void + copy_cuda_lock_arrays_to_device() { + if (lock_array_copied == 0) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays, + &g_host_cuda_lock_arrays, + sizeof(CudaLockArrays))); } + lock_array_copied = 1; +} #ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() +inline void ensure_cuda_lock_arrays_on_device() {} #else -#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ - KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() +inline static void ensure_cuda_lock_arrays_on_device() { + copy_cuda_lock_arrays_to_device(); +} #endif #else #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() +inline void ensure_cuda_lock_arrays_on_device() {} #else // Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc. -#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ - KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ - DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() +inline static void ensure_cuda_lock_arrays_on_device() { + copy_cuda_lock_arrays_to_device(); + desul::ensure_cuda_lock_arrays_on_device(); +} #endif #endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ +} // namespace Impl +} // namespace Kokkos + #endif /* defined( KOKKOS_ENABLE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */