From a21a09f6d30a27b46ce40e5f834b8380e07dd861 Mon Sep 17 00:00:00 2001 From: Stan Gerald Moore Date: Mon, 21 Nov 2022 14:57:55 -0700 Subject: [PATCH] Backport https://github.com/kokkos/kokkos/pull/5624 to Kokkos version bundled with LAMMPS --- .../core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp index 993c8d1bba..56f9117844 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -59,7 +59,11 @@ inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties, // Limits due do registers/SM int const regs_per_sm = properties.regsPerMultiprocessor; int const regs_per_thread = attributes.numRegs; - int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size); + // The granularity of register allocation is chunks of 256 registers per warp + // -> 8 registers per thread + int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); + int const max_blocks_regs = + regs_per_sm / (allocated_regs_per_thread * block_size); // Limits due to shared memory/SM size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;