// ************************************************************************** // preprocessor.h // ------------------- // W. Michael Brown (ORNL) // Nitin Dhamankar (Intel) // // Device-side preprocessor definitions // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : brownw@ornl.gov // ***************************************************************************/ //************************************************************************* // Device Configuration Definitions // // For OpenCL, the configuration is a string (optionally controlled at // runtime) where tokens specify the values below in order) // // CONFIG_ID: // Definition: Unique ID for a configuration // 100-199 for NVIDIA GPUs with CUDA / HIP // 200-299 for NVIDIA GPUs with OpenCL // 300-399 for AMD GPUs with HIP // 400-499 for AMD GPUs with OpenCL // 500-599 for Intel GPUs with OpenCL // SIMD_SIZE: // Definition: For CUDA this is the warp size. // For AMD this is the wavefront size. // For OpenCL < 2.1 this is the number of workitems // guarenteed to have the same instruction pointer // For OpenCL >= 2.1 this is the smallest expected subgroup // size. Actually subgroup sizes are determined per kernel. // MEM_THREADS // Definition: Number of elements in main memory transaction. Used in // PPPM. If unknown, set to SIMD_SIZE. // SHUFFLE_AVAIL // Definition: Controls the use of instructions for horizontal vector // operations. 0 disables and will increase shared memory // usage. 1 enables for CUDA, HIP, and OpenCL >= 2.1 on // NVIDIA and Intel devices. // FAST_MATH // Definition: 0: do not use -cl-fast-relaxed-math optimization flag or // native transcendentals for OpenCL (fused multiply-add // still enabled). For CUDA and HIP, this is controlled by // the Makefile at compile time. 1: enable fast math opts // // THREADS_PER_ATOM // Definition: Default number of work items or CUDA threads assigned per // per atom for pair styles // Restrictions: Must be power of 2; THREADS_PER_ATOM<=SIMD_SIZE // THREADS_PER_CHARGE // Definition: Default number of work items or CUDA threads assigned per // per atom for pair styles using charge // Restrictions: Must be power of 2; THREADS_PER_ATOM<=SIMD_SIZE // THREADS_PER_THREE // Definition: Default number of work items or CUDA threads assigned per // per atom for 3-body styles // Restrictions: Must be power of 2; THREADS_PER_ATOM^2<=SIMD_SIZE // // BLOCK_PAIR // Definition: Default block size for pair styles // Restrictions: Must be integer multiple of SIMD_SIZE // BLOCK_BIO_PAIR // Definition: Default block size for CHARMM styles // Restrictions: Must be integer multiple of SIMD_SIZE // BLOCK_ELLIPSE // Definition: Default block size for ellipsoidal models and some 3-body // styles // Restrictions: Must be integer multiple of SIMD_SIZE // PPPM_BLOCK_1D // Definition: Default block size for PPPM kernels // Restrictions: Must be integer multiple of SIMD_SIZE // BLOCK_NBOR_BUILD // Definition: Default block size for neighbor list builds // Restrictions: Must be integer multiple of SIMD_SIZE // BLOCK_CELL_2D // Definition: Default block size in each dimension for matrix transpose // BLOCK_CELL_ID // Definition: Unused in current implementation; Maintained for legacy // purposes and specialized builds // // MAX_SHARED_TYPES 8 // Definition: Max # of atom type params can be stored in shared memory // Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR // MAX_BIO_SHARED_TYPES // Definition: Max # of atom type params can be stored in shared memory // Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 // PPPM_MAX_SPLINE // Definition: Maximum order for splines in PPPM // Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE // //*************************************************************************/ // ------------------------------------------------------------------------- // CUDA and HIP DEFINITIONS // ------------------------------------------------------------------------- #if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_pre_cuda_hip.h" #endif // ------------------------------------------------------------------------- // OPENCL DEVICE CONFIGURATAIONS // ------------------------------------------------------------------------- // See lal_pre_ocl_config.h for OpenCL device configurations #if !defined(NV_KERNEL) && !defined(USE_HIP) #define USE_OPENCL // ------------------------------------------------------------------------- // OPENCL KERNEL MACROS // ------------------------------------------------------------------------- #if (__OPENCL_VERSION__ > 199) #define NOUNROLL __attribute__((opencl_unroll_hint(1))) #else #define NOUNROLL #endif #define GLOBAL_ID_X get_global_id(0) #define THREAD_ID_X get_local_id(0) #define BLOCK_ID_X get_group_id(0) #define BLOCK_SIZE_X get_local_size(0) #define GLOBAL_SIZE_X get_global_size(0) #define THREAD_ID_Y get_local_id(1) #define BLOCK_ID_Y get_group_id(1) #define NUM_BLOCKS_X get_num_groups(0) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define ucl_inline inline // ------------------------------------------------------------------------- // OPENCL KERNEL MACROS - TEXTURES // ------------------------------------------------------------------------- #define fetch4(ans,i,x) ans=x[i] #define fetch(ans,i,q) ans=q[i] // ------------------------------------------------------------------------- // OPENCL KERNEL MACROS - MATH // ------------------------------------------------------------------------- #ifndef _SINGLE_SINGLE #ifndef cl_khr_fp64 #ifndef cl_amd_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #endif #endif #if defined(cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64 : enable #elif defined(cl_amd_fp64) #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #define fast_mul(X,Y) (X)*(Y) #define ucl_atan atan #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs #if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE) #define ucl_exp native_exp #define ucl_powr native_powr #define ucl_rsqrt native_rsqrt #define ucl_sqrt native_sqrt #define ucl_recip native_recip #else #define ucl_exp exp #define ucl_powr powr #define ucl_rsqrt rsqrt #define ucl_sqrt sqrt #define ucl_recip(x) ((numtyp)1.0/(x)) #endif // ------------------------------------------------------------------------- // OPENCL KERNEL MACROS - SHUFFLE // ------------------------------------------------------------------------- #if (SHUFFLE_AVAIL == 1) #ifdef cl_intel_subgroups #pragma OPENCL EXTENSION cl_intel_subgroups : enable #define shfl_down(var, delta, width) \ intel_sub_group_shuffle_down(var, var, delta) #define shfl_xor(var, lanemask, width) \ intel_sub_group_shuffle_xor(var, lanemask) #define simd_broadcast_i(var, src, width) sub_group_broadcast(var, src) #define simd_broadcast_f(var, src, width) sub_group_broadcast(var, src) #define simd_broadcast_d(var, src, width) sub_group_broadcast(var, src) #else #ifdef _SINGLE_SINGLE inline float shfl_down(float var, unsigned int delta, int width) { float ret; int c; c = ((SIMD_SIZE-width) << 8) | 0x1f; asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c), "r"(0xffffffff)); return ret; } inline float shfl_xor(float var, unsigned int lanemask, int width) { float ret; int c; c = ((SIMD_SIZE-width) << 8) | 0x1f; asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(lanemask), "r"(c), "r"(0xffffffff)); return ret; } #else inline double shfl_down(double var, unsigned int delta, int width) { int c = ((SIMD_SIZE-width) << 8) | 0x1f; int x,y,x2,y2; double ans; asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var)); asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(delta), "r"(c), "r"(0xffffffff)); asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(delta), "r"(c), "r"(0xffffffff)); asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); return ans; } inline double shfl_xor(double var, unsigned int lanemask, int width) { int c = ((SIMD_SIZE-width) << 8) | 0x1f; int x,y,x2,y2; double ans; asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var)); asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(lanemask), "r"(c), "r"(0xffffffff)); asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(lanemask), "r"(c), "r"(0xffffffff)); asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); return ans; } #endif inline int simd_broadcast_i(int var, unsigned int src, int width) { int ret; int c; c = ((SIMD_SIZE-width) << 8) | 0x1f; asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(src), "r"(c), "r"(0xffffffff)); return ret; } inline float simd_broadcast_f(float var, unsigned int src, int width) { float ret; int c; c = ((SIMD_SIZE-width) << 8) | 0x1f; asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(src), "r"(c), "r"(0xffffffff)); return ret; } #ifdef _DOUBLE_DOUBLE inline double simd_broadcast_d(double var, unsigned int src, int width) { int c = ((SIMD_SIZE-width) << 8) | 0x1f; int x,y,x2,y2; double ans; asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var)); asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(src), "r"(c), "r"(0xffffffff)); asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(src), "r"(c), "r"(0xffffffff)); asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); return ans; } #endif #endif #endif // ------------------------------------------------------------------------- // OPENCL KERNEL MACROS - SUBGROUPS // ------------------------------------------------------------------------- #ifdef USE_OPENCL_SUBGROUPS #ifndef cl_intel_subgroups #pragma OPENCL EXTENSION cl_khr_subgroups : enable #endif #define simdsync() sub_group_barrier(CLK_LOCAL_MEM_FENCE) #define simd_size() get_max_sub_group_size() #else #define simdsync() #define simd_size() SIMD_SIZE #endif // ------------------------------------------------------------------------- // END OPENCL DEFINITIONS // ------------------------------------------------------------------------- #endif // ------------------------------------------------------------------------- // ARCHITECTURE INDEPENDENT DEFINITIONS // ------------------------------------------------------------------------- #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 #define numtyp4 double4 #define acctyp double #define acctyp2 double2 #define acctyp4 double4 #endif #ifdef _SINGLE_DOUBLE #define numtyp float #define numtyp2 float2 #define numtyp4 float4 #define acctyp double #define acctyp2 double2 #define acctyp4 double4 #endif #ifndef numtyp #define numtyp float #define numtyp2 float2 #define numtyp4 float4 #define acctyp float #define acctyp2 float2 #define acctyp4 float4 #endif #define EWALD_F (numtyp)1.12837917 #define EWALD_P (numtyp)0.3275911 #define A1 (numtyp)0.254829592 #define A2 (numtyp)-0.284496736 #define A3 (numtyp)1.421413741 #define A4 (numtyp)-1.453152027 #define A5 (numtyp)1.061405429 #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF ucl_inline int sbmask(int j) { return j >> SBBITS & 3; }; // default to 32-bit smallint and other ints, 64-bit bigint: // same as defined in src/lmptype.h #if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \ !defined(LAMMPS_SMALLBIG) #define LAMMPS_SMALLBIG #endif