Feb2021 GPU Package Update - GPU Package Files

This commit is contained in:
Michael Brown
2021-02-15 08:20:50 -08:00
parent 16004e8f45
commit e7e2d2323b
345 changed files with 13424 additions and 7708 deletions


@@ -1,9 +1,10 @@
// **************************************************************************
// preprocessor.h
// -------------------
// W. Michael Brown (ORNL)
// Nitin Dhamankar (Intel)
//
// Device-side preprocessor definitions
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
@@ -14,566 +15,136 @@
// ***************************************************************************/
//*************************************************************************
// Device Configuration Definitions
//
// Note: For OpenCL, the configuration is a string (optionally controlled
// at runtime) where tokens specify the values below in order
// (an illustrative example string appears after this header block):
//
// CONFIG_ID:
// Definition: Unique ID for a configuration
// 100-199 for NVIDIA GPUs with CUDA / HIP
// 200-299 for NVIDIA GPUs with OpenCL
// 300-399 for AMD GPUs with HIP
// 400-499 for AMD GPUs with OpenCL
// 500-599 for Intel GPUs with OpenCL
// SIMD_SIZE:
// Definition: For CUDA this is the warp size.
// For AMD this is the wavefront size.
// For OpenCL < 2.1 this is the number of work items
// guaranteed to have the same instruction pointer.
// For OpenCL >= 2.1 this is the smallest expected subgroup
// size; actual subgroup sizes are determined per kernel.
// MEM_THREADS
// Definition: Number of elements in main memory transaction. Used in
// PPPM. If unknown, set to SIMD_SIZE.
// SHUFFLE_AVAIL
// Definition: Controls the use of instructions for horizontal vector
// operations. 0 disables and will increase shared memory
// usage. 1 enables for CUDA, HIP, and OpenCL >= 2.1 on
// NVIDIA and Intel devices.
// FAST_MATH
// Definition: 0: do not use -cl-fast-relaxed-math optimization flag or
// native transcendentals for OpenCL (fused multiply-add
// still enabled). For CUDA and HIP, this is controlled by
// the Makefile at compile time. 1: enable fast math opts
//
// THREADS_PER_ATOM
// Definition: Default number of work items or CUDA threads assigned
// per atom for pair styles
// Restrictions: Must be power of 2; THREADS_PER_ATOM<=SIMD_SIZE
// THREADS_PER_CHARGE
// Definition: Default number of work items or CUDA threads assigned
// per atom for pair styles using charge
// Restrictions: Must be power of 2; THREADS_PER_CHARGE<=SIMD_SIZE
// THREADS_PER_THREE
// Definition: Default number of work items or CUDA threads assigned
// per atom for 3-body styles
// Restrictions: Must be power of 2; THREADS_PER_THREE^2<=SIMD_SIZE
//
// BLOCK_PAIR
// Definition: Default block size for pair styles
// Restrictions: Must be integer multiple of SIMD_SIZE
// BLOCK_BIO_PAIR
// Definition: Default block size for CHARMM styles
// Restrictions: Must be integer multiple of SIMD_SIZE
// BLOCK_ELLIPSE
// Definition: Default block size for ellipsoidal models and some 3-body
// styles
// Restrictions: Must be integer multiple of SIMD_SIZE
// PPPM_BLOCK_1D
// Definition: Default block size for PPPM kernels
// Restrictions: Must be integer multiple of SIMD_SIZE
// BLOCK_NBOR_BUILD
// Definition: Default block size for neighbor list builds
// Restrictions: Must be integer multiple of SIMD_SIZE
// BLOCK_CELL_2D
// Definition: Default block size in each dimension for matrix transpose
// BLOCK_CELL_ID
// Definition: Unused in current implementation; Maintained for legacy
// purposes and specialized builds
//
// MAX_SHARED_TYPES
// Definition: Max # of atom type params that can be stored in shared memory
// Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR
// MAX_BIO_SHARED_TYPES
// Definition: Max # of atom type params that can be stored in shared memory
// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2
// PPPM_MAX_SPLINE
// Definition: Maximum order for splines in PPPM
// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
//
//*************************************************************************/
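// Example (illustrative only; the values are hypothetical, not a shipped
// configuration): a token string for a 32-wide SIMD device, following the
// order documented above from CONFIG_ID through PPPM_MAX_SPLINE, might be:
//   "200,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8"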
#define _texture(name, type) texture<type> name
#define _texture_2d(name, type) texture<type,1> name
// -------------------------------------------------------------------------
// HIP DEFINITIONS
// -------------------------------------------------------------------------
#ifdef USE_HIP
#include <hip/hip_runtime.h>
#ifdef __HIP_PLATFORM_HCC__
#define mul24(x, y) __mul24(x, y)
#undef _texture
#undef _texture_2d
#define _texture(name, type) __device__ type* name
#define _texture_2d(name, type) __device__ type* name
#endif
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#ifdef __local
#undef __local
#endif
#define __local __shared__
#define __global
#define restrict __restrict__
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 256
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define MAX_SHARED_TYPES 11
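// Usage sketch (illustrative, not part of this commit): the mappings above
// let OpenCL-style kernel code compile directly as CUDA/HIP, e.g.:
//   int i = GLOBAL_ID_X;         // flat work-item / thread index
//   int stride = GLOBAL_SIZE_X   // note: this macro already carries a ';'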
#ifdef _SINGLE_SINGLE
ucl_inline double shfl_xor(double var, int laneMask, int width) {
#ifdef __HIP_PLATFORM_HCC__
return __shfl_xor(var, laneMask, width);
#else
return __shfl_xor_sync(0xffffffff, var, laneMask, width);
#endif
}
#else
ucl_inline double shfl_xor(double var, int laneMask, int width) {
int2 tmp;
tmp.x = __double2hiint(var);
tmp.y = __double2loint(var);
#ifdef __HIP_PLATFORM_HCC__
tmp.x = __shfl_xor(tmp.x,laneMask,width);
tmp.y = __shfl_xor(tmp.y,laneMask,width);
#else
tmp.x = __shfl_xor_sync(0xffffffff, tmp.x,laneMask,width);
tmp.y = __shfl_xor_sync(0xffffffff, tmp.y,laneMask,width);
#endif
return __hiloint2double(tmp.x,tmp.y);
}
#endif
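// Usage sketch (illustrative): shfl_xor() supports register-level butterfly
// reductions across the warp/wavefront without shared memory, e.g. summing
// a per-lane energy accumulator:
//   for (int s = WARP_SIZE/2; s > 0; s >>= 1)
//     energy += shfl_xor(energy, s, WARP_SIZE);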
#ifdef __HIP_PLATFORM_HCC__
#define ARCH 600
#define WARP_SIZE 64
#endif
#ifdef __HIP_PLATFORM_NVCC__
#define ARCH __CUDA_ARCH__
#define WARP_SIZE 32
#endif
#define fast_mul(X,Y) (X)*(Y)
#define MEM_THREADS WARP_SIZE
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#ifdef __HIP_PLATFORM_NVCC__
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) { \
int4 xy = tex1Dfetch(pos_tex,i*2); \
int4 zt = tex1Dfetch(pos_tex,i*2+1); \
ans.x=__hiloint2double(xy.y, xy.x); \
ans.y=__hiloint2double(xy.w, xy.z); \
ans.z=__hiloint2double(zt.y, zt.x); \
ans.w=__hiloint2double(zt.w, zt.z); \
}
#define fetch(ans,i,q_tex) { \
int2 qt = tex1Dfetch(q_tex,i); \
ans=__hiloint2double(qt.y, qt.x); \
}
#else
#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
#endif
#else
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i))
#define fetch(ans,i,q_tex) (ans=*(((double *) q_tex) + i))
#else
#define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i))
#define fetch(ans,i,q_tex) (ans=*(((float *) q_tex) + i))
#endif
#endif
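// Usage sketch (illustrative): kernels read atom data through these macros;
// the host binds pos_tex to the packed coordinate/type array:
//   numtyp4 ix; fetch4(ix,i,pos_tex);   // ix.w typically carries the type
//   numtyp qtmp; fetch(qtmp,i,q_tex);   // per-atom charge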
#ifdef _DOUBLE_DOUBLE
#define ucl_exp exp
#define ucl_powr pow
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#else
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
#define ucl_abs fabsf
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#ifdef NO_HARDWARE_TRANSCENDENTALS
#define ucl_exp expf
#define ucl_powr powf
#else
#define ucl_exp __expf
#define ucl_powr __powf
#endif
#endif
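// Sketch: the ucl_* wrappers keep kernel source precision-agnostic; with
// numtyp=float they resolve to single-precision (or hardware) variants:
//   numtyp r2inv = ucl_recip(rsq);      // 1/r^2
//   numtyp r6inv = r2inv*r2inv*r2inv;   // 1/r^6 for an LJ-style term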
#endif
// -------------------------------------------------------------------------
// CUDA DEFINITIONS
// -------------------------------------------------------------------------
#ifdef NV_KERNEL
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#define __local __shared__
#define __global
#define restrict __restrict__
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__
#ifdef __CUDA_ARCH__
#define ARCH __CUDA_ARCH__
#else
#define ARCH 100
#endif
#if (ARCH < 200)
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 16
#define BLOCK_NBOR_BUILD 64
#define BLOCK_PAIR 64
#define BLOCK_BIO_PAIR 64
#define MAX_SHARED_TYPES 8
#else
#if (ARCH < 300)
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 128
#define BLOCK_BIO_PAIR 128
#define MAX_SHARED_TYPES 8
#else
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 256
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define MAX_SHARED_TYPES 11
#if (__CUDACC_VER_MAJOR__ < 9)
#ifdef _SINGLE_SINGLE
#define shfl_xor __shfl_xor
#else
ucl_inline double shfl_xor(double var, int laneMask, int width) {
int2 tmp;
tmp.x = __double2hiint(var);
tmp.y = __double2loint(var);
tmp.x = __shfl_xor(tmp.x,laneMask,width);
tmp.y = __shfl_xor(tmp.y,laneMask,width);
return __hiloint2double(tmp.x,tmp.y);
}
#endif
#else
#ifdef _SINGLE_SINGLE
ucl_inline double shfl_xor(double var, int laneMask, int width) {
return __shfl_xor_sync(0xffffffff, var, laneMask, width);
}
#else
ucl_inline double shfl_xor(double var, int laneMask, int width) {
int2 tmp;
tmp.x = __double2hiint(var);
tmp.y = __double2loint(var);
tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,laneMask,width);
tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,laneMask,width);
return __hiloint2double(tmp.x,tmp.y);
}
#endif
#endif
#endif
#endif
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) { \
int4 xy = tex1Dfetch(pos_tex,i*2); \
int4 zt = tex1Dfetch(pos_tex,i*2+1); \
ans.x=__hiloint2double(xy.y, xy.x); \
ans.y=__hiloint2double(xy.w, xy.z); \
ans.z=__hiloint2double(zt.y, zt.x); \
ans.w=__hiloint2double(zt.w, zt.z); \
}
#define fetch(ans,i,q_tex) { \
int2 qt = tex1Dfetch(q_tex,i); \
ans=__hiloint2double(qt.y, qt.x); \
}
#else
#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
#endif
#if (__CUDA_ARCH__ < 200)
#define fast_mul __mul24
#define MEM_THREADS 16
#else
#define fast_mul(X,Y) (X)*(Y)
#define MEM_THREADS 32
#endif
#ifdef CUDA_PRE_THREE
struct __builtin_align__(16) _double4
{
double x, y, z, w;
};
typedef struct _double4 double4;
#endif
#ifdef _DOUBLE_DOUBLE
#define ucl_exp exp
#define ucl_powr pow
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#else
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
#define ucl_abs fabsf
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#ifdef NO_HARDWARE_TRANSCENDENTALS
#define ucl_exp expf
#define ucl_powr powf
#else
#define ucl_exp __expf
#define ucl_powr __powf
#endif
#endif
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_pre_cuda_hip.h"
#endif
// -------------------------------------------------------------------------
// OPENCL DEVICE CONFIGURATIONS
// -------------------------------------------------------------------------
// See lal_pre_ocl_config.h for OpenCL device configurations
#if !defined(NV_KERNEL) && !defined(USE_HIP)
#define USE_OPENCL
// -------------------------------------------------------------------------
// NVIDIA GENERIC OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef NV_GENERIC_OCL
#define fast_mul mul24
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_BIO_PAIR 64
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#endif
// -------------------------------------------------------------------------
// NVIDIA FERMI OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef FERMI_OCL
#define USE_OPENCL
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_PAIR 128
#define MAX_SHARED_TYPES 11
#define BLOCK_NBOR_BUILD 128
#define BLOCK_BIO_PAIR 128
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#endif
// -------------------------------------------------------------------------
// NVIDIA KEPLER OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef KEPLER_OCL
#define USE_OPENCL
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_PAIR 256
#define MAX_SHARED_TYPES 11
#define BLOCK_NBOR_BUILD 128
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#ifndef NO_OCL_PTX
#define ARCH 300
#ifdef _SINGLE_SINGLE
inline float shfl_xor(float var, int laneMask, int width) {
float ret;
int c;
c = ((WARP_SIZE-width) << 8) | 0x1f;
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
return ret;
}
#if (__OPENCL_VERSION__ > 199)
#define NOUNROLL __attribute__((opencl_unroll_hint(1)))
#else
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
inline double shfl_xor(double var, int laneMask, int width) {
int c = ((WARP_SIZE-width) << 8) | 0x1f;
int x,y,x2,y2;
double ans;
asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c));
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c));
asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
return ans;
}
#endif
#define NOUNROLL
#endif
#endif
// -------------------------------------------------------------------------
// OPENCL KERNEL MACROS
// -------------------------------------------------------------------------
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define GLOBAL_SIZE_X get_global_size(0)
#define THREAD_ID_Y get_local_id(1)
#define BLOCK_ID_Y get_group_id(1)
#define NUM_BLOCKS_X get_num_groups(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define ucl_inline inline
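// Sketch: with the macros above, the same kernel body computes indices
// identically under CUDA/HIP and OpenCL:
//   int tid = THREAD_ID_X;                   // lane within the work-group
//   int i = BLOCK_ID_X*BLOCK_SIZE_X + tid;   // global work-item index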
// -------------------------------------------------------------------------
// AMD CYPRESS OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef CYPRESS_OCL
#define USE_OPENCL
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_PAIR 128
#define MAX_SHARED_TYPES 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_BIO_PAIR 64
#define WARP_SIZE 64
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#endif
// -------------------------------------------------------------------------
// OPENCL KERNEL MACROS - TEXTURES
// -------------------------------------------------------------------------
#define fetch4(ans,i,x) ans=x[i]
#define fetch(ans,i,q) ans=q[i]
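// Under OpenCL the fetch macros reduce to plain array indexing, so kernel
// code can call fetch4()/fetch() unconditionally on every backend
// (the buffer name x_ is illustrative):
//   numtyp4 ix; fetch4(ix,i,x_);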
// -------------------------------------------------------------------------
// INTEL CPU OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef INTEL_OCL
#define USE_OPENCL
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 1
#define MAX_SHARED_TYPES 0
#define BLOCK_NBOR_BUILD 4
#define BLOCK_BIO_PAIR 2
#define BLOCK_ELLIPSE 2
#define WARP_SIZE 1
#define PPPM_BLOCK_1D 32
#define BLOCK_CELL_2D 1
#define BLOCK_CELL_ID 2
#define MAX_BIO_SHARED_TYPES 0
#endif
// -------------------------------------------------------------------------
// INTEL PHI OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef PHI_OCL
#define USE_OPENCL
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 16
#define MAX_SHARED_TYPES 0
#define BLOCK_NBOR_BUILD 16
#define BLOCK_BIO_PAIR 16
#define BLOCK_ELLIPSE 16
#define WARP_SIZE 1
#define PPPM_BLOCK_1D 32
#define BLOCK_CELL_2D 4
#define BLOCK_CELL_ID 16
#define MAX_BIO_SHARED_TYPES 0
#endif
// -------------------------------------------------------------------------
// GENERIC OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef GENERIC_OCL
#define USE_OPENCL
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_BIO_PAIR 64
#define WARP_SIZE 1
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#endif
// -------------------------------------------------------------------------
// OPENCL Stuff for All Hardware
// -------------------------------------------------------------------------
#ifdef USE_OPENCL
#ifndef _SINGLE_SINGLE
#ifndef cl_khr_fp64
@@ -589,48 +160,14 @@ inline double shfl_xor(double var, int laneMask, int width) {
#endif
#ifndef fast_mul
#define fast_mul(X,Y) (X)*(Y)
#endif
#ifndef ARCH
#define ARCH 0
#endif
#ifndef DRIVER
#define DRIVER 0
#endif
// -------------------------------------------------------------------------
// OPENCL KERNEL MACROS - MATH
// -------------------------------------------------------------------------
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#ifdef _DOUBLE_DOUBLE
#define NO_HARDWARE_TRANSCENDENTALS
#endif
#ifdef NO_HARDWARE_TRANSCENDENTALS
#define ucl_exp exp
#define ucl_powr powr
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#else
#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
#define ucl_exp native_exp
#define ucl_powr native_powr
@@ -638,23 +175,128 @@ inline double shfl_xor(double var, int laneMask, int width) {
#define ucl_sqrt native_sqrt
#define ucl_recip native_recip
#else
#define ucl_exp exp
#define ucl_powr powr
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#endif
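// Sketch: with FAST_MATH set (and not _DOUBLE_DOUBLE), transcendentals map
// to the OpenCL native_* built-ins, trading accuracy for throughput
// (alpha and r are illustrative):
//   numtyp sc = ucl_exp(-alpha*r);   // resolves to native_exp(-alpha*r)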
// -------------------------------------------------------------------------
// OPENCL KERNEL MACROS - SHUFFLE
// -------------------------------------------------------------------------
#if (SHUFFLE_AVAIL == 1)
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#define shfl_down(var, delta, width) \
intel_sub_group_shuffle_down(var, var, delta)
#define shfl_xor(var, lanemask, width) \
intel_sub_group_shuffle_xor(var, lanemask)
#define simd_broadcast_i(var, src, width) sub_group_broadcast(var, src)
#define simd_broadcast_f(var, src, width) sub_group_broadcast(var, src)
#define simd_broadcast_d(var, src, width) sub_group_broadcast(var, src)
#else
#ifdef _SINGLE_SINGLE
inline float shfl_down(float var, unsigned int delta, int width) {
float ret;
int c;
c = ((SIMD_SIZE-width) << 8) | 0x1f;
asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c), "r"(0xffffffff));
return ret;
}
inline float shfl_xor(float var, unsigned int lanemask, int width) {
float ret;
int c;
c = ((SIMD_SIZE-width) << 8) | 0x1f;
asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(lanemask), "r"(c), "r"(0xffffffff));
return ret;
}
#else
inline double shfl_down(double var, unsigned int delta, int width) {
int c = ((SIMD_SIZE-width) << 8) | 0x1f;
int x,y,x2,y2;
double ans;
asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(delta), "r"(c), "r"(0xffffffff));
asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(delta), "r"(c), "r"(0xffffffff));
asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
return ans;
}
inline double shfl_xor(double var, unsigned int lanemask, int width) {
int c = ((SIMD_SIZE-width) << 8) | 0x1f;
int x,y,x2,y2;
double ans;
asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(lanemask), "r"(c), "r"(0xffffffff));
asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(lanemask), "r"(c), "r"(0xffffffff));
asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
return ans;
}
#endif
inline int simd_broadcast_i(int var, unsigned int src, int width) {
int ret;
int c;
c = ((SIMD_SIZE-width) << 8) | 0x1f;
asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(src), "r"(c), "r"(0xffffffff));
return ret;
}
inline float simd_broadcast_f(float var, unsigned int src, int width) {
float ret;
int c;
c = ((SIMD_SIZE-width) << 8) | 0x1f;
asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(src), "r"(c), "r"(0xffffffff));
return ret;
}
#ifdef _DOUBLE_DOUBLE
inline double simd_broadcast_d(double var, unsigned int src, int width) {
int c = ((SIMD_SIZE-width) << 8) | 0x1f;
int x,y,x2,y2;
double ans;
asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(src), "r"(c), "r"(0xffffffff));
asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(src), "r"(c), "r"(0xffffffff));
asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
return ans;
}
#endif
#endif
#endif
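// Usage sketch (illustrative): with SHUFFLE_AVAIL, per-lane accumulators
// can be reduced within a subgroup without local memory:
//   for (unsigned int d = SIMD_SIZE/2; d > 0; d >>= 1)
//     e += shfl_down(e, d, SIMD_SIZE);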
// -------------------------------------------------------------------------
// OPENCL KERNEL MACROS - SUBGROUPS
// -------------------------------------------------------------------------
#ifdef USE_OPENCL_SUBGROUPS
#ifndef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#define simdsync() sub_group_barrier(CLK_LOCAL_MEM_FENCE)
#define simd_size() get_max_sub_group_size()
#else
#define simdsync()
#define simd_size() SIMD_SIZE
#endif
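// Sketch (hypothetical local buffer red_acc): simdsync() guarantees that
// subgroup lanes see each other's staged stores before reading:
//   red_acc[tid] = e;
//   simdsync();
//   e += red_acc[tid ^ 1];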
// -------------------------------------------------------------------------
// END OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#endif
// -------------------------------------------------------------------------
// ARCHITECTURE INDEPENDENT DEFINITIONS
// -------------------------------------------------------------------------
#ifndef PPPM_MAX_SPLINE
#define PPPM_MAX_SPLINE 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp2 double2
#define acctyp4 double4
#endif
@@ -663,6 +305,7 @@ inline double shfl_xor(double var, int laneMask, int width) {
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp2 double2
#define acctyp4 double4
#endif
@@ -671,6 +314,7 @@ inline double shfl_xor(double var, int laneMask, int width) {
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp2 float2
#define acctyp4 float4
#endif
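// Example: in the mixed-precision build (numtyp=float, acctyp=double),
// per-pair terms are computed in single precision but accumulated in
// double (the names are illustrative):
//   acctyp energy = (acctyp)0;
//   energy += (acctyp)(e_pair*factor_lj);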
@@ -686,11 +330,9 @@ inline double shfl_xor(double var, int laneMask, int width) {
#define NEIGHMASK 0x3FFFFFFF
ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };
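// Sketch: neighbor entries pack special-bond flags into the top two bits;
// sbmask() extracts the flag and NEIGHMASK strips it before use
// (dev_nbor is an illustrative packed neighbor array):
//   int sj = dev_nbor[nbor];
//   int which = sbmask(sj);   // 0 = normal, 1/2/3 = special 1-2/1-3/1-4
//   int j = sj & NEIGHMASK;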
#ifndef BLOCK_ELLIPSE
#define BLOCK_ELLIPSE BLOCK_PAIR
#endif
// default to 32-bit smallint and other ints, 64-bit bigint:
// same as defined in src/lmptype.h
#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \
!defined(LAMMPS_SMALLBIG)
#define LAMMPS_SMALLBIG
#endif