Feb2021 GPU Package Update - GPU Package Files

This commit is contained in:
Michael Brown
2021-02-15 08:20:50 -08:00
parent 16004e8f45
commit e7e2d2323b
345 changed files with 13424 additions and 7708 deletions


@@ -1,9 +1,10 @@
// **************************************************************************
// preprocessor.h
// -------------------
// W. Michael Brown (ORNL)
// Nitin Dhamankar (Intel)
//
// Device-side preprocessor definitions
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
@@ -14,566 +15,136 @@
// ***************************************************************************/
//*************************************************************************
// Device Configuration Definitions
//
// Note: For OpenCL, the configuration is a string (optionally controlled
// at runtime) where tokens specify the values below in order
// (an illustrative example string appears after this header block):
//
// CONFIG_ID:
// Definition: Unique ID for a configuration
// 100-199 for NVIDIA GPUs with CUDA / HIP
// 200-299 for NVIDIA GPUs with OpenCL
// 300-399 for AMD GPUs with HIP
// 400-499 for AMD GPUs with OpenCL
// 500-599 for Intel GPUs with OpenCL
// SIMD_SIZE:
// Definition: For CUDA this is the warp size.
// For AMD this is the wavefront size.
// For OpenCL < 2.1 this is the number of work items
// guaranteed to have the same instruction pointer.
// For OpenCL >= 2.1 this is the smallest expected subgroup
// size; actual subgroup sizes are determined per kernel.
// MEM_THREADS
// Definition: Number of elements in main memory transaction. Used in
// PPPM. If unknown, set to SIMD_SIZE.
// SHUFFLE_AVAIL
// Definition: Controls the use of instructions for horizontal vector
// operations. 0 disables and will increase shared memory
// usage. 1 enables for CUDA, HIP, and OpenCL >= 2.1 on
// NVIDIA and Intel devices.
// FAST_MATH
// Definition: 0: do not use -cl-fast-relaxed-math optimization flag or
// native transcendentals for OpenCL (fused multiply-add
// still enabled). For CUDA and HIP, this is controlled by
// the Makefile at compile time. 1: enable fast math opts
//
// THREADS_PER_ATOM
// Definition: Default number of work items or CUDA threads assigned
// per atom for pair styles
// Restrictions: Must be power of 2; THREADS_PER_ATOM<=SIMD_SIZE
// THREADS_PER_CHARGE
// Definition: Default number of work items or CUDA threads assigned
// per atom for pair styles using charge
// Restrictions: Must be power of 2; THREADS_PER_CHARGE<=SIMD_SIZE
// THREADS_PER_THREE
// Definition: Default number of work items or CUDA threads assigned
// per atom for 3-body styles
// Restrictions: Must be power of 2; THREADS_PER_THREE^2<=SIMD_SIZE
//
// BLOCK_PAIR
// Definition: Default block size for pair styles
// Restrictions: Must be integer multiple of SIMD_SIZE
// BLOCK_BIO_PAIR
// Definition: Default block size for CHARMM styles
// Restrictions: Must be integer multiple of SIMD_SIZE
// BLOCK_ELLIPSE
// Definition: Default block size for ellipsoidal models and some 3-body
// styles
// Restrictions: Must be integer multiple of SIMD_SIZE
// PPPM_BLOCK_1D
// Definition: Default block size for PPPM kernels
// Restrictions: Must be integer multiple of SIMD_SIZE
// BLOCK_NBOR_BUILD
// Definition: Default block size for neighbor list builds
// Restrictions: Must be integer multiple of SIMD_SIZE
// BLOCK_CELL_2D
// Definition: Default block size in each dimension for matrix transpose
// BLOCK_CELL_ID
// Definition: Unused in current implementation; Maintained for legacy
// purposes and specialized builds
//
// MAX_SHARED_TYPES
// Definition: Max # of atom type params that can be stored in shared memory
// Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR
// MAX_BIO_SHARED_TYPES
// Definition: Max # of atom type params that can be stored in shared memory
// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2
// PPPM_MAX_SPLINE
// Definition: Maximum order for splines in PPPM
// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
//
//*************************************************************************/
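// Example (illustrative only; the values are hypothetical, not a shipped
// configuration): a token string for a 32-wide SIMD device, following the
// order documented above from CONFIG_ID through PPPM_MAX_SPLINE, might be:
//   "200,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8"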
#define _texture(name, type) texture<type> name
#define _texture_2d(name, type) texture<type,1> name
// -------------------------------------------------------------------------
// HIP DEFINITIONS
// -------------------------------------------------------------------------
#ifdef USE_HIP
#include <hip/hip_runtime.h>
#ifdef __HIP_PLATFORM_HCC__
#define mul24(x, y) __mul24(x, y)
#undef _texture
#undef _texture_2d
#define _texture(name, type) __device__ type* name
#define _texture_2d(name, type) __device__ type* name
#endif
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#ifdef __local
#undef __local
#endif
#define __local __shared__
#define __global
#define restrict __restrict__
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 256
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define MAX_SHARED_TYPES 11
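// Usage sketch (illustrative, not part of this commit): the mappings above
// let OpenCL-style kernel code compile directly as CUDA/HIP, e.g.:
//   int i = GLOBAL_ID_X;         // flat work-item / thread index
//   int stride = GLOBAL_SIZE_X   // note: this macro already carries a ';'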
#ifdef _SINGLE_SINGLE
ucl_inline double shfl_xor(double var, int laneMask, int width) {
#ifdef __HIP_PLATFORM_HCC__
return __shfl_xor(var, laneMask, width);
#else
return __shfl_xor_sync(0xffffffff, var, laneMask, width);
#endif
}
#else
ucl_inline double shfl_xor(double var, int laneMask, int width) {
int2 tmp;
tmp.x = __double2hiint(var);
tmp.y = __double2loint(var);
#ifdef __HIP_PLATFORM_HCC__
tmp.x = __shfl_xor(tmp.x,laneMask,width);
tmp.y = __shfl_xor(tmp.y,laneMask,width);
#else
tmp.x = __shfl_xor_sync(0xffffffff, tmp.x,laneMask,width);
tmp.y = __shfl_xor_sync(0xffffffff, tmp.y,laneMask,width);
#endif
return __hiloint2double(tmp.x,tmp.y);
}
#endif
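// Usage sketch (illustrative): shfl_xor() supports register-level butterfly
// reductions across the warp/wavefront without shared memory, e.g. summing
// a per-lane energy accumulator:
//   for (int s = WARP_SIZE/2; s > 0; s >>= 1)
//     energy += shfl_xor(energy, s, WARP_SIZE);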
#ifdef __HIP_PLATFORM_HCC__
#define ARCH 600
#define WARP_SIZE 64
#endif
#ifdef __HIP_PLATFORM_NVCC__
#define ARCH __CUDA_ARCH__
#define WARP_SIZE 32
#endif
#define fast_mul(X,Y) (X)*(Y)
#define MEM_THREADS WARP_SIZE
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#ifdef __HIP_PLATFORM_NVCC__
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) { \
int4 xy = tex1Dfetch(pos_tex,i*2); \
int4 zt = tex1Dfetch(pos_tex,i*2+1); \
ans.x=__hiloint2double(xy.y, xy.x); \
ans.y=__hiloint2double(xy.w, xy.z); \
ans.z=__hiloint2double(zt.y, zt.x); \
ans.w=__hiloint2double(zt.w, zt.z); \
}
#define fetch(ans,i,q_tex) { \
int2 qt = tex1Dfetch(q_tex,i); \
ans=__hiloint2double(qt.y, qt.x); \
}
#else
#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
#endif
#else
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i))
#define fetch(ans,i,q_tex) (ans=*(((double *) q_tex) + i))
#else
#define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i))
#define fetch(ans,i,q_tex) (ans=*(((float *) q_tex) + i))
#endif
#endif
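// Usage sketch (illustrative): kernels read atom data through these macros;
// the host binds pos_tex to the packed coordinate/type array:
//   numtyp4 ix; fetch4(ix,i,pos_tex);   // ix.w typically carries the type
//   numtyp qtmp; fetch(qtmp,i,q_tex);   // per-atom charge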
#ifdef _DOUBLE_DOUBLE
#define ucl_exp exp
#define ucl_powr pow
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#else
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
#define ucl_abs fabsf
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#ifdef NO_HARDWARE_TRANSCENDENTALS
#define ucl_exp expf
#define ucl_powr powf
#else
#define ucl_exp __expf
#define ucl_powr __powf
#endif
#endif
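// Sketch: the ucl_* wrappers keep kernel source precision-agnostic; with
// numtyp=float they resolve to single-precision (or hardware) variants:
//   numtyp r2inv = ucl_recip(rsq);      // 1/r^2
//   numtyp r6inv = r2inv*r2inv*r2inv;   // 1/r^6 for an LJ-style term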
#endif
// -------------------------------------------------------------------------
// CUDA DEFINITIONS
// -------------------------------------------------------------------------
#ifdef NV_KERNEL
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#define __local __shared__
#define __global
#define restrict __restrict__
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__
#ifdef __CUDA_ARCH__
#define ARCH __CUDA_ARCH__
#else
#define ARCH 100
#endif
#if (ARCH < 200)
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 16
#define BLOCK_NBOR_BUILD 64
#define BLOCK_PAIR 64
#define BLOCK_BIO_PAIR 64
#define MAX_SHARED_TYPES 8
#else
#if (ARCH < 300)
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 128
#define BLOCK_BIO_PAIR 128
#define MAX_SHARED_TYPES 8
#else
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 256
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define MAX_SHARED_TYPES 11
#if (__CUDACC_VER_MAJOR__ < 9)
#ifdef _SINGLE_SINGLE
#define shfl_xor __shfl_xor
#else
ucl_inline double shfl_xor(double var, int laneMask, int width) {
int2 tmp;
tmp.x = __double2hiint(var);
tmp.y = __double2loint(var);
tmp.x = __shfl_xor(tmp.x,laneMask,width);
tmp.y = __shfl_xor(tmp.y,laneMask,width);
return __hiloint2double(tmp.x,tmp.y);
}
#endif
#else
#ifdef _SINGLE_SINGLE
ucl_inline double shfl_xor(double var, int laneMask, int width) {
return __shfl_xor_sync(0xffffffff, var, laneMask, width);
}
#else
ucl_inline double shfl_xor(double var, int laneMask, int width) {
int2 tmp;
tmp.x = __double2hiint(var);
tmp.y = __double2loint(var);
tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,laneMask,width);
tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,laneMask,width);
return __hiloint2double(tmp.x,tmp.y);
}
#endif
#endif
#endif
#endif
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) { \
int4 xy = tex1Dfetch(pos_tex,i*2); \
int4 zt = tex1Dfetch(pos_tex,i*2+1); \
ans.x=__hiloint2double(xy.y, xy.x); \
ans.y=__hiloint2double(xy.w, xy.z); \
ans.z=__hiloint2double(zt.y, zt.x); \
ans.w=__hiloint2double(zt.w, zt.z); \
}
#define fetch(ans,i,q_tex) { \
int2 qt = tex1Dfetch(q_tex,i); \
ans=__hiloint2double(qt.y, qt.x); \
}
#else
#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
#endif
#if (__CUDA_ARCH__ < 200)
#define fast_mul __mul24
#define MEM_THREADS 16
#else
#define fast_mul(X,Y) (X)*(Y)
#define MEM_THREADS 32
#endif
#ifdef CUDA_PRE_THREE
struct __builtin_align__(16) _double4
{
double x, y, z, w;
};
typedef struct _double4 double4;
#endif
#ifdef _DOUBLE_DOUBLE
#define ucl_exp exp
#define ucl_powr pow
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#else
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
#define ucl_abs fabsf
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#ifdef NO_HARDWARE_TRANSCENDENTALS
#define ucl_exp expf
#define ucl_powr powf
#else
#define ucl_exp __expf
#define ucl_powr __powf
#endif
#endif
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_pre_cuda_hip.h"
#endif
// -------------------------------------------------------------------------
// OPENCL DEVICE CONFIGURATIONS
// -------------------------------------------------------------------------
// See lal_pre_ocl_config.h for OpenCL device configurations
#if !defined(NV_KERNEL) && !defined(USE_HIP)
#define USE_OPENCL
// -------------------------------------------------------------------------
// NVIDIA GENERIC OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef NV_GENERIC_OCL
#define fast_mul mul24
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_BIO_PAIR 64
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#endif
// -------------------------------------------------------------------------
// NVIDIA FERMI OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef FERMI_OCL
#define USE_OPENCL
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_PAIR 128
#define MAX_SHARED_TYPES 11
#define BLOCK_NBOR_BUILD 128
#define BLOCK_BIO_PAIR 128
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#endif
// -------------------------------------------------------------------------
// NVIDIA KEPLER OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef KEPLER_OCL
#define USE_OPENCL
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_PAIR 256
#define MAX_SHARED_TYPES 11
#define BLOCK_NBOR_BUILD 128
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#ifndef NO_OCL_PTX
#define ARCH 300
#ifdef _SINGLE_SINGLE
inline float shfl_xor(float var, int laneMask, int width) {
float ret;
int c;
c = ((WARP_SIZE-width) << 8) | 0x1f;
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
return ret;
}
#if (__OPENCL_VERSION__ > 199)
#define NOUNROLL __attribute__((opencl_unroll_hint(1)))
#else
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
inline double shfl_xor(double var, int laneMask, int width) {
int c = ((WARP_SIZE-width) << 8) | 0x1f;
int x,y,x2,y2;
double ans;
asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c));
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c));
asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
return ans;
}
#endif
#define NOUNROLL
#endif
#endif
// -------------------------------------------------------------------------
// OPENCL KERNEL MACROS
// -------------------------------------------------------------------------
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define GLOBAL_SIZE_X get_global_size(0)
#define THREAD_ID_Y get_local_id(1)
#define BLOCK_ID_Y get_group_id(1)
#define NUM_BLOCKS_X get_num_groups(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define ucl_inline inline
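// Sketch: with the macros above, the same kernel body computes indices
// identically under CUDA/HIP and OpenCL:
//   int tid = THREAD_ID_X;                   // lane within the work-group
//   int i = BLOCK_ID_X*BLOCK_SIZE_X + tid;   // global work-item index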
// -------------------------------------------------------------------------
// AMD CYPRESS OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef CYPRESS_OCL
#define USE_OPENCL
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_PAIR 128
#define MAX_SHARED_TYPES 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_BIO_PAIR 64
#define WARP_SIZE 64
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#endif
// -------------------------------------------------------------------------
// OPENCL KERNEL MACROS - TEXTURES
// -------------------------------------------------------------------------
#define fetch4(ans,i,x) ans=x[i]
#define fetch(ans,i,q) ans=q[i]
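// Under OpenCL the fetch macros reduce to plain array indexing, so kernel
// code can call fetch4()/fetch() unconditionally on every backend
// (the buffer name x_ is illustrative):
//   numtyp4 ix; fetch4(ix,i,x_);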
// -------------------------------------------------------------------------
// INTEL CPU OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef INTEL_OCL
#define USE_OPENCL
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 1
#define MAX_SHARED_TYPES 0
#define BLOCK_NBOR_BUILD 4
#define BLOCK_BIO_PAIR 2
#define BLOCK_ELLIPSE 2
#define WARP_SIZE 1
#define PPPM_BLOCK_1D 32
#define BLOCK_CELL_2D 1
#define BLOCK_CELL_ID 2
#define MAX_BIO_SHARED_TYPES 0
#endif
// -------------------------------------------------------------------------
// INTEL PHI OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef PHI_OCL
#define USE_OPENCL
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 16
#define MAX_SHARED_TYPES 0
#define BLOCK_NBOR_BUILD 16
#define BLOCK_BIO_PAIR 16
#define BLOCK_ELLIPSE 16
#define WARP_SIZE 1
#define PPPM_BLOCK_1D 32
#define BLOCK_CELL_2D 4
#define BLOCK_CELL_ID 16
#define MAX_BIO_SHARED_TYPES 0
#endif
// -------------------------------------------------------------------------
// GENERIC OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef GENERIC_OCL
#define USE_OPENCL
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_BIO_PAIR 64
#define WARP_SIZE 1
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#endif
// -------------------------------------------------------------------------
// OPENCL Stuff for All Hardware
// -------------------------------------------------------------------------
#ifdef USE_OPENCL
#ifndef _SINGLE_SINGLE
#ifndef cl_khr_fp64
@@ -589,48 +160,14 @@ inline double shfl_xor(double var, int laneMask, int width) {
#endif
#ifndef fast_mul
#define fast_mul(X,Y) (X)*(Y)
#endif
#ifndef ARCH
#define ARCH 0
#endif
#ifndef DRIVER
#define DRIVER 0
#endif
// -------------------------------------------------------------------------
// OPENCL KERNEL MACROS - MATH
// -------------------------------------------------------------------------
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#ifdef _DOUBLE_DOUBLE
#define NO_HARDWARE_TRANSCENDENTALS
#endif
#ifdef NO_HARDWARE_TRANSCENDENTALS
#define ucl_exp exp
#define ucl_powr powr
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#else
#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
#define ucl_exp native_exp
#define ucl_powr native_powr
@@ -638,23 +175,128 @@ inline double shfl_xor(double var, int laneMask, int width) {
#define ucl_sqrt native_sqrt
#define ucl_recip native_recip
#else
#define ucl_exp exp
#define ucl_powr powr
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#endif
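// Sketch: with FAST_MATH set (and not _DOUBLE_DOUBLE), transcendentals map
// to the OpenCL native_* built-ins, trading accuracy for throughput
// (alpha and r are illustrative):
//   numtyp sc = ucl_exp(-alpha*r);   // resolves to native_exp(-alpha*r)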
// -------------------------------------------------------------------------
// OPENCL KERNEL MACROS - SHUFFLE
// -------------------------------------------------------------------------
#if (SHUFFLE_AVAIL == 1)
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#define shfl_down(var, delta, width) \
intel_sub_group_shuffle_down(var, var, delta)
#define shfl_xor(var, lanemask, width) \
intel_sub_group_shuffle_xor(var, lanemask)
#define simd_broadcast_i(var, src, width) sub_group_broadcast(var, src)
#define simd_broadcast_f(var, src, width) sub_group_broadcast(var, src)
#define simd_broadcast_d(var, src, width) sub_group_broadcast(var, src)
#else
#ifdef _SINGLE_SINGLE
inline float shfl_down(float var, unsigned int delta, int width) {
float ret;
int c;
c = ((SIMD_SIZE-width) << 8) | 0x1f;
asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c), "r"(0xffffffff));
return ret;
}
inline float shfl_xor(float var, unsigned int lanemask, int width) {
float ret;
int c;
c = ((SIMD_SIZE-width) << 8) | 0x1f;
asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(lanemask), "r"(c), "r"(0xffffffff));
return ret;
}
#else
inline double shfl_down(double var, unsigned int delta, int width) {
int c = ((SIMD_SIZE-width) << 8) | 0x1f;
int x,y,x2,y2;
double ans;
asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(delta), "r"(c), "r"(0xffffffff));
asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(delta), "r"(c), "r"(0xffffffff));
asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
return ans;
}
inline double shfl_xor(double var, unsigned int lanemask, int width) {
int c = ((SIMD_SIZE-width) << 8) | 0x1f;
int x,y,x2,y2;
double ans;
asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(lanemask), "r"(c), "r"(0xffffffff));
asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(lanemask), "r"(c), "r"(0xffffffff));
asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
return ans;
}
#endif
inline int simd_broadcast_i(int var, unsigned int src, int width) {
int ret;
int c;
c = ((SIMD_SIZE-width) << 8) | 0x1f;
asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(src), "r"(c), "r"(0xffffffff));
return ret;
}
inline float simd_broadcast_f(float var, unsigned int src, int width) {
float ret;
int c;
c = ((SIMD_SIZE-width) << 8) | 0x1f;
asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(src), "r"(c), "r"(0xffffffff));
return ret;
}
#ifdef _DOUBLE_DOUBLE
inline double simd_broadcast_d(double var, unsigned int src, int width) {
int c = ((SIMD_SIZE-width) << 8) | 0x1f;
int x,y,x2,y2;
double ans;
asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(src), "r"(c), "r"(0xffffffff));
asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(src), "r"(c), "r"(0xffffffff));
asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
return ans;
}
#endif
#endif
#endif
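// Usage sketch (illustrative): with SHUFFLE_AVAIL, per-lane accumulators
// can be reduced within a subgroup without local memory:
//   for (unsigned int d = SIMD_SIZE/2; d > 0; d >>= 1)
//     e += shfl_down(e, d, SIMD_SIZE);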
// -------------------------------------------------------------------------
// OPENCL KERNEL MACROS - SUBGROUPS
// -------------------------------------------------------------------------
#ifdef USE_OPENCL_SUBGROUPS
#ifndef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#define simdsync() sub_group_barrier(CLK_LOCAL_MEM_FENCE)
#define simd_size() get_max_sub_group_size()
#else
#define simdsync()
#define simd_size() SIMD_SIZE
#endif
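// Sketch (hypothetical local buffer red_acc): simdsync() guarantees that
// subgroup lanes see each other's staged stores before reading:
//   red_acc[tid] = e;
//   simdsync();
//   e += red_acc[tid ^ 1];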
// -------------------------------------------------------------------------
// END OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#endif
// -------------------------------------------------------------------------
// ARCHITECTURE INDEPENDENT DEFINITIONS
// -------------------------------------------------------------------------
#ifndef PPPM_MAX_SPLINE
#define PPPM_MAX_SPLINE 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp2 double2
#define acctyp4 double4
#endif
@@ -663,6 +305,7 @@ inline double shfl_xor(double var, int laneMask, int width) {
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp2 double2
#define acctyp4 double4
#endif
@@ -671,6 +314,7 @@ inline double shfl_xor(double var, int laneMask, int width) {
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp2 float2
#define acctyp4 float4
#endif
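// Example: in the mixed-precision build (numtyp=float, acctyp=double),
// per-pair terms are computed in single precision but accumulated in
// double (the names are illustrative):
//   acctyp energy = (acctyp)0;
//   energy += (acctyp)(e_pair*factor_lj);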
@@ -686,11 +330,9 @@ inline double shfl_xor(double var, int laneMask, int width) {
#define NEIGHMASK 0x3FFFFFFF
ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };
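// Sketch: neighbor entries pack special-bond flags into the top two bits;
// sbmask() extracts the flag and NEIGHMASK strips it before use
// (dev_nbor is an illustrative packed neighbor array):
//   int sj = dev_nbor[nbor];
//   int which = sbmask(sj);   // 0 = normal, 1/2/3 = special 1-2/1-3/1-4
//   int j = sj & NEIGHMASK;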
#ifndef BLOCK_ELLIPSE
#define BLOCK_ELLIPSE BLOCK_PAIR
#endif
// default to 32-bit smallint and other ints, 64-bit bigint:
// same as defined in src/lmptype.h
#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \
!defined(LAMMPS_SMALLBIG)
#define LAMMPS_SMALLBIG
#endif